In [307]:
import numpy as np
import pandas as pd
import wbdata
from sklearn.preprocessing import Imputer

In [220]:
columns = ['Country', 'Year', 'Status', 'Life_Expectancy', 
           'Adult_Mortality', 'Infant_Deaths', 'Alcohol', 
           'Percentage_Expenditure', 'Measles', 'BMI', 
           'Under-Five_Deaths ', 'Polio', 'Total_Expenditure', 
           'Diphtheria', 'HIV/AIDS', 'GDP', 'Population',
           'Thinness_1-19_years', 'Thinness_5-9_years',
           'Income_Composition_Of_Resources', 'Schooling']
main_data = pd.DataFrame(columns=columns)

In [221]:
main_data

Unnamed: 0,Country,Year,Status,Life_Expectancy,Adult_Mortality,Infant_Deaths,Alcohol,Percentage_Expenditure,Measles,BMI,...,Polio,Total_Expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness_1-19_years,Thinness_5-9_years,Income_Composition_Of_Resources,Schooling


## Population Data

In [222]:
#wbdata.search_indicators('Population')

In [223]:
#set up the indicator I want (just build up the dict if you want more than one)
indicators = {'SP.POP.TOTL':'Population'}

In [224]:
#grab indicators above for all countries and load into data frame
pop_df_in = wbdata.get_dataframe(indicators, convert_date=False)

In [225]:
pop_df_in.to_csv("Data/New/Population/Population_Internet.csv", encoding='utf-8')

In [226]:
pop_df_get = pd.read_csv("Data/New/Population/Population_Internet.csv")
pop_df = pop_df_get.rename(index = str, columns={'date':'Year', 'country':'Country'})
len(pop_df['Country'].unique())

264

In [227]:
# Removing continents and other trivial rows
pop_df.drop(pop_df.index[:2773], inplace=True)
len(pop_df['Country'].unique())

217

In [228]:
# Year: 2000 - 2014
pop_df = pop_df[(lambda x: x['Year'].isin(range(2000,2015)))]

In [229]:
pop_df.reset_index(drop=True, inplace=True)

In [240]:
pop_df.head()

Unnamed: 0,Country,Year,Population
0,Afghanistan,2014,32758020.0
1,Afghanistan,2013,31731688.0
2,Afghanistan,2012,30696958.0
3,Afghanistan,2011,29708599.0
4,Afghanistan,2010,28803167.0


In [231]:
# Fixing Missing Values
pop_df.at[915, 'Population'] = 5054634
pop_df.at[916, 'Population'] = 4945529
pop_df.at[917, 'Population'] = 4840901

In [237]:
main_data[['Country', 'Year', 'Population']] = pop_df[['Country', 'Year', 'Population']]

In [242]:
main_data['Population'].isnull().sum()

0

## GDP

In [245]:
#set up the indicator I want (just build up the dict if you want more than one)
indicators = {'NY.GDP.PCAP.CD':'GDP'}

In [246]:
#grab indicators above for countires above and load into data frame
gdp_df = wbdata.get_dataframe(indicators, convert_date=False)

In [273]:
gdp_df.to_csv("Data/New/GDP/GDP_Internet.csv", encoding='utf-8')
gdp_data_get = pd.read_csv("Data/New/GDP/GDP_Internet.csv")
gdp_data = gdp_data_get.rename(index = str, columns={'date':'Year', 'country':'Country'})
gdp_data[gdp_data['Country'] == "Afghanistan"]

Unnamed: 0,Country,Year,GDP
2773,Afghanistan,2018,
2774,Afghanistan,2017,550.068459
2775,Afghanistan,2016,549.58276
2776,Afghanistan,2015,590.076474
2777,Afghanistan,2014,625.339539
2778,Afghanistan,2013,647.96646
2779,Afghanistan,2012,648.51107
2780,Afghanistan,2011,599.29763
2781,Afghanistan,2010,550.514974
2782,Afghanistan,2009,444.184404


In [281]:
for val in main_data['Country'].index:
    print(val,' ',end='')
    if len(gdp_data[gdp_data['Country'] == main_data['Country'][val]].values) != 0:
        try:
            gdp_other_data = gdp_data[gdp_data['Country'] == main_data['Country'][val]][gdp_data['Year'] == main_data['Year'][val]]['GDP'].values[0]
            print(gdp_other_data, "pass: 1")
            main_data.at[val, 'GDP'] = gdp_other_data
        except IndexError:
            pass

0  625.3395388284999 pass: 1
1  647.966460473683 pass: 1
2  648.511069587633 pass: 1
3  599.29762975711 pass: 1
4  550.514973976336 pass: 1
5  444.18440407545 pass: 1
6  370.382293977374 pass: 1
7  366.230443242864 pass: 1
8  269.22969301812304 pass: 1
9  247.664139959638 pass: 1
10  216.708128851229 pass: 1
11  195.77663034155103 pass: 1
12  184.49471212204898 pass: 1
13  nan pass: 1
14  nan pass: 1
15  4578.6679344615895 pass: 1
16  4413.08288688408 pass: 1
17  4247.614342362921 pass: 1
18  4437.177794486521 pass: 1
19  4094.36020359235 pass: 1
20  4114.13489916342 pass: 1
21  4370.5399247769 pass: 1
22  3595.0380568289293 pass: 1
23  2972.74292399799 pass: 1
24  2673.78658429559 pass: 1
25  2373.58129170055 pass: 1
26  1846.12012081207 pass: 1
27  1425.12421860142 pass: 1
28  1281.6598256178 pass: 1
29  1126.68334010717 pass: 1
30  5466.42577841535 pass: 1
31  5471.123388787089 pass: 1
32  5565.134521048481 pass: 1
33  5432.413319781161 pass: 1
34  4463.39467488951 pass: 1
35  3868.

  """


8456.94799658994 pass: 1
49  8508.40691498359 pass: 1
50  8639.30885529158 pass: 1
51  8914.52543261668 pass: 1
52  8751.76652874972 pass: 1
53  nan pass: 1
54  nan pass: 1
55  11598.7517362051 pass: 1
56  11589.8530023324 pass: 1
57  11660.329531051999 pass: 1
58  10375.9942154736 pass: 1
59  10352.8227618312 pass: 1
60  42294.9947269717 pass: 1
61  40619.711297779504 pass: 1
62  38391.080866978504 pass: 1
63  41098.766941722795 pass: 1
64  39736.3540626699 pass: 1
65  43339.3798746543 pass: 1
66  47785.6590856793 pass: 1
67  48582.808455086604 pass: 1
68  43748.772158899905 pass: 1
69  41282.0201219785 pass: 1
70  38503.479614485695 pass: 1
71  32776.4422698769 pass: 1
72  24741.4935704562 pass: 1
73  22228.8464928922 pass: 1
74  21936.530101470802 pass: 1
75  5412.6923476178 pass: 1
76  5258.40737644433 pass: 1
77  5102.4899693158895 pass: 1
78  4615.86747457004 pass: 1
79  3585.90555256868 pass: 1
80  3117.89694392484 pass: 1
81  4068.97845646361 pass: 1
82  3108.26864317894 pass: 

326  66111.7252270036 pass: 1
327  62583.1002034588 pass: 1
328  58883.9594265967 pass: 1
329  56284.168647809405 pass: 1
330  2522.7949530209803 pass: 1
331  2350.8828892447304 pass: 1
332  2422.00801577828 pass: 1
333  2458.0459761545003 pass: 1
334  2178.92138307967 pass: 1
335  1770.23449659172 pass: 1
336  1795.1813072028199 pass: 1
337  1741.1425516889801 pass: 1
338  1335.4569058859802 pass: 1
339  1247.0613916089599 pass: 1
340  1097.45708762103 pass: 1
341  997.7417140733031 pass: 1
342  885.638224530948 pass: 1
343  807.938767268667 pass: 1
344  765.8632357656951 pass: 1
345  3124.00030982516 pass: 1
346  2947.9385262805104 pass: 1
347  2645.2277526014605 pass: 1
348  2377.68877123935 pass: 1
349  1981.1701161847602 pass: 1
350  1776.86647562189 pass: 1
351  1736.93008403587 pass: 1
352  1389.62934991949 pass: 1
353  1233.59186917076 pass: 1
354  1046.4273841431698 pass: 1
355  978.334648481852 pass: 1
356  917.3643104861699 pass: 1
357  913.575642272268 pass: 1
358  958.2366

620  3838.43397176904 pass: 1
621  3471.2480543115003 pass: 1
622  2695.36591709669 pass: 1
623  2099.22943460447 pass: 1
624  1753.41782925823 pass: 1
625  1508.6680978826598 pass: 1
626  1288.64325183381 pass: 1
627  1148.5082904417 pass: 1
628  1053.1082430045199 pass: 1
629  959.372483639691 pass: 1
630  7974.406373025029 pass: 1
631  8065.960376234019 pass: 1
632  7904.495788647009 pass: 1
633  7207.0289047302795 pass: 1
634  6230.73836562837 pass: 1
635  5119.19164052311 pass: 1
636  5383.13269367517 pass: 1
637  4635.69227663314 pass: 1
638  3677.1340415731897 pass: 1
639  3354.0218037305895 pass: 1
640  2740.24944202769 pass: 1
641  2246.2574724306 pass: 1
642  2355.72585669892 pass: 1
643  2395.8565512517102 pass: 1
644  2472.19783098297 pass: 1
645  1511.82603748603 pass: 1
646  1504.93100711702 pass: 1
647  1403.4343352935598 pass: 1
648  1446.06801802192 pass: 1
649  1315.21480648394 pass: 1
650  1339.33719065958 pass: 1
651  1387.59462639938 pass: 1
652  1238.46151380518 p

914  1702.68490100276 pass: 1
915  nan pass: 1
916  nan pass: 1
917  nan pass: 1
918  582.775530237486 pass: 1
919  482.1490904235 pass: 1
920  430.754449938105 pass: 1
921  326.082564151512 pass: 1
922  317.329433586554 pass: 1
923  297.828058802925 pass: 1
924  276.75080964612897 pass: 1
925  287.422224311551 pass: 1
926  232.794545914417 pass: 1
927  201.768798115624 pass: 1
928  215.13921025180198 pass: 1
929  208.19694865228297 pass: 1
930  19949.5813766971 pass: 1
931  19072.2385175669 pass: 1
932  17421.890222737802 pass: 1
933  17454.843424643503 pass: 1
934  14638.6048173457 pass: 1
935  14726.318278058701 pass: 1
936  18094.548052783102 pass: 1
937  16586.4052048847 pass: 1
938  12595.4106486304 pass: 1
939  10338.3132235799 pass: 1
940  8850.46511484791 pass: 1
941  7174.2374147336905 pass: 1
942  5308.34778059328 pass: 1
943  4498.95702743146 pass: 1
944  4070.0328269871397 pass: 1
945  3379.8964990926097 pass: 1
946  3587.0003161096297 pass: 1
947  3864.76043619193 pass: 1

1204  2825.51902943213 pass: 1
1205  2635.7588738299 pass: 1
1206  2794.2261567238697 pass: 1
1207  2489.9558363193 pass: 1
1208  2256.56723965573 pass: 1
1209  2077.83438001719 pass: 1
1210  1872.73708329482 pass: 1
1211  1753.34947067681 pass: 1
1212  1701.77149170704 pass: 1
1213  1568.37630826336 pass: 1
1214  1655.5877302334698 pass: 1
1215  743.5912878392951 pass: 1
1216  726.089394240361 pass: 1
1217  677.0434992516861 pass: 1
1218  614.8647662128959 pass: 1
1219  634.923105379227 pass: 1
1220  636.279956848232 pass: 1
1221  674.6181787457621 pass: 1
1222  622.173656837936 pass: 1
1223  427.0657265294 pass: 1
1224  303.42449798582 pass: 1
1225  386.328828253397 pass: 1
1226  370.193178116151 pass: 1
1227  322.811170974089 pass: 1
1228  315.839800323726 pass: 1
1229  340.051691750487 pass: 1
1230  610.4684902121691 pass: 1
1231  621.940673617931 pass: 1
1232  603.933983492923 pass: 1
1233  688.141239473375 pass: 1
1234  545.704120821919 pass: 1
1235  544.861280100724 pass: 1
1236

1470  38109.4121125573 pass: 1
1471  40454.447457890295 pass: 1
1472  48603.4766497749 pass: 1
1473  48167.9972684965 pass: 1
1474  44507.6763859172 pass: 1
1475  40855.1756354596 pass: 1
1476  39339.2975731826 pass: 1
1477  35275.228431266696 pass: 1
1478  35433.988963742995 pass: 1
1479  37217.648727917 pass: 1
1480  37688.7223359406 pass: 1
1481  34808.3909176613 pass: 1
1482  32289.350536072598 pass: 1
1483  33846.4656414342 pass: 1
1484  38532.0408752935 pass: 1
1485  4066.9407754308795 pass: 1
1486  3992.8671070526793 pass: 1
1487  3870.75321121669 pass: 1
1488  3807.3241449185402 pass: 1
1489  3679.1902746327796 pass: 1
1490  3492.13090643818 pass: 1
1491  3385.60966483112 pass: 1
1492  2762.8063541248102 pass: 1
1493  2537.30048980654 pass: 1
1494  2203.0837873545 pass: 1
1495  2061.45688205636 pass: 1
1496  1889.21396186775 pass: 1
1497  1812.2883744492801 pass: 1
1498  1728.26050900954 pass: 1
1499  1657.8892563121399 pass: 1
1500  12807.2606866152 pass: 1
1501  13890.6317716

1808  297.70959689919204 pass: 1
1809  280.367384226713 pass: 1
1810  274.225629402118 pass: 1
1811  260.105251697695 pass: 1
1812  290.979897687755 pass: 1
1813  146.761539725235 pass: 1
1814  153.25950867537202 pass: 1
1815  11183.729432082 pass: 1
1816  10882.2891042102 pass: 1
1817  10779.5075072926 pass: 1
1818  10405.120619133 pass: 1
1819  9071.35698672103 pass: 1
1820  7326.744434864591 pass: 1
1821  8513.6295414071 pass: 1
1822  7269.171140420141 pass: 1
1823  6222.9829548197495 pass: 1
1824  5593.82299540355 pass: 1
1825  4955.47774271198 pass: 1
1826  4463.67589342594 pass: 1
1827  4167.36438653514 pass: 1
1828  3915.1150459563796 pass: 1
1829  4045.17047135901 pass: 1
1830  9056.65343979878 pass: 1
1831  8291.485294941971 pass: 1
1832  7473.195629491079 pass: 1
1833  7395.68779981626 pass: 1
1834  7100.405899101809 pass: 1
1835  6615.76377781727 pass: 1
1836  6583.45124965947 pass: 1
1837  5559.50683172405 pass: 1
1838  4811.66746787284 pass: 1
1839  3648.7800546362396 pass

2118  nan pass: 1
2119  nan pass: 1
2120  nan pass: 1
2121  nan pass: 1
2122  nan pass: 1
2123  nan pass: 1
2124  nan pass: 1
2125  nan pass: 1
2126  nan pass: 1
2127  nan pass: 1
2128  nan pass: 1
2129  12579.595105586399 pass: 1
2130  44560.640369404806 pass: 1
2131  42949.326841702205 pass: 1
2132  39970.256244503704 pass: 1
2133  38426.5508077771 pass: 1
2134  33692.0108346543 pass: 1
2135  28201.2322839936 pass: 1
2136  31287.7786475125 pass: 1
2137  32510.082699939197 pass: 1
2138  26671.3294177179 pass: 1
2139  27750.7248353584 pass: 1
2140  25420.070413486497 pass: 1
2141  21913.708171996102 pass: 1
2142  16874.1874918196 pass: 1
2143  13882.8568268586 pass: 1
2144  13640.988889806202 pass: 1
2145  1975.46470748978 pass: 1
2146  1847.1980486856799 pass: 1
2147  1792.0380448462 pass: 1
2148  1682.95792434336 pass: 1
2149  1526.4979380617801 pass: 1
2150  1464.4977535686999 pass: 1
2151  1518.8053855473001 pass: 1
2152  1344.30187125334 pass: 1
2153  1240.99261674457 pass: 1
2154

2469  66987.4009245707 pass: 1
2470  65035.9604988093 pass: 1
2471  56168.0166796035 pass: 1
2472  44576.2625659129 pass: 1
2473  41806.2963118756 pass: 1
2474  40188.8510401312 pass: 1
2475  1824.37921729398 pass: 1
2476  1619.52851686126 pass: 1
2477  1380.94995917229 pass: 1
2478  1304.32618929166 pass: 1
2479  1129.75496146547 pass: 1
2480  1099.5710457235 pass: 1
2481  1126.46210206395 pass: 1
2482  886.6695898316029 pass: 1
2483  836.795284158117 pass: 1
2484  804.127987385206 pass: 1
2485  687.548404467755 pass: 1
2486  643.949065739559 pass: 1
2487  551.203852227072 pass: 1
2488  505.78718153537596 pass: 1
2489  nan pass: 1
2490  24575.4030378327 pass: 1
2491  24934.3861423166 pass: 1
2492  25303.094621302997 pass: 1
2493  23770.747386207204 pass: 1
2494  19259.5872568465 pass: 1
2495  16094.2930975756 pass: 1
2496  20037.8323388242 pass: 1
2497  16472.166046674898 pass: 1
2498  15334.6699396085 pass: 1
2499  13739.8294463775 pass: 1
2500  11138.807464899699 pass: 1
2501  9567.

2841  nan pass: 1
2842  2058.0352093943497 pass: 1
2843  1762.24610660537 pass: 1
2844  1577.4571821855802 pass: 1
2845  1408.85273756165 pass: 1
2846  1253.39140303912 pass: 1
2847  1263.01345505567 pass: 1
2848  1258.42195289081 pass: 1
2849  1177.62926773348 pass: 1
2850  1104.45901890381 pass: 1
2851  1040.13741165493 pass: 1
2852  954.720525255865 pass: 1
2853  834.541295306236 pass: 1
2854  738.347522660006 pass: 1
2855  666.345856945147 pass: 1
2856  706.087584631519 pass: 1
2857  520.037186594333 pass: 1
2858  404.289405943386 pass: 1
2859  337.360396992708 pass: 1
2860  309.28019842849 pass: 1
2861  236.49018001025902 pass: 1
2862  189.3945514973 pass: 1
2863  170.808503315275 pass: 1
2864  138.43190810128598 pass: 1
2865  950.8183027927021 pass: 1
2866  903.361862058071 pass: 1
2867  820.1562898037449 pass: 1
2868  733.4128065416071 pass: 1
2869  701.604550062032 pass: 1
2870  658.757952665891 pass: 1
2871  651.262438712841 pass: 1
2872  528.063562048951 pass: 1
2873  471.527

3182  37849.7287205036 pass: 1
3183  39144.1657740184 pass: 1
3184  40043.1901659314 pass: 1
3185  38771.2743877127 pass: 1
3186  39207.0037546472 pass: 1
3187  44333.065655027305 pass: 1
3188  41560.9341982634 pass: 1
3189  40929.7951205119 pass: 1
3190  35024.477490849706 pass: 1
3191  31823.124988479904 pass: 1
3192  30126.255644641 pass: 1
3193  nan pass: 1
3194  nan pass: 1
3195  2960.77800405245 pass: 1
3196  2992.20099455427 pass: 1
3197  2787.16973802917 pass: 1
3198  2664.95138463952 pass: 1
3199  2338.71987682303 pass: 1
3200  1963.20151865719 pass: 1
3201  1855.45702045882 pass: 1
3202  1575.56340027289 pass: 1
3203  1441.46170046742 pass: 1
3204  1455.1878751811498 pass: 1
3205  1337.56572430673 pass: 1
3206  1257.69857009689 pass: 1
3207  1156.21747345769 pass: 1
3208  1335.55319529359 pass: 1
3209  1476.17185000238 pass: 1
3210  1647.03365674887 pass: 1
3211  1580.1816006295498 pass: 1
3212  1421.1715746938698 pass: 1
3213  1349.42024978639 pass: 1
3214  1309.23195973585 

In [282]:
main_data[main_data['GDP'].isnull()]['Country'].unique()

array(['Afghanistan', 'American Samoa', 'Bermuda',
       'British Virgin Islands', 'Cayman Islands', 'Channel Islands',
       'Curacao', 'Eritrea', 'French Polynesia', 'Gibraltar', 'Guam',
       'Iraq', 'Korea, Dem. People���s Rep.', 'Nauru', 'New Caledonia',
       'Northern Mariana Islands', 'Sao Tome and Principe',
       'Sint Maarten (Dutch part)', 'Somalia', 'South Sudan',
       'St. Martin (French part)', 'Syrian Arab Republic',
       'Turks and Caicos Islands', 'Virgin Islands (U.S.)'], dtype=object)

In [283]:
#main_data[main_data['GDP'].isnull()][main_data['Country'] == "Afghanistan"][['Country','Year','GDP']]

In [331]:
main_data[main_data['Country'] == "Eritrea"][['Country', 'Year', 'GDP']]

Unnamed: 0,Country,Year,GDP
915,Eritrea,2014,
916,Eritrea,2013,
917,Eritrea,2012,
918,Eritrea,2011,582.776
919,Eritrea,2010,482.149
920,Eritrea,2009,430.754
921,Eritrea,2008,326.083
922,Eritrea,2007,317.329
923,Eritrea,2006,297.828
924,Eritrea,2005,276.751


In [323]:
# af_test = main_data[main_data['Country'] == "Afghanistan"]

In [324]:
# DEALING WITH MISSING VALUES
# imp=Imputer(missing_values="NaN", strategy="most_frequent" )
# imp.fit(af_test[["GDP"]])
# af_test["GDP"]=imp.transform(af_test[["GDP"]]).ravel()

In [325]:
# af_test["GDP"]