In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns



In [14]:
import folium 
from folium import plugins
import ipywidgets
import geocoder
import geopy
from vega_datasets import data as vds

In [15]:
training_values = pd.read_csv("../data/training_set_values.csv")
training_labels = pd.read_csv("../data/training_set_labels.csv")

In [16]:
# this one only has functional and non functional
training_labels['id'] == 59397

0        False
1        False
2        False
3        False
4        False
         ...  
59395    False
59396    False
59397    False
59398    False
59399    False
Name: id, Length: 59400, dtype: bool

In [95]:
training_values['region'].value_counts().head(60)

Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

In [18]:
# we are going to drop all of these columns 
tra_v_dropped = training_values.drop(['installer', 'date_recorded', 'wpt_name', 
                                      'funder', 'population', 'extraction_type', 
                                      'construction_year', 'extraction_type_class', 
                                      'waterpoint_type_group','num_private', 'public_meeting',
                                      'source_type', 'quantity_group', 'water_quality'], axis=1)
tra_v_dropped.head(3)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,basin,subvillage,region,region_code,district_code,...,extraction_type_group,management,management_group,payment,payment_type,quality_group,quantity,source,source_class,waterpoint_type
0,69572,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,11,5,...,gravity,vwc,user-group,pay annually,annually,good,enough,spring,groundwater,communal standpipe
1,8776,0.0,1399,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,20,2,...,gravity,wug,user-group,never pay,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe
2,34310,25.0,686,37.460664,-3.821329,Pangani,Majengo,Manyara,21,4,...,gravity,vwc,user-group,pay per bucket,per bucket,good,enough,dam,surface,communal standpipe multiple


In [79]:
# we are merging training_labels and training_v_dropped
df = pd.merge(tra_v_dropped, training_labels, on="id")
df.tail(60)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,basin,subvillage,region,region_code,district_code,...,management,management_group,payment,payment_type,quality_group,quantity,source,source_class,waterpoint_type,status_group
59340,69140,0.0,1006,37.412463,-3.331443,Pangani,Mpukoni,Kilimanjaro,3,4,...,vwc,user-group,unknown,unknown,good,insufficient,spring,groundwater,other,non functional
59341,71879,0.0,1785,38.220441,-4.617223,Pangani,Mahalae,Tanga,4,1,...,vwc,user-group,never pay,never pay,good,enough,spring,groundwater,communal standpipe,non functional
59342,20216,0.0,0,30.98836,-1.717181,Lake Victoria,Kafulo,Kagera,18,1,...,vwc,user-group,never pay,never pay,good,enough,spring,groundwater,communal standpipe,functional
59343,30877,0.0,751,37.358639,-3.429023,Pangani,Sanya Line A,Kilimanjaro,3,4,...,water authority,commercial,pay monthly,monthly,good,insufficient,spring,groundwater,other,non functional
59344,6450,0.0,1303,36.900911,-3.111477,Internal,Madukani,Arusha,2,7,...,unknown,unknown,unknown,unknown,unknown,unknown,spring,groundwater,communal standpipe,functional
59345,68576,600.0,962,29.729909,-4.801508,Lake Tanganyika,Kabanga,Kigoma,16,3,...,vwc,user-group,pay monthly,monthly,good,enough,river,surface,communal standpipe multiple,functional needs repair
59346,31471,0.0,1275,38.326284,-4.457878,Pangani,Kitui,Tanga,4,1,...,vwc,user-group,never pay,never pay,good,enough,spring,groundwater,communal standpipe,functional
59347,40167,0.0,1140,33.627314,-2.162634,Lake Victoria,Kati,Mara,20,4,...,wug,user-group,unknown,unknown,salty,enough,shallow well,groundwater,hand pump,non functional
59348,49581,1000.0,0,38.823804,-4.844484,Pangani,Mbuyuni,Tanga,4,8,...,private operator,commercial,pay monthly,monthly,good,insufficient,spring,groundwater,communal standpipe,functional
59349,54026,0.0,0,32.242568,-4.189353,Lake Tanganyika,Mbika A,Shinyanga,17,3,...,vwc,user-group,pay per bucket,per bucket,good,enough,spring,groundwater,communal standpipe multiple,non functional


In [20]:
coords = {"Chinamili":(-2.8732, 34.2205),
      "Nkololo":(-2.6419, 34.1601),
      "Somanda":(-3.3667, 33.9500),
      "Bumera":(-1.2844, 34.3251),
      "Kalangalala":(-2.8706, 32.2367),
      "Nkoma":(-3.5749, 34.3829),
      "Mkula":(-2.3000, 33.8833),
      "Nkungulu":(-3.0086, 33.4164),
      "Sakwe":(-2.7677, 33.8630),
      "Nyang'hwale":(-3.0856, 32.6277),
      "Kasamwa":(-2.8385, 32.4212),
      "Sapiwi":(-2.3684, 33.9655),
      "Lugulu":(-2.9141, 33.9583),
      "Mwamapalala":(-3.0165, 33.9138),
      "Kharumwa":(-3.2004, 32.6578),
      "Igalukilo":(-2.3759, 33.7987),
      "Kinang'weli":(-2.9438, 33.7441),
      "Kasoli":(-2.5695, 33.6712),
      "Nyakabindi":(-2.6410, 33.9903),
      "Zagayu":(-2.9655, 33.8096),
      "Mwadobana":(-2.5317, 34.0500),
      "Malili":(-2.4463, 33.8057),
      "Bunamhala":(-2.8534, 34.0623),
      "Lubanga":(-2.6320, 32.4018),
      "Nyaluhande":(-2.5004, 33.6108),
      "Mhango":(-2.7529, 33.9165),
      "Lagangabilili":(-2.9599, 34.1366),
      "Bukoli":(-3.1964, 32.3227),
      "Dutwa":(-2.5098, 33.9680),
      "Kakora":(-2.9657, 33.3346),
      "Mbita":(-2.8447, 33.7874),
      "Kalemela":(-2.2988, 33.7428),
      "Ikungulyabashashi":(-2.6080, 33.9491),
      "Busolwa":(-3.0024, 32.6309),
      "Mwingiro":(-3.1447, 32.5014),
      "Mwaswale":(-2.7824, 34.3522),
      "Kabita":(-2.3204, 33.6698),
      "Katoro":(-3.0062, 31.9271),
      "Gamboshi":(-2.6055, 33.7681),
      "Mhunze":(-3.6195, 33.8128),
      "Badugu":(-2.4667, 33.6500),
      "Mwananyili":(-6.788, 39.256),
      "Mwaubingi":(-2.5223, 34.1136),
      "Chigunga":(-2.8140, 31.9338),
      "Shigala":(-2.3924, 33.6763),
      "Bariadi":(-2.8070, 33.9917),
      "Kagu":(-2.7274, 32.0121),
      "Kiloleli":(-2.4909, 32.9206),
      "Nyachiluluma":(-2.7818, 31.9197),
      "Nzera":(-2.5214, 32.1561),
      "Bukwimba":(-3.3367, 32.6066),
      "Mtakuja":(-3.4868, 37.3573),
      "Nyamalimbe":(-3.0819, 32.3671),
      "Nyakamwaga":(-2.9873, 32.2308),
      "Kaseme":(-3.1270, 31.9495),
      "Kamena":(-3.0724, 32.3249),
      "Sagata":(-2.7446, 34.2375),
      "Ngasamo":(-2.4900, 33.8442),
      "Lwamgasa":(-3.1166, 32.0417),
      "Kafita":(-3.2349, 32.5852),
      "Nyakagomba":(-2.8697, 31.9421),
      "Ihanamilo":(-2.9501, 32.2677),
      "Kamhanga":(-2.7346, 32.3087),
      "Busanda":(-3.0187, 32.1119),
      "Nyugwa":(-3.2237, 32.7380),
      "Nkome":(-2.5009, 32.0083),
      "Shabaka":(-2.9942, 32.4536),
      "Bukondo":(-2.6712, 31.9189),
      "Senga":(-4.4000, 34.6667)}

In [21]:
#Sanity check
missing_coords = list(df.loc[df['longitude'] == 0]["ward"].value_counts().keys())
missing_coords == list(coords.keys())

False

In [22]:
# get latitude and longitude
def get_lat(x):
    return coords[x][0]
def get_long(x):
    return coords[x][1]

In [23]:
df_miss = df.loc[df['longitude'] == 0]

In [24]:
df.loc[(df.longitude == 0), 'latitude'] = df_miss["ward"].apply(get_lat)
df.loc[(df.longitude == 0), 'longitude'] = df_miss["ward"].apply(get_long)

In [26]:
lat = df['latitude']
lat

0        -9.856322
1        -2.147466
2        -3.821329
3       -11.155298
4        -1.825359
           ...    
59395    -3.253847
59396    -9.070629
59397    -8.750434
59398    -6.378573
59399    -6.747464
Name: latitude, Length: 59400, dtype: float64

In [45]:
list_lat = lat.values.tolist()
list_lat

[-9.85632177,
 -2.14746569,
 -3.82132853,
 -11.15529772,
 -1.82535885,
 -4.76558728,
 -3.76636472,
 -4.22619802,
 -5.14671181,
 -1.25705061,
 -7.03413939,
 -10.57417468,
 -3.2901937999999995,
 -3.1817833,
 -3.62933335,
 -8.27496163,
 -9.10618458,
 -9.08551497,
 -1.94786801,
 -9.8944125,
 -9.59498965,
 -2.608,
 -8.98001429,
 -4.12559468,
 -4.49556758,
 -3.31753648,
 -9.94253166,
 -1.73536124,
 -3.2635263,
 -3.19198926,
 -5.79099991,
 -6.63610419,
 -3.29062725,
 -9.71590954,
 -7.889986299999999,
 -3.29300336,
 -9.63851993,
 -4.6479063,
 -8.22439872,
 -3.89456056,
 -3.47684942,
 -9.20464923,
 -9.03250291,
 -4.62892053,
 -3.47855934,
 -3.83899753,
 -11.00060373,
 -9.17290468,
 -8.22756303,
 -9.18987766,
 -3.34929238,
 -8.57166114,
 -9.44812238,
 -3.0856,
 -1.68065404,
 -3.35653763,
 -3.36819761,
 -3.33981163,
 -3.80687858,
 -1.56388609,
 -5.11324963,
 -4.6472894999999985,
 -4.77975837,
 -3.35002587,
 -4.30776106,
 -5.17313618,
 -9.2035221,
 -1.2857798,
 -8.88959406,
 -4.47180947,
 -8.93495

In [46]:
long = df['longitude']
long

0        34.938093
1        34.698766
2        37.460664
3        38.486161
4        31.130847
           ...    
59395    37.169807
59396    35.249991
59397    34.017087
59398    35.861315
59399    38.104048
Name: longitude, Length: 59400, dtype: float64

In [47]:
list_long = long.values.tolist()
list_long

[34.93809275,
 34.6987661,
 37.46066446,
 38.48616088,
 31.13084671,
 39.1727956,
 33.36240982,
 32.62061707,
 32.71110001,
 30.62699053,
 39.20951812,
 35.77025785,
 33.79810612,
 37.09257412,
 34.36407268,
 31.44412134,
 34.64243884,
 34.56926611,
 32.92015381,
 34.50896732,
 34.47342985,
 33.9491,
 34.58690108,
 37.94002949,
 29.74706567,
 37.42275132,
 39.37077651,
 31.10425712,
 37.61112572,
 37.06168837,
 36.1083125,
 38.36355454,
 37.49233371,
 34.47835586,
 39.81291224,
 36.61869946,
 33.85434473,
 35.85837259,
 35.89026429,
 30.6133054,
 37.42002692,
 33.92790176,
 34.81457364,
 34.96778863,
 36.80607857,
 33.34526033,
 39.08741514,
 33.5406067,
 35.87141325,
 34.44128349,
 37.37285829,
 32.04683486,
 33.95816322,
 32.6277,
 30.98302258,
 37.34816881,
 37.47943592,
 36.72112734,
 35.56134607,
 31.45428168,
 38.78943655,
 38.37667127,
 29.66972029,
 36.94745079,
 34.76272634,
 34.62159801,
 34.89725735,
 30.71609028,
 34.55789311,
 29.76800862,
 35.81898147,
 34.46222769,
 34.2

In [48]:
status = df['status_group']
status

0            functional
1            functional
2            functional
3        non functional
4            functional
              ...      
59395        functional
59396        functional
59397        functional
59398        functional
59399        functional
Name: status_group, Length: 59400, dtype: object

In [88]:
lat_long = pd.concat([lat, long, status], axis=1)
first_hun = lat_long.iloc[1:300]
first_hun

Unnamed: 0,latitude,longitude,status_group
1,-2.147466,34.698766,functional
2,-3.821329,37.460664,functional
3,-11.155298,38.486161,non functional
4,-1.825359,31.130847,functional
5,-4.765587,39.172796,functional
...,...,...,...
295,-5.973474,35.865939,functional
296,-10.846660,39.734451,non functional
297,-1.849874,33.048573,non functional
298,-9.247810,32.626213,non functional


In [89]:
df_functional = first_hun[first_hun['status_group'] == 'functional']
df_functional

Unnamed: 0,latitude,longitude,status_group
1,-2.147466,34.698766,functional
2,-3.821329,37.460664,functional
4,-1.825359,31.130847,functional
5,-4.765587,39.172796,functional
9,-1.257051,30.626991,functional
...,...,...,...
288,-7.908075,31.539691,functional
292,-3.178814,36.907945,functional
293,-3.304505,35.733638,functional
294,-3.308150,37.370375,functional


In [90]:
df_nonfunctional = first_hun[first_hun['status_group'] == 'non functional']
df_nonfunctional

Unnamed: 0,latitude,longitude,status_group
3,-11.155298,38.486161,non functional
6,-3.766365,33.362410,non functional
7,-4.226198,32.620617,non functional
8,-5.146712,32.711100,non functional
16,-9.106185,34.642439,non functional
...,...,...,...
291,-4.262444,34.845394,non functional
296,-10.846660,39.734451,non functional
297,-1.849874,33.048573,non functional
298,-9.247810,32.626213,non functional


In [91]:
folium.Marker


folium.map.Marker

In [92]:
m = folium.Map(location=[-6.3728253, 34.8924826], zoom_start=6)


In [93]:
for (index, row) in df_functional.iterrows():
    folium.Marker(location=[row.loc['latitude'], row.loc['longitude']], icon=folium.Icon(color="green")).add_to(m)
m

In [94]:
n = folium.Map(location=[-6.3728253, 34.8924826], zoom_start=6)


In [87]:
for (index, row) in df_nonfunctional.iterrows():
    folium.Marker(location=[row.loc['latitude'], row.loc['longitude']], icon=folium.Icon(color='red')).add_to(m)
m