In [159]:
import pandas as pd
import numpy as np

In [2]:
train_values = pd.read_csv("../data/training_set_values.csv")
train_labels = pd.read_csv("../data/training_set_labels.csv")

In [3]:
train_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [4]:
train_values.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [5]:
df = train_values.merge(train_labels)

## Date Recorded

In [6]:
df["date_recorded"] = pd.to_datetime(df["date_recorded"])

In [7]:
df["date_recorded"].max()

Timestamp('2013-12-03 00:00:00')

In [8]:
df["date_recorded"].min()

Timestamp('2002-10-14 00:00:00')

## Scheme Management

In [9]:
df["scheme_management"].isna().sum()

3877

In [10]:
df["scheme_management"].value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [11]:
df["scheme_management"] = df["scheme_management"].fillna("None")

In [12]:
df["scheme_management"].isna().sum()

0

## Scheme Name

In [13]:
df["scheme_name"].isna().sum()

28166

In [14]:
df["scheme_name"].value_counts()

K                  682
None               644
Borehole           546
Chalinze wate      405
M                  400
                  ... 
Kanza                1
RC Water Supply      1
Tove -mtwango        1
BL Cosmas Woiso      1
Mlima wa Nyasho      1
Name: scheme_name, Length: 2696, dtype: int64

In [15]:
df["scheme_name"].isna().sum()

28166

## Scheme Management vs. Name

These categories are described identically in the dataframe description. Are they both useful?

In [31]:
df.loc[df['scheme_management'] == "VWC"]["scheme_name"].value_counts()

K                          571
DANIDA                     378
M                          331
Borehole                   285
Government                 249
                          ... 
Kisogwe                      1
Mamire water Supply          1
Rain water harest            1
Nkho                         1
KAIBANJA PUMPING SCHEME      1
Name: scheme_name, Length: 1861, dtype: int64

## Permit

In [16]:
df["permit"].isna().sum()

3056

In [17]:
df["permit"].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

## Location

Finding missing coordinates to aid in visualization

In [61]:
df.loc[df['longitude'] == 0]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
21,6091,0.0,2013-02-10,Dwsp,0,DWE,0.0,-2.000000e-08,Muungano,0,...,unknown,unknown,unknown,unknown,shallow well,shallow well,groundwater,hand pump,hand pump,functional
53,32376,0.0,2011-08-01,Government Of Tanzania,0,Government,0.0,-2.000000e-08,Polisi,0,...,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
168,72678,0.0,2013-01-30,Wvt,0,WVT,0.0,-2.000000e-08,Wvt Tanzania,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
177,56725,0.0,2013-01-17,Netherlands,0,DWE,0.0,-2.000000e-08,Kikundi Cha Wakina Mama,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
253,13042,0.0,2012-10-29,Hesawa,0,DWE,0.0,-2.000000e-08,Kwakisusi,0,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional needs repair
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59189,62177,0.0,2011-07-18,Dwsp,0,DWE,0.0,-2.000000e-08,Wazazo,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional needs repair
59208,3631,0.0,2013-01-22,Dwsp,0,DWE,0.0,-2.000000e-08,Mtakuja,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
59295,60843,0.0,2011-07-19,Rwssp,0,DWE,0.0,-2.000000e-08,Maendeleo,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional needs repair
59324,748,0.0,2013-01-22,World Vision,0,World Vision,0.0,-2.000000e-08,Mwazwilo,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [108]:
df.loc[df['longitude'] == 0]["ward"].value_counts()

Chinamili      111
Nkololo         80
Somanda         72
Bumera          71
Kalangalala     66
              ... 
Nyugwa           4
Nkome            2
Shabaka          2
Bukondo          2
Senga            1
Name: ward, Length: 69, dtype: int64

In [148]:
coords = {"Chinamili":(-2.8732, 34.2205),
      "Nkololo":(-2.6419, 34.1601),
      "Somanda":(-3.3667, 33.9500),
      "Bumera":(-1.2844, 34.3251),
      "Kalangalala":(-2.8706, 32.2367),
      "Nkoma":(-3.5749, 34.3829),
      "Mkula":(-2.3000, 33.8833),
      "Nkungulu":(-3.0086, 33.4164),
      "Sakwe":(-2.7677, 33.8630),
      "Nyang'hwale":(-3.0856, 32.6277),
      "Kasamwa":(-2.8385, 32.4212),
      "Sapiwi":(-2.3684, 33.9655),
      "Lugulu":(-2.9141, 33.9583),
      "Mwamapalala":(-3.0165, 33.9138),
      "Kharumwa":(-3.2004, 32.6578),
      "Igalukilo":(-2.3759, 33.7987),
      "Kinang'weli":(-2.9438, 33.7441),
      "Kasoli":(-2.5695, 33.6712),
      "Nyakabindi":(-2.6410, 33.9903),
      "Zagayu":(-2.9655, 33.8096),
      "Mwadobana":(-2.5317, 34.0500),
      "Malili":(-2.4463, 33.8057),
      "Bunamhala":(-2.8534, 34.0623),
      "Lubanga":(-2.6320, 32.4018),
      "Nyaluhande":(-2.5004, 33.6108),
      "Mhango":(-2.7529, 33.9165),
      "Lagangabilili":(-2.9599, 34.1366),
      "Bukoli":(-3.1964, 32.3227),
      "Dutwa":(-2.5098, 33.9680),
      "Kakora":(-2.9657, 33.3346),
      "Mbita":(-2.8447, 33.7874),
      "Kalemela":(-2.2988, 33.7428),
      "Ikungulyabashashi":(-2.6080, 33.9491),
      "Busolwa":(-3.0024, 32.6309),
      "Mwingiro":(-3.1447, 32.5014),
      "Mwaswale":(-2.7824, 34.3522),
      "Kabita":(2.3204, 33.6698),
      "Katoro":(-3.0062, 31.9271),
      "Gamboshi":(-2.6055, 33.7681),
      "Mhunze":(-3.6195, 33.8128),
      "Badugu":(-2.4667, 33.6500),
      "Mwananyili":(-6.788, 39.256),
      "Mwaubingi":(-2.5223, 34.1136),
      "Chigunga":(-2.8140, 31.9338),
      "Shigala":(2.3924, 33.6763),
      "Bariadi":(-2.8070, 33.9917),
      "Kagu":(-2.7274, 32.0121),
      "Kiloleli":(-2.4909, 32.9206),
      "Nyachiluluma":(-2.7818, 31.9197),
      "Nzera":(-2.5214, 32.1561),
      "Bukwimba":(-3.3367, 32.6066),
      "Mtakuja":(-3.4868, 37.3573),
      "Nyamalimbe":(-3.0819, 32.3671),
      "Nyakamwaga":(2.9873, 32.2308),
      "Kaseme":(-3.1270, 31.9495),
      "Kamena":(-3.0724, 32.3249),
      "Sagata":(-2.7446, 34.2375),
      "Ngasamo":(-2.4900, 33.8442),
      "Lwamgasa":(-3.1166, 32.0417),
      "Kafita":(-3.2349, 32.5852),
      "Nyakagomba":(-2.8697, 31.9421),
      "Ihanamilo":(-2.9501, 32.2677),
      "Kamhanga":(-2.7346, 32.3087),
      "Busanda":(-3.0187, 32.1119),
      "Nyugwa":(-3.2237, 32.7380),
      "Nkome":(-2.5009, 32.0083),
      "Shabaka":(-2.9942, 32.4536),
      "Bukondo":(-2.6712, 31.9189),
      "Senga":(-4.4000, 34.6667)}

In [231]:
#Sanity check
missing_coords = list(df.loc[df['longitude'] == 0]["ward"].value_counts().keys())
missing_coords == list(coords.keys())

True

In [1]:
def get_lat(x):
    return coords[x][0]
def get_long(x):
    return coords[x][1]

In [2]:
df_miss = df.loc[df['longitude'] == 0]

NameError: name 'df' is not defined

In [3]:
df.loc[df['longitude'] == 0]["latitude"] = df_miss["ward"].apply(get_lat)
df.loc[df['longitude'] == 0]["longitude"] = df_miss["ward"].apply(get_long)

NameError: name 'df_miss' is not defined