In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

# Load Dataset

In [4]:
! ls ../Data

test_set_values.csv     training_set_labels.csv training_set_values.csv


Dataset comes split in 3 different files, training values, training labes and test values.
since we dont have a file for test-labels, We need to merge training values and training labes, re-split and we'll get test data needed to run a confusion matrix and get scores.

In [5]:
df_X = pd.read_csv('../data/training_set_values.csv')

In [6]:
df_X.shape

(59400, 40)

In [7]:
df_X.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [8]:
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [14]:
df_X.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [None]:
At first sight, we have Null values in some of the columns.

## Load training labels dataset


In [9]:
df_y = pd.read_csv('../data/training_set_labels.csv')

In [10]:
df_y.shape

(59400, 2)

In [11]:
df_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [12]:
df_y['status_group'].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

Will have to work with a multiclass model, we have 3 different classes

In [20]:
df_y['status_group'].value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

We could replace 'functional needs repair'  with either of the other classes just try performance on a binary model

## Data exploration 

merging target and features into a single dataframe, just in case rows need to be dropped, and later spliting it 

In [13]:
df_raw = df_X.merge(df_y, on='id')

In [14]:
df_raw

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59396,27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59397,37057,0.0,2011-04-11,,0,,34.017087,-8.750434,Mashine,0,...,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional
59398,31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional


## Meaning provided for each of the columns on [here](https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/25/)

`amount_tsh` - Total static head (amount water available to waterpoint)

`date_recorded` - The date the row was entered

`funder` - Who funded the well

`gps_height` - Altitude of the well

`installer` - Organization that installed the well

`longitude` - GPS coordinate

`latitude` - GPS coordinate

`wpt_name` - Name of the waterpoint if there is one

`num_private` -

`basin` - Geographic water basin

`subvillage` - Geographic location

`region` - Geographic location

`region_code` - Geographic location (coded)

`district_code` - Geographic location (coded)

`lga` - Geographic location

`ward` - Geographic location

`population` - Population around the well

`public_meeting` - True/False

`recorded_by` - Group entering this row of data

`scheme_management` - Who operates the waterpoint

`scheme_name` - Who operates the waterpoint

`permit` - If the waterpoint is permitted

`construction_year` - Year the waterpoint was constructed

`extraction_type` - The kind of extraction the waterpoint uses

`extraction_type_group` - The kind of extraction the waterpoint uses

`extraction_type_class` - The kind of extraction the waterpoint uses

`management` - How the waterpoint is managed

`management_group` - How the waterpoint is managed

`payment` - What the water costs

`payment_type` - What the water costs

`water_quality` - The quality of the water

`quality_group` - The quality of the water

`quantity` - The quantity of water

`quantity_group` - The quantity of water

`source` - The source of the water

`source_type` - The source of the water

`source_class` - The source of the water

`waterpoint_type` - The kind of waterpoint

`waterpoint_type_group` - The kind of waterpoint

In [18]:
df_raw.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

By team decision, i will take care of following nulls, and find an apropiate way to proceed 

`subvillage`...................371 nulls

`public_meeting`..........3334 nulls


In [16]:
df_raw['public_meeting']

0        True
1         NaN
2        True
3        True
4        True
         ... 
59395    True
59396    True
59397    True
59398    True
59399    True
Name: public_meeting, Length: 59400, dtype: object

There's no information on what `public_meeting` column represents, unable to find any useful info on neither source page or after runing web search. Will possibly drop column

Looking into `subvillage` column, trying to find pattern and fill null values accordingly

In [15]:
df_subvillage_null = df_raw[df_raw['subvillage'].isnull()]

In [18]:
df_subvillage_null

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
342,21127,0.0,2011-03-16,Government Of Tanzania,0,North,36.557631,-6.233394,Kwa Mihinzo,0,...,soft,good,dry,dry,spring,spring,groundwater,communal standpipe multiple,communal standpipe,non functional
360,51558,0.0,2011-03-25,Commu,0,Commu,36.416701,-6.220157,Kwa Emanuel,0,...,soft,good,dry,dry,spring,spring,groundwater,communal standpipe,communal standpipe,non functional
379,53847,0.0,2011-03-20,World Bank,0,Rhobi,36.729383,-6.084255,Kwa Dimanyi,0,...,salty,salty,insufficient,insufficient,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,functional
565,27334,0.0,2011-03-18,World Bank,0,Rhoda,36.696881,-5.993192,Mpande,0,...,salty,salty,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
966,17088,0.0,2011-03-11,Water,0,Commu,36.322623,-6.030500,Kwa Charles,0,...,salty,salty,enough,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59008,16353,0.0,2011-03-23,Commu,0,Commu,36.770490,-6.288555,Kwa Mlima,0,...,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59091,45206,0.0,2011-03-14,Lvia,0,Commu,36.407974,-5.715084,Kwa Emson,0,...,salty,salty,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,non functional
59105,12248,0.0,2011-03-19,World Bank,0,Rhobi,36.889359,-5.959966,Kwa Mahimbo,0,...,salty,salty,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
59215,46441,0.0,2011-03-19,World Bank,0,Rhobi,36.854216,-6.010508,Kwa Sila,0,...,salty,salty,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional


Dataframe has 41 columns, it doesnt display all at once, will print not shown columns below, and take a look at them

In [19]:
df_subvillage_null.iloc[:,10:31]

Unnamed: 0,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,...,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type
342,Wami / Ruvu,,Dodoma,1,3,Kongwa,Sagara,0,True,GeoData Consultants Ltd,...,Saga,False,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay
360,Wami / Ruvu,,Dodoma,1,3,Kongwa,Kongwa Urban,0,True,GeoData Consultants Ltd,...,Mlan,False,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay
379,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd,...,Pand,False,0,submersible,submersible,submersible,vwc,user-group,pay per bucket,per bucket
565,Wami / Ruvu,,Dodoma,1,3,Kongwa,Njoge,0,True,GeoData Consultants Ltd,...,Hemb,False,0,submersible,submersible,submersible,vwc,user-group,pay per bucket,per bucket
966,Wami / Ruvu,,Dodoma,1,3,Kongwa,Sejeli,0,True,GeoData Consultants Ltd,...,Seje,True,0,mono,mono,motorpump,vwc,user-group,pay per bucket,per bucket
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59008,Wami / Ruvu,,Dodoma,1,3,Kongwa,Mlali,0,True,GeoData Consultants Ltd,...,Mlal,False,0,gravity,gravity,gravity,vwc,user-group,never pay,never pay
59091,Wami / Ruvu,,Dodoma,1,3,Kongwa,Zoissa,0,True,GeoData Consultants Ltd,...,Zois,False,0,mono,mono,motorpump,vwc,user-group,pay per bucket,per bucket
59105,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd,...,Kite,False,0,submersible,submersible,submersible,vwc,user-group,never pay,never pay
59215,Wami / Ruvu,,Dodoma,1,3,Kongwa,Pandambili,0,True,GeoData Consultants Ltd,...,Kite,False,0,submersible,submersible,submersible,vwc,user-group,never pay,never pay


Seems like most null values in the `subvillage` column are from region Dodoma, and could be posible not village but just wards near the capital, dodoma, that could explain the Null value

In [20]:
df_subvillage_null['region'].value_counts()

Dodoma    361
Mwanza     10
Name: region, dtype: int64

Out of all `subvillage` null values, 361 show Dodoma on `region`, other 10 show Mwanza.
All show the same region code, and district_code, it is safe to assume all null values represent a single value, maybe an area not identified as a village.

looks like `distric_code` is actually representing the region, http://www.statoids.com/utz.html 
we could drop one of those columns, or both in case the geographical data in not needed


`amount_tsh` shows lots of 0s, and in some data points the well doesnt have any water but the mecanism to make it work is still functional..?

In [21]:
df_subvillage_null['amount_tsh'].value_counts()

0.0    371
Name: amount_tsh, dtype: int64

In [22]:
df_raw['amount_tsh'].value_counts()

0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64

All rows where subvillage is null show a 0 on the `amount_tsh` column, but on the whole dataset it goes up to 41 thousand rows showing a 0 value,  we might have to drop it

In [84]:
df_raw['management'].value_counts()

vwc                 40507
wug                  6515
water board          2933
wua                  2535
private operator     1971
parastatal           1768
water authority       904
other                 844
company               685
unknown               561
other - school         99
trust                  78
Name: management, dtype: int64

In [75]:
df_raw['population'].value_counts()

0       21381
1        7025
200      1940
150      1892
250      1681
        ...  
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64

In [79]:
df_raw


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,2013-05-03,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59396,27263,4700.0,2011-05-07,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59397,37057,0.0,2011-04-11,,0,,34.017087,-8.750434,Mashine,0,...,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional
59398,31282,0.0,2011-03-08,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [78]:
df_raw.iloc[:,10:31]

Unnamed: 0,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,...,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type
0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,...,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually
1,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,...,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay
2,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,...,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket
3,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,...,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay
4,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,...,,True,0,gravity,gravity,gravity,other,other,never pay,never pay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,Pangani,Kiduruni,Kilimanjaro,3,5,Hai,Masama Magharibi,125,True,GeoData Consultants Ltd,...,Losaa Kia water supply,True,1999,gravity,gravity,gravity,water board,user-group,pay per bucket,per bucket
59396,Rufiji,Igumbilo,Iringa,11,4,Njombe,Ikondo,56,True,GeoData Consultants Ltd,...,Ikondo electrical water sch,True,1996,gravity,gravity,gravity,vwc,user-group,pay annually,annually
59397,Rufiji,Madungulu,Mbeya,12,7,Mbarali,Chimala,0,True,GeoData Consultants Ltd,...,,False,0,swn 80,swn 80,handpump,vwc,user-group,pay monthly,monthly
59398,Rufiji,Mwinyi,Dodoma,1,4,Chamwino,Mvumi Makulu,0,True,GeoData Consultants Ltd,...,,True,0,nira/tanira,nira/tanira,handpump,vwc,user-group,never pay,never pay


# My notes, ( will delete these at the end)
things in blue(#) were adressed unsing proper markdowns 


### Useless columns so far:

num_private..................................58643 rows showing a 0 value

water_quality	quality_group................seem like duplicate columns with minor changes (soft = good)

quantity	quantity_group...................seems like duplicate columns

source	 source_type..........................Duplicate

waterpoint_type  	 waterpoint_type_group....Duplicate

#### region and  district_code......................duplicate, per this webpage, distric code is assigned by region http://www.statoids.com/utz.html

extraction_type
extraction_type_group	   extraction_type_class......will keep extraction_type and extraction_type_class

population ....................................shows to many   0 - 21381,   1 - 7025

construction year..............................20,000  rows with 0 as value     

wpt_name ........................................not useful, just the wells name

date_recorded......................................not usefull

recorded_by........................................not usefull

payment	payment_type ..............................duplicate


pending to drop:

funder	gps_height	installer	longitude	latitude   scheme_management   scheme_name

## NOTES 

#### seems like all null values in the village  column most of them are from region dodoma, and could be posible not village but just wards  near the capital, dodoma


#### we could replace / impute na with nearest neigbors ,  dealing with null on subvillage, most rows with missing subvillage have district_code 3 so it's safe to assume all those rows belong to the same subvillage


#### seems like amount_tsh have alot of 0, the well doesnt have any water but the mecanism to make it work is still functional..?


361 rows with null value on village show no construction year, but over all is best to drop construction year since we have 20 thousand 0 values on it


#### public_meeting - unable to find meaning behind it, 3334 values missing






# Drop duplicates and not useful columns

In [127]:
df_Dropped = df_raw.drop(columns=['num_private', 'water_quality', 'quantity', 'source_type', 
                                  'waterpoint_type', 'region_code', 'extraction_type_group',
                                  'population', 'construction_year', 'public_meeting', 'wpt_name',
                                  'recorded_by', 'payment', 'amount_tsh', 'funder', 'installer', 
                                  'gps_height', 'longitude', 'latitude', 'subvillage', 'region', 
                                  'district_code', 'lga'])

df_Dropped

Unnamed: 0,id,date_recorded,basin,ward,scheme_management,scheme_name,permit,extraction_type,extraction_type_class,management,management_group,payment_type,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group
0,69572,2011-03-14,Lake Nyasa,Mundindi,VWC,Roman,False,gravity,gravity,vwc,user-group,annually,good,enough,spring,groundwater,communal standpipe,functional
1,8776,2013-03-06,Lake Victoria,Natta,Other,,True,gravity,gravity,wug,user-group,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional
2,34310,2013-02-25,Pangani,Ngorika,VWC,Nyumba ya mungu pipe scheme,True,gravity,gravity,vwc,user-group,per bucket,good,enough,dam,surface,communal standpipe,functional
3,67743,2013-01-28,Ruvuma / Southern Coast,Nanyumbu,VWC,,True,submersible,submersible,vwc,user-group,never pay,good,dry,machine dbh,groundwater,communal standpipe,non functional
4,19728,2011-07-13,Lake Victoria,Nyakasimbi,,,True,gravity,gravity,other,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,2013-05-03,Pangani,Masama Magharibi,Water Board,Losaa Kia water supply,True,gravity,gravity,water board,user-group,per bucket,good,enough,spring,groundwater,communal standpipe,functional
59396,27263,2011-05-07,Rufiji,Ikondo,VWC,Ikondo electrical water sch,True,gravity,gravity,vwc,user-group,annually,good,enough,river,surface,communal standpipe,functional
59397,37057,2011-04-11,Rufiji,Chimala,VWC,,False,swn 80,handpump,vwc,user-group,monthly,fluoride,enough,machine dbh,groundwater,hand pump,functional
59398,31282,2011-03-08,Rufiji,Mvumi Makulu,VWC,,True,nira/tanira,handpump,vwc,user-group,never pay,good,insufficient,shallow well,groundwater,hand pump,functional


## will create a column showing if the record was taken on dry month or other, using the date_recorder column


https://www.nathab.com/know-before-you-go/african-safaris/east-africa/weather-climate/tanzania/#:~:text=The%20dry%20season%2C%20with%20cooler,a%20chance%20of%20afternoon%20showers.


The dry season, with cooler temperatures, lasts from May to October.

Summer usually lasts from November to March, and during this time there is always a chance of afternoon showers. 

and april shows the highes average of inches of rain per month

jan is considered a short dry season between, months that rains alot.




In [113]:
df_Dropped['date_recorded'][1][5:7]

'03'

https://altezza.travel/en/articles/weather

In [128]:
df_Dropped['season'] = pd.DatetimeIndex(df_Dropped['date_recorded']).month

In [129]:
df_Dropped ['precipitation (mm)'] = df_Dropped['season'].map({1:49, 2:30, 3:85, 4:153, 5:126, 6:32,
                                                         7:13, 8:18, 9:21, 10:48, 11:132, 12:75})
df_Dropped.head ( )

Unnamed: 0,id,date_recorded,basin,ward,scheme_management,scheme_name,permit,extraction_type,extraction_type_class,management,management_group,payment_type,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group,season,precipitation (mm)
0,69572,2011-03-14,Lake Nyasa,Mundindi,VWC,Roman,False,gravity,gravity,vwc,user-group,annually,good,enough,spring,groundwater,communal standpipe,functional,3,85
1,8776,2013-03-06,Lake Victoria,Natta,Other,,True,gravity,gravity,wug,user-group,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,3,85
2,34310,2013-02-25,Pangani,Ngorika,VWC,Nyumba ya mungu pipe scheme,True,gravity,gravity,vwc,user-group,per bucket,good,enough,dam,surface,communal standpipe,functional,2,30
3,67743,2013-01-28,Ruvuma / Southern Coast,Nanyumbu,VWC,,True,submersible,submersible,vwc,user-group,never pay,good,dry,machine dbh,groundwater,communal standpipe,non functional,1,49
4,19728,2011-07-13,Lake Victoria,Nyakasimbi,,,True,gravity,gravity,other,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,7,13


In [130]:
df_Dropped.drop(columns=['season', 'date_recorded'], inplace=True)

In [131]:
df_Dropped

Unnamed: 0,id,basin,ward,scheme_management,scheme_name,permit,extraction_type,extraction_type_class,management,management_group,payment_type,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group,precipitation (mm)
0,69572,Lake Nyasa,Mundindi,VWC,Roman,False,gravity,gravity,vwc,user-group,annually,good,enough,spring,groundwater,communal standpipe,functional,85
1,8776,Lake Victoria,Natta,Other,,True,gravity,gravity,wug,user-group,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,85
2,34310,Pangani,Ngorika,VWC,Nyumba ya mungu pipe scheme,True,gravity,gravity,vwc,user-group,per bucket,good,enough,dam,surface,communal standpipe,functional,30
3,67743,Ruvuma / Southern Coast,Nanyumbu,VWC,,True,submersible,submersible,vwc,user-group,never pay,good,dry,machine dbh,groundwater,communal standpipe,non functional,49
4,19728,Lake Victoria,Nyakasimbi,,,True,gravity,gravity,other,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,Pangani,Masama Magharibi,Water Board,Losaa Kia water supply,True,gravity,gravity,water board,user-group,per bucket,good,enough,spring,groundwater,communal standpipe,functional,126
59396,27263,Rufiji,Ikondo,VWC,Ikondo electrical water sch,True,gravity,gravity,vwc,user-group,annually,good,enough,river,surface,communal standpipe,functional,126
59397,37057,Rufiji,Chimala,VWC,,False,swn 80,handpump,vwc,user-group,monthly,fluoride,enough,machine dbh,groundwater,hand pump,functional,153
59398,31282,Rufiji,Mvumi Makulu,VWC,,True,nira/tanira,handpump,vwc,user-group,never pay,good,insufficient,shallow well,groundwater,hand pump,functional,85


## creating a simple model 

In [164]:
X = df_Dropped.drop(columns=['status_group', 'management', 'management_group', 
                             'scheme_management', 'scheme_name', 'permit'])
y = df_Dropped['status_group']

In [165]:
X_dummie = pd.get_dummies(data= X,  drop_first=True)

In [166]:
X_dummie

Unnamed: 0,id,precipitation (mm),basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,...,source_shallow well,source_spring,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,69572,85,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1,8776,85,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,34310,30,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,67743,49,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,19728,13,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,126,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
59396,27263,126,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
59397,37057,153,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
59398,31282,85,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0


In [168]:
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X_dummie, y)

DecisionTreeClassifier(max_depth=3, random_state=42)

In [171]:
tree_clf.score(X_dummie, y)

0.6764141414141415

In [None]:
LOL