## Tanzania Water Wells Classification Modelling

### Importing relevant dependancies

In [75]:
# importations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from functions import drop_artefacts_and_nulls, ternary_to_binary, calculate_age
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

#read training data
train_set = pd.read_csv('Data/labelled_train_set.csv')
train_targed = pd.read_csv('Data/train_set_labels.csv')

#train set
train_set.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,needs_repair
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


### Preprocessing

In [76]:
#list column names for categorical and numerical
cat_cols = drop_artefacts_and_nulls(train_set).select_dtypes(include='object').columns
num_cols = drop_artefacts_and_nulls(train_set).select_dtypes(include='number').columns
print('Categorical:\n', cat_cols)
print('Numerical:\n', num_cols)

Categorical:
 Index(['date_recorded', 'installer', 'basin', 'subvillage', 'lga', 'ward',
       'scheme_management', 'permit', 'extraction_type_class',
       'management_group', 'quality_group', 'quantity', 'source',
       'waterpoint_type', 'status_group'],
      dtype='object')
Numerical:
 Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code',
       'district_code', 'population', 'construction_year'],
      dtype='object')


**Transformers**

In [82]:
#function transformer for dropping irrelevant features and nulls
feature_selector = FunctionTransformer(drop_artefacts_and_nulls)

#feature engineering to calculate age
age_transformer = FunctionTransformer(calculate_age)


#feature OneHotEncoder
feat_ohe = OneHotEncoder(drop='first', sparse=False)

#categorical column encoder
cat_ohe_transformer = ColumnTransformer(transformers=[
    ('cat_ohe', feat_ohe, cat_cols)
])

#function transformer to make the problem binary classification
binary_target = FunctionTransformer(ternary_to_binary)

#label encoder for target
label_enc = ColumnTransformer(transformers=[
    ('labenc', LabelEncoder(), ['status_group'],)
], remainder='passthrough')

#column transformer to transform target
target_transformer = ColumnTransformer(transformers=[
    ('binary_target', binary_target, ['status_group']),
])

**Cleaning Pipelines**

In [83]:
# cleaning pipeline
feature_cleaner_norm = Pipeline(steps=[
    ('feat_select', feature_selector),
    ('age_trans', age_transformer)
])
#cleaning pipeline with ohe
feature_cleaner_ohe = Pipeline(steps=[
    ('feat_select', feature_selector), 
    ('feat_ohe', cat_ohe_transformer),
    ('age_trans', age_transformer) 
])

target_cleaner= Pipeline(steps=[
    ('feat_select', feature_selector),
    ('age_trans', age_transformer),
    ('target_trans', target_transformer),
    ('labeller', label_enc)
])


In [84]:
#fit cleaned data into pipeline
clean_X_train = pd.DataFrame(feature_cleaner_norm.fit_transform(train_set))
clean_X_train.head()


Unnamed: 0,amount_tsh,gps_height,installer,longitude,latitude,basin,subvillage,region_code,district_code,lga,...,permit,construction_year,extraction_type_class,management_group,quality_group,quantity,source,waterpoint_type,status_group,age
0,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,11,5,Ludewa,...,False,1999,gravity,user-group,good,enough,spring,communal standpipe,functional,12
1,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,20,2,Serengeti,...,True,2010,gravity,user-group,good,insufficient,rainwater harvesting,communal standpipe,functional,3
2,25.0,686,World vision,37.460664,-3.821329,Pangani,Majengo,21,4,Simanjiro,...,True,2009,gravity,user-group,good,enough,dam,communal standpipe multiple,functional,4
3,0.0,263,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,90,63,Nanyumbu,...,True,1986,submersible,user-group,good,dry,machine dbh,communal standpipe multiple,needs_repair,27
5,20.0,0,DWE,39.172796,-4.765587,Pangani,Moa/Mwereme,4,8,Mkinga,...,True,2009,submersible,user-group,salty,enough,other,communal standpipe multiple,functional,2


In [86]:
#fit target
y_train = pd.DataFrame(target_cleaner.fit_transform(train_set))

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

**Modelling Pipelines**

In [80]:
#Logisti Regression Pipeline

### Modelling 

The models to be built and evaluated are:
1. `logistic regression` (baseline)
2. `decision tree classifier`
3. `random forest classifier`
4. `knn classifier`

### Evaluation 

### Conclusion