## Tanzania Water Wells Classification Modelling

### Importing relevant dependencies

In [143]:
# importations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV,\
    cross_validate
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, plot_confusion_matrix

from functions import drop_artefacts_and_nulls, ternary_to_binary, calculate_age
import warnings

# NOTE(review): this filter silences every warning category for the rest of
# the session, so any library warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the labelled training set and the separate label file from the Data folder.
train_set = pd.read_csv(filepath_or_buffer='Data/labelled_train_set.csv')
train_targed = pd.read_csv(filepath_or_buffer='Data/train_set_labels.csv')

# Preview the first five rows of the training frame.
train_set.head(n=5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,needs_repair
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


### Preprocessing

In [144]:
# List the columns that survive cleaning, grouped by dtype.
# The cleaning helper is applied once and its result reused for both dtype
# selections, instead of cleaning the full training frame twice.
# NOTE(review): assumes drop_artefacts_and_nulls returns the same frame on
# repeated calls with the same input - confirm against functions.py.
cleaned_preview = drop_artefacts_and_nulls(train_set)
cat_cols = cleaned_preview.select_dtypes(include='object').columns
num_cols = cleaned_preview.select_dtypes(include='number').columns
print('Categorical:\n', cat_cols)
print('Numerical:\n', num_cols)

Categorical:
 Index(['date_recorded', 'installer', 'basin', 'subvillage', 'lga', 'ward',
       'scheme_management', 'permit', 'extraction_type_class',
       'management_group', 'quality_group', 'quantity', 'source',
       'waterpoint_type', 'status_group'],
      dtype='object')
Numerical:
 Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code',
       'district_code', 'population', 'construction_year'],
      dtype='object')


In [145]:
# Explicit feature groups handed to the preprocessing ColumnTransformer.
# Categorical features (one-hot encoded downstream).
cat_cols = [
    'installer',
    'basin',
    'subvillage',
    'lga',
    'ward',
    'scheme_management',
    'permit',
    'extraction_type_class',
    'management_group',
    'quality_group',
    'quantity',
    'source',
    'waterpoint_type',
]

# Numerical features (scaled downstream); 'age' is not among the raw numeric
# columns listed from the data and is expected to come from the age transformer.
num_cols = [
    'amount_tsh',
    'gps_height',
    'longitude',
    'latitude',
    'region_code',
    'district_code',
    'population',
    'construction_year',
    'age',
]

**Transformers**

In [146]:
# --- Function transformers wrapping the project helpers ---

# Drops irrelevant features and nulls.
feature_selector = FunctionTransformer(func=drop_artefacts_and_nulls)

# Feature engineering step that calculates age.
age_transformer = FunctionTransformer(func=calculate_age)

# Turns the target into a binary classification problem.
binary_target = FunctionTransformer(func=ternary_to_binary)

# --- Per-dtype transformers ---

# Categorical columns: one-hot encode, ignoring categories not seen during fit.
cat_transformer = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))])

# Numerical columns: standardise with StandardScaler.
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# --- Column transformers ---

# Feature preprocessor: routes the numerical and categorical column groups
# to their respective transformers.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ]
)

# Target preprocessor: applies the binary conversion to the status_group column only.
target_transformer = ColumnTransformer(
    transformers=[
        ('binary_target', binary_target, ['status_group']),
    ]
)

**Cleaning Pipelines**

In [147]:
# Steps shared by every cleaning pipeline: feature selection, then age engineering.
_cleaning_steps = [
    ('feat_select', feature_selector),
    ('age_trans', age_transformer),
]

# Feature-cleaning pipeline.
feature_cleaner_norm = Pipeline(steps=[*_cleaning_steps])

# Feature-cleaning pipeline intended for the one-hot-encoded variant.
# NOTE(review): currently has exactly the same steps as feature_cleaner_norm.
feature_cleaner_ohe = Pipeline(steps=[*_cleaning_steps])

# Target-cleaning pipeline: same cleaning steps, followed by the target transformer.
target_cleaner = Pipeline(steps=[
    *_cleaning_steps,
    ('target_trans', target_transformer),
])


In [148]:
# Run the feature-cleaning pipeline on the raw training frame, then remove the
# target column so that only predictor columns remain.
cleaned_train = pd.DataFrame(feature_cleaner_norm.fit_transform(train_set))
clean_X_train = cleaned_train.drop(columns='status_group')
clean_X_train.head()


Unnamed: 0,amount_tsh,gps_height,installer,longitude,latitude,basin,subvillage,region_code,district_code,lga,...,scheme_management,permit,construction_year,extraction_type_class,management_group,quality_group,quantity,source,waterpoint_type,age
0,6000.0,1390,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,11,5,Ludewa,...,VWC,False,1999,gravity,user-group,good,enough,spring,communal standpipe,12
1,0.0,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,20,2,Serengeti,...,Other,True,2010,gravity,user-group,good,insufficient,rainwater harvesting,communal standpipe,3
2,25.0,686,World vision,37.460664,-3.821329,Pangani,Majengo,21,4,Simanjiro,...,VWC,True,2009,gravity,user-group,good,enough,dam,communal standpipe multiple,4
3,0.0,263,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,90,63,Nanyumbu,...,VWC,True,1986,submersible,user-group,good,dry,machine dbh,communal standpipe multiple,27
5,20.0,0,DWE,39.172796,-4.765587,Pangani,Moa/Mwereme,4,8,Mkinga,...,VWC,True,2009,submersible,user-group,salty,enough,other,communal standpipe multiple,2


In [149]:
# Re-derive the column groups from the cleaned predictors, split by dtype.
# Rebinding these names does not alter the column lists the preprocessor was
# constructed with earlier.
cat_cols = clean_X_train.select_dtypes(include=['object']).columns
num_cols = clean_X_train.select_dtypes(include=['number']).columns
for heading, columns in (('Categorical:\n', cat_cols), ('Numerical:\n', num_cols)):
    print(heading, columns)

Categorical:
 Index(['installer', 'basin', 'subvillage', 'lga', 'ward', 'scheme_management',
       'permit', 'extraction_type_class', 'management_group', 'quality_group',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')
Numerical:
 Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code',
       'district_code', 'population', 'construction_year', 'age'],
      dtype='object')


In [150]:
# Run the target-cleaning pipeline on the raw training frame and wrap the
# transformed target in a DataFrame.
target_values = target_cleaner.fit_transform(train_set)
clean_y = pd.DataFrame(target_values)
clean_y.head()

Unnamed: 0,0
0,functional
1,functional
2,functional
3,needs_repair
4,functional


In [151]:
# Hold out 25% of the cleaned rows as a test split; the fixed random_state
# keeps the partition the same across runs.
split_parts = train_test_split(
    clean_X_train,
    clean_y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

Unnamed: 0,amount_tsh,gps_height,installer,longitude,latitude,basin,subvillage,region_code,district_code,lga,...,scheme_management,permit,construction_year,extraction_type_class,management_group,quality_group,quantity,source,waterpoint_type,age
55777,0.0,0,DWE,31.152609,-1.371928,Lake Victoria,Kyabulondo,18,1,Karagwe,...,VWC,True,0,gravity,user-group,good,insufficient,river,communal standpipe,2011
27609,0.0,361,DWE,37.797958,-6.959244,Wami / Ruvu,Nemele,5,2,Morogoro Rural,...,VWC,True,1982,handpump,user-group,unknown,seasonal,shallow well,hand pump,29
40682,6500.0,1776,Dmdd,35.350637,-4.466087,Internal,Fisa,21,2,Hanang,...,VWC,True,2003,gravity,user-group,good,enough,spring,communal standpipe,10
1146,500.0,1011,RWE,29.676655,-4.762459,Lake Tanganyika,Bwelu A,16,3,Kigoma Rural,...,VWC,True,1985,gravity,user-group,good,insufficient,river,communal standpipe multiple,28
52747,0.0,0,Hesawa,33.288758,-3.035905,Lake Victoria,Nkola,19,4,Kwimba,...,VWC,True,0,handpump,user-group,good,enough,shallow well,hand pump,2011


In [154]:
# Encode the string class labels in y_train as integers.
# LabelEncoder and the estimators fitted later expect a 1-D target of shape
# (n_samples,). The single-column target is therefore flattened before
# encoding, and the encoded result is kept 1-D instead of being reshaped
# back into an (n_samples, 1) column vector.
# np.ravel accepts both a DataFrame and an ndarray, so re-running this cell
# on an already-encoded y_train still works.
le = LabelEncoder()
y_train = le.fit_transform(np.ravel(y_train))

### Modelling 

The models to be built and evaluated are:
1. `logistic regression` (baseline)
2. `decision tree classifier`
3. `random forest classifier`
4. `knn classifier`

**Modelling Pipelines**

In [155]:
# Baseline model: the shared preprocessor followed by a logistic regression
# with class_weight='balanced'.
logreg_pipe = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('logreg', LogisticRegression(class_weight='balanced')),
    ]
)

In [156]:
# Fit the preprocessing steps and the logistic regression on the training split.
logreg_pipe.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'population',
                                                   'construction_year',
                                                   'age']),
                                                 ('cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                          

In [157]:
# Cross-validate the baseline pipeline on the training split, recording
# accuracy and precision per fold (cv is left at the library default).
res = cross_validate(
    estimator=logreg_pipe,
    X=X_train,
    y=y_train,
    scoring=['accuracy', 'precision'],
)

In [158]:
# Per-fold test accuracy from the cross-validation results.
res['test_accuracy']

array([0.78205128, 0.79434851, 0.7926207 , 0.78777967, 0.78961141])

In [None]:
#KNN pipeline

In [None]:
#Decision Tree pipeline

In [None]:
#Random Forest Pipeline

### Evaluation 

### Conclusion