In [45]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve

sub_format = pd.read_csv('../data/SubmissionFormat (1).csv')
test = pd.read_csv('../data/test.csv')
labels = pd.read_csv('../data/labels.csv')
train = pd.read_csv('../data/trainingvalues.csv')

In [46]:
sub_format.head(2)

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label


In [47]:
test.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe


In [48]:
labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [49]:
labels['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [50]:
train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [51]:
train['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [52]:
train['waterpoint_type_group'].value_counts()

communal standpipe    34625
hand pump             17488
other                  6380
improved spring         784
cattle trough           116
dam                       7
Name: waterpoint_type_group, dtype: int64

In [53]:
train['source_class'].value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

In [54]:
train['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [55]:
train.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [56]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [57]:
labels.replace(to_replace='functional needs repair', value='functional', inplace=True)
labels.drop(['id'], axis=1, inplace=True)

In [58]:
labels['status_group'].value_counts()

functional        36576
non functional    22824
Name: status_group, dtype: int64

In [59]:
 train.drop(['id', 'date_recorded', 'funder', 'wpt_name', 'subvillage', 'lga', 'ward', 'public_meeting', 'recorded_by', 
             'scheme_name', 'permit', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group',
             'payment', 'payment_type', 'quality_group', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type_group'],
             axis=1, inplace=True)

In [95]:
X = train
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [96]:
y_train

Unnamed: 0,status_group
43949,non functional
35907,functional
16054,functional
11117,functional
25644,functional
...,...
16663,non functional
36766,functional
15124,non functional
29638,non functional


In [61]:
subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer()),
                           ('ss', StandardScaler())])
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, ['amount_tsh', 'gps_height', 'longitude',
                                                                   'latitude', 'region_code', 'district_code',
                                                                   'construction_year']),
                                     ('subpipe_cat', subpipe_cat, ['installer', 'basin', 'region',
                                                                      'scheme_management', 'extraction_type',
                                                                      'water_quality', 'quantity', 'source',
                                                                      'waterpoint_type'])],
                                     remainder='passthrough', sparse_threshold=0)

KNN models

In [62]:
from sklearn.neighbors import KNeighborsClassifier

In [63]:
knn_model1_pipe = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier())])

In [64]:
knn_model1_pipe.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [65]:
knn_model1_pipe.score(X_train, y_train)

0.8495173961840629

In [97]:
y_pred_knn1 = knn_model1_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_knn1))

                precision    recall  f1-score   support

    functional       0.84      0.91      0.87     27542
non functional       0.83      0.72      0.77     17008

      accuracy                           0.84     44550
     macro avg       0.83      0.81      0.82     44550
  weighted avg       0.83      0.84      0.83     44550



In [66]:
knn_model2_pipe = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier(n_neighbors=50))])

In [67]:
knn_model2_pipe.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [68]:
knn_model2_pipe.score(X_train, y_train)

0.7523456790123457

In [98]:
y_pred_knn2 = knn_model2_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_knn2))

                precision    recall  f1-score   support

    functional       0.74      0.93      0.82     27542
non functional       0.80      0.46      0.59     17008

      accuracy                           0.75     44550
     macro avg       0.77      0.70      0.70     44550
  weighted avg       0.76      0.75      0.73     44550



In [69]:
knn_model3_pipe = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier(n_neighbors=50, metric="manhattan"))])

In [70]:
knn_model3_pipe.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [88]:
knn_model3_pipe.score(X_train, y_train)

0.7614814814814815

In [99]:
y_pred_knn3 = knn_model3_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_knn3))

                precision    recall  f1-score   support

    functional       0.75      0.93      0.83     27542
non functional       0.80      0.49      0.61     17008

      accuracy                           0.76     44550
     macro avg       0.77      0.71      0.72     44550
  weighted avg       0.77      0.76      0.74     44550



In [92]:
knn_model4_pipe = Pipeline(steps=[('ct', CT),
                            ('knn', KNeighborsClassifier(n_neighbors=75, metric="manhattan"))])

In [93]:
knn_model4_pipe.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [94]:
knn_model4_pipe.score(X_train, y_train)

0.7498765432098765

In [100]:
y_pred_knn4 = knn_model4_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_knn4))

                precision    recall  f1-score   support

    functional       0.74      0.93      0.82     27542
non functional       0.80      0.46      0.58     17008

      accuracy                           0.75     44550
     macro avg       0.77      0.69      0.70     44550
  weighted avg       0.76      0.75      0.73     44550



Decision Tree Model

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
dtree1_pipe = Pipeline(steps=[('ct', CT),
                            ('dtree', DecisionTreeClassifier(random_state=42))])

In [74]:
dtree1_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [75]:
dtree1_pipe.score(X_train, y_train)

0.9985634118967452

In [101]:
y_pred_dtree1 = dtree1_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_dtree1))

                precision    recall  f1-score   support

    functional       0.96      0.96      0.96     27542
non functional       0.94      0.94      0.94     17008

      accuracy                           0.95     44550
     macro avg       0.95      0.95      0.95     44550
  weighted avg       0.95      0.95      0.95     44550



In [76]:
dtree2_pipe = Pipeline(steps=[('ct', CT),
                            ('dtree', DecisionTreeClassifier(random_state=42, min_samples_leaf=10))])

In [77]:
dtree2_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [78]:
dtree2_pipe.score(X_train, y_train)

0.8786980920314253

In [102]:
y_pred_dtree2 = dtree2_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_dtree2))

                precision    recall  f1-score   support

    functional       0.87      0.92      0.89     27542
non functional       0.85      0.79      0.82     17008

      accuracy                           0.87     44550
     macro avg       0.86      0.85      0.86     44550
  weighted avg       0.87      0.87      0.87     44550



In [79]:
dtree3_pipe = Pipeline(steps=[('ct', CT),
                            ('dtree', DecisionTreeClassifier(random_state=42, min_samples_leaf=100))])

In [80]:
dtree3_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [81]:
dtree3_pipe.score(X_train, y_train)

0.8068462401795735

In [103]:
y_pred_dtree3 = dtree3_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_dtree3))

                precision    recall  f1-score   support

    functional       0.81      0.90      0.85     27542
non functional       0.81      0.65      0.72     17008

      accuracy                           0.81     44550
     macro avg       0.81      0.78      0.79     44550
  weighted avg       0.81      0.81      0.80     44550



In [82]:
dtree4_pipe = Pipeline(steps=[('ct', CT),
                            ('dtree', DecisionTreeClassifier(random_state=42, min_samples_leaf=100, class_weight="balanced"))])

In [83]:
dtree4_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [84]:
dtree4_pipe.score(X_train, y_train)

0.8020426487093154

In [104]:
y_pred_dtree4 = dtree4_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_dtree4))

                precision    recall  f1-score   support

    functional       0.85      0.82      0.84     27542
non functional       0.73      0.77      0.75     17008

      accuracy                           0.80     44550
     macro avg       0.79      0.79      0.79     44550
  weighted avg       0.80      0.80      0.80     44550



In [85]:
dtree5_pipe = Pipeline(steps=[('ct', CT),
                            ('dtree', DecisionTreeClassifier(random_state=42, min_samples_leaf=100, max_features="sqrt"))])

In [86]:
dtree5_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'longitude', 'latitude',
                                                   'region_code',
                                                   'district_code',
                                                   'construction_year']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_imput

In [87]:
dtree5_pipe.score(X_train, y_train)

0.6306172839506173

In [105]:
y_pred_dtree5 = dtree5_pipe.predict(X_train) 
print(classification_report(y_train, y_pred_dtree5))

                precision    recall  f1-score   support

    functional       0.64      0.95      0.76     27542
non functional       0.60      0.11      0.19     17008

      accuracy                           0.63     44550
     macro avg       0.62      0.53      0.48     44550
  weighted avg       0.62      0.63      0.54     44550

