# Premise

## Who we are: Aqua Partners - Public Sector Consulting Firm specialized in economic development

## Stakeholder: Tanzanian Government

## Objective: Assist the government in improving the functionality of future water pumps installed in the country, using data on the existing range of infrastructure

## Dataset: Status and features of waterpumps installed in Tanzania to provide water supply to cities / villages



# Definitions 

amount_tsh - Total static head (amount of water available to the waterpoint) / date_recorded - The date the row was entered

funder - Who funded the well / gps_height - Altitude of the well

installer - Organization that installed the well / longitude - GPS coordinate 

latitude - GPS coordinate / wpt_name - Name of the waterpoint if there is one

basin - Geographic water basin / subvillage - Geographic location / region - Geographic location

region_code - Geographic location (coded) / district_code - Geographic location (coded)

lga - Geographic location / ward - Geographic location / population - Population around the well

public_meeting - True/False / recorded_by - Group entering this row of data / scheme_management - Who operates the waterpoint

scheme_name - Who operates the waterpoint / permit - If the waterpoint is permitted

construction_year - Year the waterpoint was constructed / extraction_type - The kind of extraction the waterpoint uses

extraction_type_group - The kind of extraction the waterpoint uses / extraction_type_class - The kind of extraction the waterpoint uses

management - How the waterpoint is managed / management_group - How the waterpoint is managed

payment - What the water costs / payment_type - What the water costs

water_quality - The quality of the water / quality_group - The quality of the water

quantity - The quantity of water / quantity_group - The quantity of water

source - The source of the water / source_type - The source of the water

source_class - The source of the water / waterpoint_type - The kind of waterpoint

waterpoint_type_group - The kind of waterpoint

# Package Installation

In [1]:
import pandas as pd
import numpy as np
import geopandas as geo
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
from IPython.display import Image  
from pydotplus import graph_from_dot_data
from IPython.display import display
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from ml_repo import scores
from ml_repo import roc_plot
from ml_repo import annot

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


ModuleNotFoundError: No module named 'ml_repo'

# Dataset

The labels in this dataset are simple. There are three possible values:
functional - the waterpoint is operational and there are no repairs needed
functional needs repair - the waterpoint is operational, but needs repairs
non functional - the waterpoint is not operational

In [None]:
# Read the feature table, the label table and the additional X1 table.
X, y, X1 = (pd.read_csv(path) for path in ('Data/X.csv', 'Data/Y.csv', 'Data/X1.csv'))

# Attach each row's label to its features through the shared 'id' column.
df = y.merge(X, on='id')

In [None]:
# Cumulative percentage of rows covered by subvillages, taken in the order
# returned by value_counts (most frequent first).
subvillage_share = df['subvillage'].value_counts(normalize=True) * 100
subvillage_share.cumsum().plot()

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Fill missing categorical values with an explicit 'other' level. The result is
# assigned back to the column: calling fillna(inplace=True) on a column
# selection is not guaranteed to update the parent DataFrame.
for col in ['installer', 'funder', 'scheme_management']:
    df[col] = df[col].fillna('other')

# Missing flags are treated as False. Values are stored uniformly as the
# strings 'True' / 'False' so a column never mixes booleans with the string
# 'False', which would otherwise produce two distinct "False" categories.
for col in ['public_meeting', 'permit']:
    df[col] = df[col].fillna(False).astype(str)

# Drop the identifier and the columns excluded from the analysis.
df.drop(
    columns=[
        'recorded_by', 'extraction_type_group', 'extraction_type_class',
        'payment', 'quantity_group', 'source', 'source_class', 'id',
        'num_private', 'waterpoint_type_group', 'wpt_name', 'scheme_name',
        'amount_tsh',
    ],
    inplace=True,
)

# Combine the region name and its numeric code into one categorical feature.
df['region_and_code'] = df['region'] + df['region_code'].astype(str)
df.drop(columns=['region', 'region_code'], inplace=True)

# Replace zero populations with the mean population of the row's district.
# The per-district means are mapped back onto the rows so that each row gets
# its own district's value (Series.replace cannot take a Series as the value).
# NOTE(review): the mean still includes the zero rows, as in the original
# expression - confirm whether zeros should be excluded from it.
district_mean_population = df.groupby(['district_code']).population.mean()
df['population'] = df['population'].mask(
    df['population'] == 0,
    df['district_code'].map(district_mean_population),
)


In [None]:
# Replace zero construction years with the column median.
# NOTE(review): zero is presumably a "not recorded" placeholder - confirm.
i = df.construction_year.median()
df['construction_year'] = df['construction_year'].replace(0, i)

# Replace zero altitudes with the mean altitude of the row's basin. The basin
# means are mapped back onto the rows so every row receives its own basin's
# value (Series.replace cannot take a Series as the replacement value).
# NOTE(review): a gps_height of 0 could also be a genuine sea-level reading.
j = df.groupby(['basin']).gps_height.mean()
df['gps_height'] = df['gps_height'].mask(df['gps_height'] == 0, df['basin'].map(j))



In [None]:
# Split the recording date into year and month (both kept as strings).
# Assumes date_recorded is formatted 'YYYY-MM-DD', consistent with the year
# being taken from the first four characters - TODO confirm.
df['year'] = df['date_recorded'].str[:4]
# Characters 5-6 hold the two-digit month; a single index such as [-4] would
# only return the month's last digit (e.g. '1' for both January and November).
df['month'] = df['date_recorded'].str[5:7]
df.drop(columns=['date_recorded'], inplace=True)

# Years between construction and the recording date.
df['length_operation'] = df['year'].astype(int) - df['construction_year'].astype(int)

In [None]:
# Columns whose first value is a Python string are handled as categorical.
cols = [name for name in df.columns if type(df[name].iloc[0]) is str]

# For every cell, look up how often its value occurs within its own column,
# then keep only values seen more than 50 times; the rest become "other".
level_counts = df[cols].apply(lambda column: column.map(column.value_counts()))
df[cols] = df[cols].where(level_counts > 50, "other")

In [None]:
# Number of distinct values per column.
df.nunique()

# Data Transformation

In [None]:
# Rescale the continuous features with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full frame, before any
# train/validation/test split is made.
continuous_cols = ['population', 'gps_height', 'length_operation']
scaler = MinMaxScaler()
conti = df.loc[:, continuous_cols]  # snapshot of the values before scaling
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])
df

In [None]:
# List the columns available after cleaning and scaling.
df.columns

In [None]:
# Target: the recorded status of each waterpoint.
y = df['status_group']

# Features that are one-hot encoded. They are passed to get_dummies by name,
# so each dummy column is prefixed with the column it actually came from and
# coded identifiers (district_code, construction_year) are encoded as
# categories regardless of their dtype. year and month are strings and are
# encoded as well so the design matrix is fully numeric.
categorical_cols = [
    'funder', 'installer', 'basin', 'subvillage', 'district_code', 'lga',
    'ward', 'public_meeting', 'scheme_management', 'permit',
    'construction_year', 'extraction_type', 'management', 'management_group',
    'payment_type', 'water_quality', 'quality_group', 'quantity',
    'source_type', 'waterpoint_type', 'region_and_code', 'year', 'month',
]
# Continuous features are kept as they are.
continuous_cols = ['gps_height', 'population', 'length_operation']

X = df.loc[:, categorical_cols + continuous_cols]
X1 = pd.get_dummies(X, columns=categorical_cols)

# Binary target: 1 when the waterpoint is 'functional', 0 for the other
# statuses. Stated explicitly rather than taking the first column of a dummy
# frame, which depends on the alphabetical order of the labels.
y1 = (y == 'functional').astype(int).rename('functional')
X1.shape

# Initial split of DataSet (Train / Test set)

In [None]:
# Hold out 20% of the rows as the final test set ...
X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=42, test_size=0.2)
# ... then carve a validation set (20% of the remainder) out of the training rows.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Cross-validation folds for the grid search. random_state only has an effect
# (and is only accepted by scikit-learn) when shuffle=True.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model - Decision Tree

In [None]:
%%time

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train,y_train)

scores(model,X_train,X_val,y_train,y_val)
roc_plot(model,X_train,y_train,X_val,y_val)

In [None]:
%%time
clf = DecisionTreeClassifier()
param_grid = {'criterion':['gini','entropy'], 'max_depth': [2,3,4], 'min_samples_leaf':[1500, 2000, 2500]}
gs = GridSearchCV(clf, param_grid, cv = skf)

gs.fit(X_train, y_train)
gs.best_params_

In [None]:
# Accuracy of the best grid-search estimator on the rows it was fitted on ...
y_preds = gs.predict(X_train)
print('Train accuracy: ', accuracy_score(y_train, y_preds))

# ... and on the held-out validation rows, which is the figure that indicates
# how well the tuned tree generalises.
y_val_preds = gs.predict(X_val)
print('Validation accuracy: ', accuracy_score(y_val, y_val_preds))

In [None]:
# Fit a small tree for visualisation.
# NOTE(review): the hyper-parameters are hard-coded - presumably the values
# selected by the grid search; confirm against gs.best_params_.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=1500)
clf.fit(X_train, y_train)  # categorical features were encoded with pd.get_dummies()

from sklearn.tree import export_graphviz
# Export as dot file. The tree is trained on the binary target, so exactly two
# class names are given, in ascending order of the encoded class
# (0 = not functional, 1 = functional).
export_graphviz(clf, out_file='tree.dot',
                feature_names=list(X_train.columns),
                class_names=['not functional', 'functional'],
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert to png; check=True raises if Graphviz 'dot' fails instead of
# silently leaving a missing or stale tree.png.
import subprocess
subprocess.run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'], check=True)

# Display in jupyter notebook
from IPython.display import Image
Image(filename='tree.png')

# Model - Logistic Regression 

In [None]:
%%time

logreg = LogisticRegression(fit_intercept = False, C = 1e15, solver='liblinear')
model_log = logreg.fit(X_train, y_train)
model_log

In [None]:
# Fitted coefficients of the logistic regression model.
model_log.coef_

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for
# the training and validation rows.
train_prob, val_prob = (
    model_log.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve on the training and validation splits.
train = roc_auc_score(y_train, train_prob)
val = roc_auc_score(y_val, val_prob)
print(f"train:  {round(train, 2)} // validation:  {round(val, 2)}")

In [None]:
from sklearn.metrics import roc_curve
plt.figure(figsize=(7, 7))

# One ROC curve per split. Each line carries its own label so the legend
# cannot drift out of sync with what is drawn.
for name, y_true, y_prob in [('train', y_train, train_prob), ('val', y_val, val_prob)]:
    fpr, tpr, threshold = roc_curve(y_true, y_prob)
    plt.plot(fpr, tpr, label=name)

# annot is a helper from the project's ml_repo module. It runs after the loop,
# so it receives the validation curve only.
# NOTE(review): confirm that annotating only the validation curve is intended.
annot(fpr, tpr, threshold)

# Diagonal reference line: the ROC curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='chance')
plt.ylabel('TPR (power)')
plt.xlabel('FPR (alpha)')
plt.legend()
plt.show()

# Model - Random Forest

In [None]:
from ml_repo import roc_plot


# Model - K-Nearest-Neighbors

# Model - Support Vector Machines