<a href="https://colab.research.google.com/github/Nolanole/Tanzania-Well-Water-Project/blob/master/Notebook_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install category_encoders

In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import category_encoders as ce

#raise pandas display limits so wide / long frames are not truncated in cell output:
for option, limit in {'display.max_rows': 500,
                      'display.max_columns': 500,
                      'display.width': 1000}.items():
  pd.set_option(option, limit)

In [0]:
#Files

#every CSV is served from the master branch of the project repository:
repo_raw_root = 'https://raw.githubusercontent.com/Nolanole/Tanzania-Well-Water-Project/master'

train_features_url = f'{repo_raw_root}/train_features.csv'
test_features_url = f'{repo_raw_root}/test_features.csv'
train_labels_url = f'{repo_raw_root}/train_labels.csv'
sample_submission_url = f'{repo_raw_root}/sample_submission.csv'

In [0]:
#initial dataframes:

train = pd.read_csv(train_features_url)
test = pd.read_csv(test_features_url)
train_target = pd.read_csv(train_labels_url)

#merge train and target on the shared `id` key. Naming the key explicitly avoids
#an accidental join on any other column the two files might share, and
#validate= makes pandas raise instead of silently duplicating rows if an id
#is ever repeated in either file:
train = train.merge(train_target, on='id', how='inner', validate='one_to_one')

## Initial Data Exploration

In [5]:
#(rows, columns) of the merged train frame, the test features and the labels
tuple(frame.shape for frame in (train, test, train_target))

((59400, 41), (14358, 40), (59400, 2))

In [6]:
#first five rows, transposed so each feature is displayed on its own line
train.head().transpose()

Unnamed: 0,0,1,2,3,4
id,69572,8776,34310,67743,19728
amount_tsh,6000,0,25,0,0
date_recorded,2011-03-14,2013-03-06,2013-02-25,2013-01-28,2011-07-13
funder,Roman,Grumeti,Lottery Club,Unicef,Action In A
gps_height,1390,1399,686,263,0
installer,Roman,GRUMETI,World vision,UNICEF,Artisan
longitude,34.9381,34.6988,37.4607,38.4862,31.1308
latitude,-9.85632,-2.14747,-3.82133,-11.1553,-1.82536
wpt_name,none,Zahanati,Kwa Mahundi,Zahanati Ya Nanyumbu,Shuleni
num_private,0,0,0,0,0


In [7]:
#number of missing values per column
train.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [8]:
#summary statistics for the numeric columns, one feature per row
train.describe(include='number').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,59400.0,37115.131768,21453.128371,0.0,18519.75,37061.5,55656.5,74247.0
amount_tsh,59400.0,317.650385,2997.574558,0.0,0.0,0.0,20.0,350000.0
gps_height,59400.0,668.297239,693.11635,-90.0,0.0,369.0,1319.25,2770.0
longitude,59400.0,34.077427,6.567432,0.0,33.090347,34.908743,37.178387,40.34519
latitude,59400.0,-5.706033,2.946019,-11.64944,-8.540621,-5.021597,-3.326156,-2e-08
num_private,59400.0,0.474141,12.23623,0.0,0.0,0.0,0.0,1776.0
region_code,59400.0,15.297003,17.587406,1.0,5.0,12.0,17.0,99.0
district_code,59400.0,5.629747,9.633649,0.0,2.0,3.0,5.0,80.0
population,59400.0,179.909983,471.482176,0.0,0.0,25.0,215.0,30500.0
construction_year,59400.0,1300.652475,951.620547,0.0,0.0,1986.0,2004.0,2013.0


In [9]:
#count / unique / top / freq for the non-numeric columns, one feature per row
train.describe(exclude='number').transpose()

Unnamed: 0,count,unique,top,freq
date_recorded,59400,356,2011-03-15,572
funder,55765,1897,Government Of Tanzania,9084
installer,55745,2145,DWE,17402
wpt_name,59400,37400,none,3563
basin,59400,9,Lake Victoria,10248
subvillage,59029,19287,Madukani,508
region,59400,21,Iringa,5294
lga,59400,125,Njombe,2503
ward,59400,2092,Igosi,307
public_meeting,56066,2,True,51011


In [10]:
#dtype and non-null count of every column in the merged training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

In [0]:
#remember which columns are int / float so their dtypes can be restored after imputation
int_cols = list(train.select_dtypes(include=['int']).columns)
float_cols = list(train.select_dtypes(include=['float']).columns)

## Pre-processing

In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

#pre_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'),
#                        ce.BinaryEncoder())

In [0]:
#hold out 20% of the labelled rows for validation, stratified on the target:

target = 'status_group'
y = train[target]
X = train.drop(columns=[target])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [0]:
#categorical NaNs, we'll fill NaNs w/ the mode for that feature
#(the imputer is fit on X_train only and then applied to X_val):
cols = X_train.columns
imp = SimpleImputer(strategy='most_frequent')

#keep the original row index: rebuilding the frame without it would reset the
#index to 0..n-1 while y_train / y_val keep the shuffled labels from the split,
#so any index-aligned pandas operation between X and y would pair the wrong rows.
X_train = pd.DataFrame(imp.fit_transform(X_train), columns=cols, index=X_train.index)
X_val = pd.DataFrame(imp.transform(X_val), columns=cols, index=X_val.index)

#restore dtypes of the numeric columns in a single cast per frame:
numeric_dtypes = {**dict.fromkeys(int_cols, 'int'),
                  **dict.fromkeys(float_cols, 'float')}
X_train = X_train.astype(numeric_dtypes)
X_val = X_val.astype(numeric_dtypes)
  
#convert date to datetime, split into day/month/year, and drop the dt col:
def date_recorded_wrangle(X):
  """Replace `date_recorded` with separate month / year / day columns.

  Parameters
  ----------
  X : pandas.DataFrame
      Frame containing a `date_recorded` column that `pd.to_datetime` can parse.

  Returns
  -------
  pandas.DataFrame
      A copy of `X` without `date_recorded` and with `date_recorded_month`,
      `date_recorded_year` and `date_recorded_day` appended in that order.
      The new columns are cast to object dtype. `X` itself is not modified.
  """
  X = X.copy()
  #infer_datetime_format is intentionally not passed: it is deprecated in
  #pandas 2.x (format inference is the default there) and only emits a warning.
  recorded = pd.to_datetime(X['date_recorded'])
  X['date_recorded_month'] = recorded.dt.month.astype('object')
  X['date_recorded_year'] = recorded.dt.year.astype('object')
  X['date_recorded_day'] = recorded.dt.day.astype('object')
  return X.drop(columns='date_recorded')
  
#apply the date split to both the training and the validation features
X_train, X_val = [date_recorded_wrangle(frame) for frame in (X_train, X_val)]

#25-30 secs

In [0]:
#Binary categorical encoding due to high cardinality:

#every object-dtype column (including the date parts) is treated as categorical
cat_cols = list(X_train.select_dtypes(include=['object']).columns)

binary_ce = ce.BinaryEncoder(cols=cat_cols, verbose=10, return_df=True)

#fit the encoder on the training features only, then reuse it for validation
X_train = binary_ce.fit_transform(X_train)
X_val = binary_ce.transform(X_val)

#10 secs

## Establish a baseline:

First, let's try Logistic Regression:

In [16]:
from sklearn.linear_model import LogisticRegression

#logistic regression baseline on the encoded features; the cell shows validation accuracy
logreg_params = dict(solver='lbfgs', multi_class='auto', n_jobs=-1,
                     max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params).fit(X_train, y_train)
model.score(X_val, y_val)

0.553956228956229

Hmm, not so great. Let's try a DecisionTreeClassifier instead:

In [17]:
from sklearn.tree import DecisionTreeClassifier

#depth-limited decision tree; the cell shows validation accuracy
tree = DecisionTreeClassifier(random_state=42, max_depth=15).fit(X_train, y_train)
tree.score(X_val, y_val)

0.7606060606060606

OK cool, 76% validation accuracy is a solid baseline. Let's generate predictions on the test data and submit them to the leaderboard:


In [0]:
#load the submission template and preview only the first rows: with
#display.max_rows raised to 500, showing the whole frame can flood the cell output
sample_submission = pd.read_csv(sample_submission_url)
sample_submission.head()

In [0]:
#Run the test features through the same already-fitted steps as the training data.

#1. impute NaNs with the imputer that was fit on X_train:
X_test = pd.DataFrame(imp.transform(test), columns=cols)

#2. restore dtypes of the numeric columns in a single cast:
X_test = X_test.astype({**dict.fromkeys(int_cols, 'int'),
                        **dict.fromkeys(float_cols, 'float')})

#3. wrangle the date_recorded col:
X_test = date_recorded_wrangle(X_test)

#4. encode the categorical cols with the encoder that was fit on X_train:
X_test = binary_ce.transform(X_test)

In [22]:
#check that X_test has exactly the same feature columns (names and order) as
#X_train - the row counts differ by design, so only the columns can match:
assert list(X_test.columns) == list(X_train.columns), 'train/test feature columns differ'
X_test.shape, X_train.shape

((14358, 204), (47520, 204))

In [0]:
#predict on X_test with the decision tree and write the predictions into a
#copy of the submission template:

y_pred = tree.predict(X_test)
submission = sample_submission.assign(status_group=y_pred)
submission.to_csv('baseline_DTC.csv', index=False)

In [0]:
#trigger a browser download when running on Google Colab. google.colab does
#not exist elsewhere, so guard the import to keep "Run All" working locally:
try:
  from google.colab import files
except ImportError:
  print('Not running on Colab - baseline_DTC.csv was written to the working directory.')
else:
  files.download('baseline_DTC.csv')

## RandomForest

In [87]:
from sklearn.ensemble import RandomForestClassifier

#100-tree random forest; the cell shows validation accuracy
forest = RandomForestClassifier(n_estimators=100, max_depth=30,
                                random_state=42, n_jobs=-1).fit(X_train, y_train)
forest.score(X_val, y_val)

0.8141414141414142

## XGBClassifier

In [91]:
import xgboost as xgb

#NOTE: the assignment below rebinds `xgb` from the xgboost module alias to the
#classifier instance, so the module is no longer reachable under that name
#after this cell. Later cells use `xgb` as the model.
xgb_params = dict(verbosity=2, n_jobs=-1, random_state=42, n_estimators=50, max_depth=15,
                  num_class=3, objective='multi:softmax', booster='gbtree')
xgb = xgb.XGBClassifier(**xgb_params)

xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='merror', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=15, maximize=False,
       min_child_weight=1, missing=None, n_estimators=50, n_jobs=-1,
       nrounds='min.error.idx', nthread=None, num_class=3,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, verbosity=2)

In [92]:
from sklearn.metrics import accuracy_score

#validation accuracy of the boosted model
xgb_y_val_pred = xgb.predict(X_val)
accuracy_score(y_true=y_val, y_pred=xgb_y_val_pred)

0.80496632996633

## Submission #2: random_forest

In [0]:
#predict on X_test with the random forest and write the predictions into a
#copy of the submission template:
forest_y_pred = forest.predict(X_test)
submission = sample_submission.assign(status_group=forest_y_pred)
submission.to_csv('baseline_RF.csv', index=False)

In [0]:
#trigger a browser download when running on Google Colab. google.colab does
#not exist elsewhere, so guard the import to keep "Run All" working locally:
try:
  from google.colab import files
except ImportError:
  print('Not running on Colab - baseline_RF.csv was written to the working directory.')
else:
  files.download('baseline_RF.csv')