In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from eli5.sklearn import PermutationImportance
import eli5

# show every column when a wide dataframe is displayed (None = no column limit)
pd.set_option('display.max_columns', None)

In [2]:
# silences every warning for the rest of the session
# NOTE(review): this also hides deprecation warnings from pandas / sklearn / xgboost

import warnings

warnings.simplefilter('ignore')

In [3]:
# all four csv files live in the same folder of the course repository
DATA_URL = 'https://raw.githubusercontent.com/SeanAntosiak/DS-Unit-2-Regression-Classification/master/data/tanzania/'

test0 = pd.read_csv(DATA_URL + 'test_features.csv')
train0 = pd.read_csv(DATA_URL + 'train_features.csv')
trainLabels = pd.read_csv(DATA_URL + 'train_labels.csv')
sample = pd.read_csv(DATA_URL + 'sample_submission.csv')

In [4]:
# peeks at the first row of the training features to see which columns exist

train0.head(1)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [5]:
# is quantity_group just a copy of quantity?
# selects the rows where the two columns disagree; an empty result
# means they are identical, so one of them could be dropped if needed

quantityDiffers = train0['quantity'] != train0['quantity_group']
train0.loc[quantityDiffers, ['quantity', 'quantity_group']].head()

Unnamed: 0,quantity,quantity_group


In [6]:
# is payment_type just a copy of payment?
# the strings differ, but each payment_type is only a shorter wording
# of the matching payment value, so one column may be removed if needed

paymentDiffers = train0['payment'] != train0['payment_type']
train0.loc[paymentDiffers, ['payment', 'payment_type']].head()

Unnamed: 0,payment,payment_type
0,pay annually,annually
2,pay per bucket,per bucket
5,pay per bucket,per bucket
11,pay when scheme fails,on failure
13,pay monthly,monthly


In [7]:
# counts how often each recorded_by value occurs; a column holding a single
# value on every row carries no information and might be removed

train0['recorded_by'].value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

In [8]:
# share of pumps in each status group.
# ~54.3% of pumps are 'functional', so a baseline model that always
# predicts 'functional' would have an accuracy of about 54.3%

statusGroup = trainLabels['status_group']
statusGroup.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [9]:
# one row per feature: its dtype and how many distinct values it takes

dfUnique0 = pd.DataFrame({
    'dtypes': train0.dtypes,
    'unique_values': train0.nunique(),
})
dfUnique0

Unnamed: 0,dtypes,unique_values
id,int64,59400
amount_tsh,float64,98
date_recorded,object,356
funder,object,1897
gps_height,int64,2428
installer,object,2145
longitude,float64,57516
latitude,float64,57517
wpt_name,object,37400
num_private,int64,65


In [10]:
# holds out 15% of the labelled data as a validation set and keeps 85% for
# training; random_state is fixed so the split is the same on every run

target = trainLabels['status_group']
Xtrain0, Xval0, ytrain0, yval0 = tts(
    train0,
    target,
    train_size=0.85,
    test_size=0.15,
    random_state=8,
)

In [11]:
# lists every column present in the training split

Xtrain0.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [12]:
# creates a function to wrangle an input dataframe and returns wrangled dataframe

def wrangle(DF):
    """Return a cleaned copy of a waterpoint feature dataframe.

    The input dataframe is not modified; every change is made on a copy.

    Steps
    -----
    1. ``date_recorded`` is parsed as a date and reduced to its year.
    2. Latitudes within 1e-6 of zero are set to exactly 0. The data marks
       unknown latitudes with a tiny value such as -2e-8 instead of a plain 0;
       a tolerance is used so the check does not hinge on exact float equality.
    3. For ``amount_tsh``, ``longitude``, ``latitude``, ``gps_height`` and
       ``construction_year`` every 0 is replaced with NaN, and a boolean
       ``<column>_missing`` indicator column is appended.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain ``date_recorded`` and the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The wrangled copy: the original columns plus five ``*_missing`` columns.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # work on a copy so the caller's dataframe stays untouched
    df = DF.copy()

    # converts date_recorded to a year
    # (no ``infer_datetime_format`` flag: it is deprecated in pandas 2.x and
    # not needed to parse the dates)
    df['date_recorded'] = pd.to_datetime(df['date_recorded']).dt.year

    # columns which appear to have missing values entered as 0
    zeroCols = ['amount_tsh', 'longitude', 'latitude', 'gps_height', 'construction_year']

    # near-zero latitudes are placeholders; set them to 0 so that the loop
    # below turns them into NaN (NaN latitudes compare False and are left alone)
    nearZeroLat = df['latitude'].abs() < 1e-6
    df['latitude'] = df['latitude'].mask(nearZeroLat, 0)

    # replaces the placeholder zeros with NaN and records where that happened
    for col in zeroCols:
        df[col] = df[col].replace(0, np.nan)
        df[col + '_missing'] = df[col].isnull()

    return df

In [33]:
# runs the training split, the validation split and the test features
# through the same wrangle function

Xtrain, Xval, Xtest = (wrangle(frame) for frame in (Xtrain0, Xval0, test0))

In [56]:
# creates a pipeline to transform the data (ordinal encoding -> median
# imputation -> min-max scaling) and then fits a gradient boosting model

pipe = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    MinMaxScaler(),
)

# the transformers are fitted on the training split only; the validation
# split is transformed with the already-fitted steps
XtrainTrans = pipe.fit_transform(Xtrain)
XvalTrans = pipe.transform(Xval)

xgbParams = dict(n_estimators=1000, random_state=8, n_jobs=-1)
mod = XGBClassifier(**xgbParams)
mod.fit(XtrainTrans, ytrain0)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=-1, nthread=None,
       objective='multi:softprob', random_state=8, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [59]:
# permutation importance of each feature for the already-fitted model,
# scored by accuracy on the transformed validation split

permuter = PermutationImportance(mod, scoring='accuracy', n_iter=2, random_state=8)

permuter.fit(XvalTrans, yval0)

PermutationImportance(cv='prefit',
           estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=-1, nthread=None,
       objective='multi:softprob', random_state=8, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
           n_iter=2, random_state=8, refit=True, scoring='accuracy')

In [60]:
# shows the importance of all features, labelled with the validation column names

featureNames = list(Xval.columns)
eli5.show_weights(permuter, top=None, feature_names=featureNames)

Weight,Feature
0.0973  ± 0.0013,quantity
0.0182  ± 0.0007,construction_year
0.0176  ± 0.0010,longitude
0.0113  ± 0.0038,waterpoint_type
0.0111  ± 0.0008,waterpoint_type_group
0.0110  ± 0.0013,latitude
0.0105  ± 0.0007,extraction_type_class
0.0105  ± 0.0033,payment
0.0104  ± 0.0036,source
0.0095  ± 0.0011,extraction_type


In [62]:
# end-to-end pipeline: a nested preprocessing pipeline (ordinal encoding ->
# median imputation -> min-max scaling) followed by the XGBoost classifier

preprocess2 = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    MinMaxScaler(),
)
booster2 = XGBClassifier(n_estimators=1000, random_state=8, n_jobs=-1)

pipe2 = make_pipeline(preprocess2, booster2)

In [63]:
# fits the full pipeline on the training split and reports its
# accuracy on the validation split (fit returns the pipeline itself)

pipe2.fit(Xtrain, ytrain0).score(Xval, yval0)

0.7836139169472502

In [None]:
# refits pipe2 on the entire labelled training set (no validation hold-out)
# and predicts the status group for the test features

trainFull = wrangle(train0)
testFull = wrangle(test0)

pipe2.fit(trainFull, trainLabels['status_group'])
pred4 = pipe2.predict(testFull)

In [None]:
# builds the submission dataframe: a copy of the sample submission with
# its status_group column set to the predictions
# NOTE(review): assumes the sample rows are in the same order as test0 - confirm

submission4 = sample.assign(status_group=pred4)

In [None]:
# writes the submission file for the challenge, leaving out the dataframe index

submissionPath = 'submission4.csv'
submission4.to_csv(submissionPath, index=False)