# Preperation

## Import Libraries and Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, classification_report, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
labels = pd.read_csv('data/traininglabels.csv')
data = pd.read_csv('data/trainingdata.csv')

wellsdf = pd.merge(labels, data, on='id')

In [3]:
wells3 = pd.read_csv('data/wells3.csv', index_col = 0)

#### Convert Appropriate Numeric Columns to Categorical

In [4]:
wells3[['region_code', 'district_code', 'construction_year', 'year_recorded']] = wells3[['region_code', 'district_code', 'construction_year', 'year_recorded']].astype('str')

### Create Train Test Split

In [5]:
X = wells3.drop(['status_group'], axis=1)
y = wells3['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=52)

## Feature Engineering

### Subpipes & Column Transformer

In [6]:
# Create pipelines to properly scale / encode different data types for use in column transformer

subpipe_num = Pipeline(steps=[('ss', StandardScaler())])

subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

#subpipe_ord = Pipeline(steps=[('ord', OrdinalEncoder())])

In [7]:
# Create list of features for each subpipe
cat_feat = X_train.select_dtypes(include=['object']).columns
num_feat = X_train.select_dtypes(include=['float', 'int64']).columns


In [8]:
ct = ColumnTransformer(transformers = [
    ('subpipe_num', subpipe_num, num_feat),
    ('subpipe_cat', subpipe_cat, cat_feat),
    #('subpipe_ord', subpipe_ord, ord_feat)
])

# Modeling
Since water is so vital, the Tanzanian government wants to focus on identifying wells that need work.  The modeling process will aim to minimize the number of wells that are predicted to be functional, but actually need work (false positives).  Therefore precision will be used as the primary scoring metric with secondary consideration for accuracy.  

### Dummy Classifier

In [9]:
# Using a pipeline to maintain consistency with later models, dummy strategy of most frequent to establlish a baseline

dummy_pipe = Pipeline(steps=[
    ('ct', ct),
    ('dum', DummyClassifier(strategy='most_frequent', random_state=52))
])

In [10]:
dummy_pipe.fit(X_train, y_train)


Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'region_code', 'district_code', 'lga',
       'construction_year', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'paymen

In [11]:
precision_score(y_train, dummy_pipe.predict(X_train))

0.54341189674523

Unsurprisingly, the precision score for the dummy classifier is not very good.

### First Simple Model

In [12]:
dct_pipe = Pipeline(steps=[
    ('ct',ct),
    ('dct', DecisionTreeClassifier(random_state=52))
])

In [13]:
dct_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'region_code', 'district_code', 'lga',
       'construction_year', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'paymen

In [14]:
precision_score(y_train, dct_pipe.predict(X_train))

0.9971036080767958

In [15]:
cross_val_score(dct_pipe, X_train, y_train, scoring='precision', error_score='raise')

array([0.79907815, 0.78912685, 0.79284098, 0.80175843, 0.79881154])

#### Evaluation
The untuned decision tree model performs significantly better than the baseline, but not particularly great precision. FSM was useful in identifying that some categoricals were causing it to run very slowly. Eliminated some features with further EDA and reran FSM.

### Second Model

In [16]:
lr_pipe = Pipeline(steps=[
    ('ct',ct),
    ('lr', LogisticRegression(random_state=52, max_iter=1000))
    ])

In [17]:
lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'region_code', 'district_code', 'lga',
       'construction_year', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'paymen

In [18]:
# defaults and max iter=1000
cross_val_score(lr_pipe, X_train, y_train, scoring='precision', error_score='raise')

array([0.75867214, 0.75      , 0.75591716, 0.76375345, 0.76860422])

#### GridSearch 
Untuned LogReg underperformed it's DCT counterpart, trying Gridsearch to tune LogReg hyperparameters

In [19]:
params = {}
params['lr__solver'] = ['newton-cg', 'lbfgs', 'saga']
params['lr__C'] = [.25, .5, 1, 2]

In [22]:
gs = GridSearchCV(estimator=lr_pipe, param_grid=params, cv=5, n_jobs=-2, scoring='precision')

In [23]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                    

In [24]:
gs.cv_results_['mean_test_score']

array([0.75754806, 0.75754806, 0.75760927, 0.75870345, 0.75871283,
       0.75862959, 0.75942644, 0.75945407, 0.75922824, 0.7600126 ,
       0.759852  , 0.75971259])

In [25]:
gs.best_params_

{'lr__C': 2, 'lr__solver': 'newton-cg'}

#### Follow Up GridsearchCV
Initial tuning had limited success.  Will try taking the best params and tweaking other hyperparameters

In [26]:
params = {}
params['lr__solver'] = ['newton-cg']
params['lr__C'] = [1, 2, 10]
params['lr__penalty'] = ['l1', 'l2', 'none']

In [27]:
gs = GridSearchCV(estimator=lr_pipe, param_grid=params, cv=5, n_jobs=-2, scoring='precision')

In [28]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                    

In [29]:
gs.cv_results_['mean_test_score']

array([       nan, 0.75942644, 0.76019279,        nan, 0.7600126 ,
       0.76019279,        nan, 0.76022019, 0.76019279])