# Preparation

## Import Libraries and Data

In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, classification_report, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the training features and labels, then join them into one frame on the shared `id` key.
data = pd.read_csv('data/trainingdata.csv')
labels = pd.read_csv('data/traininglabels.csv')

wellsdf = labels.merge(data, on='id')

## Clean Data
Per the rationale in the EDA notebook

In [3]:
# Columns removed per the rationale documented in the EDA notebook.
dropped_columns = [
    'funder', 'installer', 'scheme_name', 'public_meeting', 'scheme_management',
    'id', 'amount_tsh', 'num_private', 'permit', 'subvillage',
]

# DataFrame.drop returns a new frame, so `wellsdf` itself is left untouched.
wells2 = wellsdf.drop(columns=dropped_columns)


In [4]:
# Summarise the remaining columns with their non-null counts and dtypes.
wells2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   status_group           59400 non-null  object 
 1   date_recorded          59400 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   longitude              59400 non-null  float64
 4   latitude               59400 non-null  float64
 5   wpt_name               59400 non-null  object 
 6   basin                  59400 non-null  object 
 7   region                 59400 non-null  object 
 8   region_code            59400 non-null  int64  
 9   district_code          59400 non-null  int64  
 10  lga                    59400 non-null  object 
 11  ward                   59400 non-null  object 
 12  population             59400 non-null  int64  
 13  recorded_by            59400 non-null  object 
 14  construction_year      59400 non-null  int64  
 15  ex

#### Simplify and convert date_recorded to year_recorded

In [5]:
# Keep only the calendar year of each record; the full date column is dropped afterwards.
recorded_on = pd.to_datetime(wells2['date_recorded'])
wells2['year_recorded'] = recorded_on.dt.year
wells2 = wells2.drop(columns='date_recorded')

##### Construction year
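Rows with a construction year of zero are treated as missing: compute the mean year over the remaining rows (rounded to a whole year) and replace the zeros in `wells2` with that value.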

In [6]:
# Construction year: a value of 0 is treated as missing, so those rows are
# filled with the rounded mean of the non-zero years.
# NOTE(review): the mean is taken over all of wells2, i.e. before the
# train/test split further down, so held-out rows contribute to the fill value.
con_year = wells2[wells2['construction_year'] != 0]
con_avg = con_year['construction_year'].mean().round(0)

# Assign the result back rather than calling replace(..., inplace=True) on the
# `wells2.construction_year` accessor: that chained in-place form only updates
# the frame when the accessed Series is a view of it, and is a silent no-op
# under pandas Copy-on-Write.
wells2['construction_year'] = wells2['construction_year'].replace(0, con_avg)

#### Convert Appropriate Numeric Columns to Categorical

In [7]:
# These columns hold codes or years rather than quantities, so cast them to
# strings to have them handled as categorical values.
code_like_columns = ['region_code', 'district_code', 'construction_year', 'year_recorded']
wells2[code_like_columns] = wells2[code_like_columns].astype('str')

#### Lat & Long
The default value for longitude (0) is outside of Tanzania (29°10' E to 40°29' E), so it will be replaced by the mean longitude from the dataset. While the default value for latitude is possible, it may skew our data north (the placeholder is -2e-08, effectively 0°), so it will be replaced with the mean latitude.

In [8]:
# Longitude 0 and latitude -2e-08 are treated as placeholder coordinates; each
# is replaced with the mean of the remaining values in its column.
# NOTE(review): both means are computed on all of wells2, before the
# train/test split further down, so held-out rows contribute to the fill values.
longitude = wells2[wells2['longitude'] != 0]
long_avg = longitude['longitude'].mean()
# Assign back instead of `wells2.longitude.replace(..., inplace=True)`: the
# chained in-place form is a silent no-op under pandas Copy-on-Write.
wells2['longitude'] = wells2['longitude'].replace(0, long_avg)

# The placeholder is matched by exact equality, as in the original cell.
latitude = wells2[wells2['latitude'] != -2.000000e-08]
lat_avg = latitude['latitude'].mean()
wells2['latitude'] = wells2['latitude'].replace(-2.000000e-08, lat_avg)

#### Convert target to binary
To meet the business understanding of wells that require an engineer's evaluation, "functional needs repair" and "non functional" will be combined into "needs work"

In [9]:
# Collapse the two not-fully-working classes into a single 'needs work' label.
needs_work_labels = {
    'functional needs repair': 'needs work',
    'non functional': 'needs work',
}
wells2['status_group'] = wells2['status_group'].replace(needs_work_labels)

In [10]:
# Encode the binary target: 1 = functional, 0 = needs work.
target_codes = {'functional': 1, 'needs work': 0}
wells2['status_group'] = wells2['status_group'].map(target_codes)

### Create Train Test Split

In [11]:
# Separate the target from the features.
y = wells2['status_group']
X = wells2.drop(columns='status_group')

# Hold out a test set with the default split size; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=52)

## Feature Engineering

### Subpipes & Column Transformer

In [12]:
# Sub-pipelines that scale / encode each data type for use in the column transformer.

# Numeric features: standardise.
subpipe_num = Pipeline(steps=[('ss', StandardScaler())])

# Categorical features: one-hot encode, ignoring categories not seen during fit.
# The encoder is left at its default sparse output instead of `sparse=False`:
# a dense one-hot matrix allocates one cell per row per category, which is the
# likely cause of the memory pressure noted in the model evaluation.
# Dropping the keyword also keeps the cell working on scikit-learn >= 1.4,
# where `sparse` was removed in favour of `sparse_output`.
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))])

#subpipe_ord = Pipeline(steps=[('ord', OrdinalEncoder())])

In [13]:
# Split the training columns by dtype so each group can be routed to its own sub-pipeline.
num_feat = X_train.select_dtypes(include=['float', 'int64']).columns
cat_feat = X_train.select_dtypes(include='object').columns


In [14]:
# Route each column group to its sub-pipeline: (name, transformer, columns).
column_steps = [
    ('subpipe_num', subpipe_num, num_feat),
    ('subpipe_cat', subpipe_cat, cat_feat),
]
ct = ColumnTransformer(transformers=column_steps)

# Modeling
Since water is so vital, the Tanzanian government wants to focus on identifying wells that need work.  The modeling process will aim to minimize the number of wells that are predicted to be functional, but actually need work (false positives).  Therefore precision will be used as the primary scoring metric with secondary consideration for accuracy.  

### Dummy Classifier

In [15]:
# Baseline: always predict the most frequent class. It is wrapped in the same
# pipeline shape as the later models to keep the comparison consistent.
baseline_steps = [
    ('ct', ct),
    ('dum', DummyClassifier(strategy='most_frequent', random_state=52)),
]
dummy_pipe = Pipeline(steps=baseline_steps)

In [16]:
# Fit the preprocessing and the baseline classifier on the training split.
dummy_pipe.fit(X_train, y_train)


Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['wpt_name', 'basin', 'region', 'region_code', 'district_code', 'lga',
       '...,
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_qualit

In [17]:
# Baseline precision on the training split (positive class is 1, i.e. functional).
precision_score(y_train, dummy_pipe.predict(X_train))

0.54341189674523

Unsurprisingly, the precision score for the dummy classifier is not very good.

### First Simple Model

In [18]:
# First real model: an untuned decision tree on top of the shared preprocessing.
dct_pipe = Pipeline([
    ('ct', ct),
    ('dct', DecisionTreeClassifier(random_state=52)),
])

In [19]:
# Fit the preprocessing and the decision tree on the training split.
dct_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['wpt_name', 'basin', 'region', 'region_code', 'district_code', 'lga',
       '..., 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_t

In [20]:
# Precision on the same data the tree was fitted on; compare with the
# cross-validated score below to gauge overfitting.
precision_score(y_train, dct_pipe.predict(X_train))

0.9999586845149562

In [22]:
# Cross-validated precision on the training split; error_score='raise' makes a
# failing fold raise instead of being scored as NaN.
cross_val_score(dct_pipe, X_train, y_train, scoring='precision', error_score='raise')

array([0.79899497, 0.79843562, 0.79498906, 0.80115676, 0.79644781])

#### Evaluation
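The untuned decision tree performs significantly better than the baseline, but its cross-validated precision (about 0.80) is not particularly good, and the near-perfect training precision (about 1.00) suggests it is overfitting. Fitting is also very memory intensive, so we may need to reduce the number of features.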

### Second Model

In [27]:
# Second model: logistic regression (max_iter set to 1000) on top of the
# shared preprocessing.
log_reg = LogisticRegression(max_iter=1000, random_state=52)
lr_pipe = Pipeline(steps=[('ct', ct), ('lr', log_reg)])

In [28]:
# Fit the preprocessing and the logistic regression on the training split.
lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'population'], dtype='object')),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['wpt_name', 'basin', 'region', 'region_code', 'district_code', 'lga',
       '...ion_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'wat

In [29]:
# Cross-validated precision on the training split; error_score='raise' makes a
# failing fold raise instead of being scored as NaN.
cross_val_score(lr_pipe, X_train, y_train, scoring='precision', error_score='raise')

array([0.78467153, 0.77428464, 0.7779879 , 0.78575477, 0.78854626])