In [26]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [2]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')

In [3]:
# Columns removed from the feature frame before modelling.
# Every name is listed exactly once.
# NOTE(review): 'waterpoint_type_group' used to appear twice; the second entry
# may have been meant to be a different column — confirm.
drop_cols = [
    'subvillage',
    'region',
    'lga',
    'ward',
    'extraction_type_group',
    'extraction_type_class',
    'source_type',
    'waterpoint_type_group',
    'scheme_name',
    'payment',
    'quantity_group',
    'recorded_by',
    'num_private',
    'id',
]

In [4]:
# Join features and labels on the shared `id` key rather than relying on both
# CSVs listing their rows in the same order. `validate='one_to_one'` raises if
# an id is repeated on either side. `drop_cols` contains 'id', so the key
# column is removed after the join.
df = X.merge(y, on='id', validate='one_to_one').drop(columns=drop_cols)

In [5]:
df.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,region_code,...,management,management_group,payment_type,water_quality,quality_group,quantity,source,source_class,waterpoint_type,status_group
0,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,Lake Nyasa,11,...,vwc,user-group,annually,soft,good,enough,spring,groundwater,communal standpipe,functional
1,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,20,...,wug,user-group,never pay,soft,good,insufficient,rainwater harvesting,surface,communal standpipe,functional
2,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,Pangani,21,...,vwc,user-group,per bucket,soft,good,enough,dam,surface,communal standpipe multiple,functional
3,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,90,...,vwc,user-group,never pay,soft,good,dry,machine dbh,groundwater,communal standpipe multiple,non functional
4,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,Lake Victoria,18,...,other,other,never pay,soft,good,seasonal,rainwater harvesting,surface,communal standpipe,functional


In [28]:
predictors = df.drop('status_group', axis = 1)
# Encode the class labels as integers, but keep them in a Series that shares
# `df`'s index. LabelEncoder returns a bare NumPy array; wrapping it keeps the
# split targets row-aligned with the predictors and lets later cells use
# pandas methods (`.value_counts()`, label-based selection) on them.
target = pd.Series(LabelEncoder().fit_transform(df['status_group']), index=df.index)


#### 0 = `functional`, 1 = `functional needs repair`, 2 = `non functional` 

In [29]:
pd.Series(target).value_counts(1)

0    0.543081
2    0.384242
1    0.072677
dtype: float64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=42)

### Using `Pipeline` and `FunctionTransformer`

We'll start off with a simple pipeline

In [31]:
def grab_numeric(predictors):
    """Return only the columns of `predictors` whose dtype is float or int."""
    numeric_kinds = ['float', 'int']
    numeric_only = predictors.select_dtypes(include=numeric_kinds)
    return numeric_only

In [32]:
# The FunctionTransformer will turn my function
# into a transformer.

GrabNumeric = FunctionTransformer(grab_numeric)

In [33]:
pipe = Pipeline(steps=[('num', GrabNumeric),
                       ('ss', StandardScaler())])

In [34]:
pipe.fit(X_train)

Pipeline(steps=[('num',
                 FunctionTransformer(func=<function grab_numeric at 0x12957b790>)),
                ('ss', StandardScaler())])

In [35]:
pipe.transform(X_train)

array([[-0.09307351, -0.48786917,  0.61641117, ..., -0.47999785,
        -0.20973197,  0.7029537 ],
       [-0.09914958, -0.96404191, -5.22177795, ..., -0.47999785,
        -0.37956688, -1.37144446],
       [-0.09914958, -0.96404191, -0.12041266, ..., -0.17040775,
        -0.37956688, -1.37144446],
       ...,
       [-0.09914958, -0.98280023,  0.74666629, ..., -0.06721105,
         1.74336955,  0.73766726],
       [-0.09914958, -0.96404191,  0.03337799, ...,  0.03598565,
        -0.37956688, -1.37144446],
       [-0.09914958,  0.89736062,  0.54023468, ..., -0.47999785,
        -0.37744395,  0.7071614 ]])

## Using `Pipeline` and `ColumnTransformer`

In [36]:
# We'll throw these mini-pipelines into our ColumnTransformer.

# Numeric branch: fill missing values with SimpleImputer's default strategy,
# then standardize with StandardScaler.
subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer()),
                           ('ss', StandardScaler())])
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps `transform` from failing
# on categories that were not present during `fit`.
# NOTE(review): sparse=False produces a dense array; with free-text columns in
# the categorical set the encoded matrix may be very wide — confirm memory use.
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [40]:
# Split the predictor column names by dtype: float/int columns in one list,
# object columns in the other, then show both lists.
num_features = predictors.select_dtypes(include=['float', 'int']).columns.tolist()
cat_features = predictors.select_dtypes(include=['object']).columns.tolist()
print('num_features:', num_features, ' ', 'cat_features:', cat_features, sep='\n')

num_features:
['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'construction_year']
 
cat_features:
['date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'management_group', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'source', 'source_class', 'waterpoint_type']


In [41]:
# remainder='passthrough' tells the ColumnTransformer to leave any columns
# not listed in num_features or cat_features unchanged.

CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, num_features),
                                         ('subpipe_cat', subpipe_cat, cat_features)],
                           remainder='passthrough')

In [42]:
# The `ColumnTransformer` will take care of our preprocessing,
# so now we can add our model at the end of the pipeline.

logreg_model_pipe = Pipeline(steps=[('ct', CT),
                            ('logreg', LogisticRegression(random_state=42))])

In [None]:
logreg_model_pipe.fit(X_train, y_train)

In [None]:
logreg_model_pipe.score(X_train, y_train)

In [None]:
#### Modeling Class

In [None]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the estimator with its data and optionally cross-validate at once.

        Args:
          model:
            Estimator (or pipeline) handed to `cross_val_score`
          model_name:
            Label used in the printed summary and the plot title
          X:
            Default predictors to cross-validate on
          y:
            Default target to cross-validate on
          cv_now:
            Optional; when True (default) run `cross_validate()` immediately
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`).

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''

        # Compare against None explicitly: the truth value of a DataFrame or
        # NumPy array is ambiguous and `X if X else ...` raises ValueError.
        cv_X = X if X is not None else self.X
        cv_y = y if y is not None else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)


    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)


    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.

        Draws a violin plot of the scores with the individual fold scores
        overlaid as a swarm plot, and returns the same Axis.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
log_pipe = ModelWithCV(logreg_model_pipe, model_name='log_pipe', X=X_train, y=y_train)

In [None]:
fig, ax = plt.subplots()

log_pipe.plot_cv(ax=ax);

## Random Forest

In [None]:
rfc = RandomForestClassifier(random_state=42)

rfc_model_pipe = Pipeline([('ct', CT), ('rfc', rfc)])

In [None]:
rfc_model_pipe.fit(X_train, y_train)

In [None]:
rfc_model_pipe.score(X_train, y_train)

In [None]:
forest_pipe = ModelWithCV(model=rfc_model_pipe,
                          model_name='forest_pipe',
                          X=X_train,
                          y=y_train)

In [None]:
fig, ax = plt.subplots()

forest_pipe.plot_cv(ax=ax);

In [None]:
## Gradient Booster

In [None]:
gbc_model_pipe = Pipeline([('ct', CT), ('gbc', GradientBoostingClassifier(random_state=42))])

In [None]:
gbc_model_pipe.fit(X_train, y_train)

In [None]:
gbc_model_pipe.score(X_train, y_train)

In [None]:
boost_pipe = ModelWithCV(model=gbc_model_pipe,
                         model_name='boost_pipe',
                         X=X_train,
                         y=y_train)

In [None]:
fig, ax = plt.subplots()

boost_pipe.plot_cv(ax=ax);

## Tuning and Cross-Validating

In [None]:
params = {
    'rfc__criterion' : ['gini', 'entropy'],
    'rfc__min_samples_leaf' : [1, 5, 10]
}


gs = GridSearchCV(estimator=rfc_model_pipe,
                 param_grid=params,
                 cv=10)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

In [None]:
# Mean test score for each of six models

gs.cv_results_['mean_test_score']

## `imblearn` Pipelines

#### Dealing with Target Imbalance

In [None]:
y_train.value_counts()

In [None]:
sm = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
def _minority_counts_at_80pct(y):
    """Return {class: target_count} lifting each non-majority class to 0.8 x the majority count.

    Classes that already exceed that size keep their current count, because an
    over-sampler cannot be asked for fewer samples than a class already has.
    """
    counts = pd.Series(y).value_counts()
    majority = counts.max()
    floor = int(0.8 * majority)
    return {label: int(max(count, floor))
            for label, count in counts.items() if count < majority}


# A float `sampling_strategy` is only accepted for binary targets. This target
# has three classes, so the 0.8 ratio is expressed as a callable that returns
# the desired per-class sample counts.
sm2 = SMOTE(sampling_strategy=_minority_counts_at_80pct, random_state=42)

In [None]:
# Numeric-only view of the training predictors with incomplete rows removed.
X_train_clean = X_train.select_dtypes(['float', 'int']).dropna()
# Pick the labels for the surviving rows using X_train_clean's index labels.
# NOTE(review): this only lines up if y_train is a pandas object sharing
# X_train's index; a plain NumPy array would be indexed by position — confirm.
y_train_clean = y_train[X_train_clean.index]

In [None]:
y_train_clean.value_counts()

In [None]:
# Even distribution

X_clean_resmp, y_clean_resmp = sm.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp.value_counts()

In [None]:
# Resample with sm2, which is configured for a 0.8 minority-to-majority ratio,
# then inspect the resulting class counts.

X_clean_resmp2, y_clean_resmp2 = sm2.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp2.value_counts()

In [None]:
imb_pipe = ImPipeline(steps=[('ct', CT),
                             ('sm', SMOTE(random_state=42)),
                            ('rfc', RandomForestClassifier(random_state=42))])

In [None]:
imb_pipe.fit(X_train, y_train)

In [None]:
imb_pipe.score(X_train, y_train)

In [None]:
## Gridsearching

In [None]:
parameters = {'rfc__criterion': ['gini', 'entropy'],
          'rfc__min_samples_leaf': [1, 5, 10],
          'sm__k_neighbors': [3, 5, 9]}

gs = GridSearchCV(estimator=imb_pipe,
                 param_grid=parameters,
                 cv=10)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

In [None]:
### Evaluation on Test Set

In [None]:
# NOTE(review): `imb_pipe` is the pipeline fitted earlier with its default
# settings; the grid search tunes its own copies, so the tuned winner would be
# `gs.best_estimator_` — confirm that evaluating the untuned pipeline is intended.
final_model = imb_pipe

In [None]:
plot_confusion_matrix(final_model, X_test, y_test);

In [None]:
y_hat = final_model.predict(X_test)

In [None]:
# The target has three classes, so recall/precision/F1 need an explicit
# averaging mode: their default (average='binary') raises on multiclass input.
# Macro averaging weights every class equally, so the rare
# "functional needs repair" class counts as much as the common ones.
test_accuracy = round(accuracy_score(y_test, y_hat), 2)
test_recall = round(recall_score(y_test, y_hat, average='macro'), 2)
test_precision = round(precision_score(y_test, y_hat, average='macro'), 2)
test_f1 = round(f1_score(y_test, y_hat, average='macro'), 2)

print(f"""
Our final model's accuracy on the test set is {test_accuracy}. \n
Our final model's recall on the test set is {test_recall} \n
Our final model's precision on the test set is {test_precision} \n
Our final model's f1-score on the test is {test_f1}.
""")