In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, recall_score
    accuracy_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
#import ourfunctions as func

In [2]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')

In [3]:
drop_cols = ['subvillage',
 'region',
 'lga',
 'ward',
 'extraction_type_group',
 'extraction_type_class',
 'source_type',
 'waterpoint_type_group',
 'scheme_name',
 'payment',
 'quantity_group',
 'waterpoint_type_group',
 'recorded_by',
 'num_private',
 'id']

In [4]:
df = pd.concat([X.drop(columns=drop_cols), y.drop(columns='id')], axis=1)

In [5]:
predictors = df.drop('status_group', axis = 1)
target = df['status_group']
target = LabelEncoder().fit_transform(target)


In [6]:
pd.Series(target).value_counts(1)

0    0.543081
2    0.384242
1    0.072677
dtype: float64

In [10]:
num_features = list(predictors.select_dtypes(include=['float', 'int']))
cat_features = list(predictors.select_dtypes(include=['object']))
print('num_features:')
print(num_features)
print(' ')
print('cat_features:')
print(cat_features)

num_features:
['amount_tsh', 'gps_height', 'longitude', 'latitude', 'region_code', 'district_code', 'population', 'construction_year']
 
cat_features:
['date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'management_group', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'source', 'source_class', 'waterpoint_type']


In [9]:
def model_info(model, X_test, y_test):
    ''' Model information is used to create a classification report, 
    confusion matrix, roc curve, and when needed a feature importance
    visualization. 
    
    Args:
       model: model used
       X_test: x test
       y_test: y test
       
    Returns:
        Classification Report
        Confusion Matrix
        ROC Curve
        Feature Importance
        
    Example: model_info(decision_tree, X_test, y_test)'''

    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print("Testing Accuracy for Classifier: {:.4}%".format(
    accuracy_score(y_test, predict) * 100))
    print('---' * 20)

    #classification report
    print('CLASSIFICATION REPORT')
    print('---' * 20)
    print(classification_report(y_test, predict, target_names
                                = ["Functional", "Functional, needs repair", 
                                   "Not functional"]))
    
    print('---' * 20)
    with plt.style.context('seaborn-notebook'):
        fig, axes = plt.subplots(figsize = (8,8), nrows = 2)
        sns.set_palette("Blues")

        #confusion matrix
        conf = plot_confusion_matrix(model, X_test, y_test, cmap = plt.cm.Blues, 
                                     display_labels = ["Functional",
                                                       "Functional, needs repair",
                                                       "Not functional"], ax=axes[0],
                                    normalize = 'true');
        conf.ax_.set_title('CONFUSION MATRIX');

        #roc curve need to use Yellow Brick
        visualizer = ROCAUC(model, classes=["Functional", 
                                               "Functional, needs repair", 
                                               "Not functional"], ax=axes[1])

        visualizer.fit(X_train, y_train)        
        visualizer.score(X_test, y_test) 
        axes[1].plot([0,1],[0,1],ls=':')
        axes[1].grid()
        axes[1].set_title('ROC CURVE')
        fig.tight_layout()
        visualizer.show()

        #adding feature importance 
        try:
            print('---' *20)
            #print('FEATURE IMPORTANCE')
            #feature importance
            feat_import = pd.Series(model.feature_importances_,index=X_train.columns)
            feat_import.sort_values().tail(20).plot(kind='barh').set_title('FEATURE IMPORTANCE')
        except:
            pass

In [11]:
encoded_cat = pd.get_dummies(df, columns = cat_features)
encoded_cat

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,status_group,date_recorded_2002-10-14,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,6000.0,1390,34.938093,-9.856322,11,5,109,1999,functional,0,...,1,0,0,0,1,0,0,0,0,0
1,0.0,1399,34.698766,-2.147466,20,2,280,2010,functional,0,...,0,1,0,0,1,0,0,0,0,0
2,25.0,686,37.460664,-3.821329,21,4,250,2009,functional,0,...,0,1,0,0,0,1,0,0,0,0
3,0.0,263,38.486161,-11.155298,90,63,58,1986,non functional,0,...,1,0,0,0,0,1,0,0,0,0
4,0.0,0,31.130847,-1.825359,18,1,0,0,functional,0,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,10.0,1210,37.169807,-3.253847,3,5,125,1999,functional,0,...,1,0,0,0,1,0,0,0,0,0
59396,4700.0,1212,35.249991,-9.070629,11,4,56,1996,functional,0,...,0,1,0,0,1,0,0,0,0,0
59397,0.0,0,34.017087,-8.750434,12,7,0,0,functional,0,...,1,0,0,0,0,0,0,1,0,0
59398,0.0,0,35.861315,-6.378573,1,4,0,0,functional,0,...,1,0,0,0,0,0,0,1,0,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(encoded_cat, target, random_state = 42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(44550, 41913) (44550,)
(14850, 41913) (14850,)


### Decision Tree

In [None]:
#instantiate vanilla decision tree
tree_clf = DecisionTreeClassifier()

#fit decision tree
tree_clf.fit(X_train, y_train)

In [None]:
#test set predictions
predict = tree_clf.predict(X_test)

#print conf matrix & class report
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

In [None]:
model_info(tree_clf, X_test, y_test)

In [None]:
mean_cv_score_tree_clf = cross_val_score(tree_clf_cv, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for Decision Tree: {mean_cv_score_tree_clf :.2%}")


#### GridSearchCV Decision Tree

In [None]:
#adjusted tree_clf many times for best results
tree_clf_gs = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [90, 100],
    'min_samples_split': [2, 3],
    'class_weight': ['balanced']
}

gs_tree = GridSearchCV(tree_clf_gs, param_grid, cv = 3)
gs_tree.fit(X_train, y_train)
print(f"Training Accuracy: {gs_tree.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {gs_tree.best_params_}")

In [None]:
#Instantiate Decision Tree Classifier
tree_clf_cv = DecisionTreeClassifier(criterion= 'entropy', max_depth= 100, 
                                     min_samples_split= 2, class_weight = 'balanced' )

In [None]:
model_info(tree_clf_cv, X_test, y_test)

In [None]:
mean_cv_score_tree_clf_cv = cross_val_score(tree_clf_cv, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for GridSearchCV Decision Tree: {mean_cv_score_tree_clf_cv :.2%}")


### Bagged Tree

In [None]:
#instantiate bagged tree classifier
bg_tree = BaggingClassifier(DecisionTreeClassifier(criterion= 'entropy', max_depth= 100, 
                                     min_samples_split= 2, class_weight = 'balanced' ))

In [None]:
#run model w/o feat importance
model_info(bg_tree, X_test, y_test)

In [None]:
mean_cv_score_bg_tree = cross_val_score(bg_tree, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for Bagged Tree: {mean_cv_score_bg_tree :.2%}")


### Naive Bayes

In [None]:
nb = GaussianNB()

In [None]:
model_info(nb, X_test, y_test)

In [None]:
mean_cv_score_nb = cross_val_score(nb, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for Bagged Tree: {mean_cv_score_nb :.2%}")


### Random Forest

In [None]:
#instantiate and fit rf
rf = RandomForestClassifier()

In [None]:
model_info(rf, X_test, y_test)

In [None]:
mean_cv_score_rf = cross_val_score(rf, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for Bagged Tree: {mean_cv_score_rf :.2%}")


#### GridSearchCV Random Forest

In [None]:
#set params to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [65, 70],
    'min_samples_split': [2, 3, 4],
    'class_weight': ['balanced']
}

gs_rf = GridSearchCV(rf, param_grid, cv = 3)
gs_rf.fit(X_train, y_train)
print(f"Training Accuracy: {gs_rf.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {gs_rf.best_params_}")

In [None]:
#instantiate rf classifier
rf1 = RandomForestClassifier(criterion = 'gini', max_depth= 65, 
                            min_samples_split= 3, class_weight= 'balanced')

In [None]:
model_info(rf1, X_test, y_test)

In [None]:
mean_cv_score_rf1 = cross_val_score(rf1, X_train, y_train, cv=3).mean()

print(f"Mean Cross Validation Score for Bagged Tree: {mean_cv_score_rf1 :.2%}")


In [None]:
### AdaBoost

In [None]:
ab = AdaBoostClassifier()

In [None]:
model_info(ab, X_test, y_test)

#### GridSearchCV Adaboost

In [None]:
#set params to search
param_grid = {
    'n_estimators': [50, 60, 70],
    'learning_rate': [.1, 1],
}

gs_adab = GridSearchCV(adab, param_grid, cv = 3)
gs_adab.fit(X_train, y_train)
print(f"Training Accuracy: {gs_adab.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {gs_adab.best_params_}")

In [None]:
#instantiate ab1 classifier
ab1 = AdaBoostClassifier(learning_rate =1, n_estimators = 70)

In [None]:
model_info(ab1, X_test, y_test)