## 5. CRISP-DM: Modeling

In [1]:

# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import display
import re
import os
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from IPython.display import display, Markdown
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


from imblearn.over_sampling import SMOTE 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, confusion_matrix, recall_score, precision_score, roc_curve, f1_score

%matplotlib inline
%config InlineBackend.figure_format = 'retina' # Render high resolution images

# Read the four cleaned CSVs in one pass; the order of the paths below
# must match the order of the names being unpacked on the left.
train, test, weather, spray = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned_train.csv',
        '../data/cleaned_test.csv',
        '../data/cleaned_weather.csv',
        '../data/cleaned_spray.csv',
    )
)


### Model Preparation

In [2]:
# Target vector (y): the WnvPresent label column
y = train["WnvPresent"]

# Features matrix (X): every remaining column
X = train.drop(columns=["WnvPresent"])

# 70/30 hold-out split, stratified on y, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [3]:
def run_model(mod, mod_params=None, grid_search=False):
    """Fit one scaled model pipeline and record its evaluation metrics.

    Reads the notebook-level names ``models``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, and appends the metrics dict to ``gs_list``
    (grid search) or ``init_list`` (plain fit).

    Parameters
    ----------
    mod : str
        Key into ``models``. It is also used as the pipeline step name, so
        grid keys take the form ``'<mod>__<param>'``.
    mod_params : dict, optional
        Parameter grid handed to ``GridSearchCV``. ``None`` (the default)
        is treated as an empty grid.
    grid_search : bool, default False
        When True, tune with a 3-fold ROC-AUC grid search instead of a
        single fit.

    Returns
    -------
    Pipeline or GridSearchCV
        The fitted estimator that the metrics were computed from.
    """
    # Avoid a shared mutable default argument; an empty grid matches the
    # behaviour of the previous ``{}`` default.
    if mod_params is None:
        mod_params = {}

    pipe = Pipeline([
            ('ss', StandardScaler()),
            (mod, models[mod])
            ])

    if grid_search:
        # Wrap the pipeline in a grid search and fit it on the training split
        gs = GridSearchCV(pipe, param_grid=mod_params, cv=3, verbose=1, scoring='roc_auc', n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs
    else:
        pipe.fit(X_train, y_train)

    # Hard predictions for the threshold metrics, probabilities for AUC
    predictions = pipe.predict(X_test)
    y_test_pred_prob = pipe.predict_proba(X_test)[:, 1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:, 1]

    # 5-fold cross-validated AUC on the training split. When grid_search is
    # True, `pipe` is the GridSearchCV object, so the whole search is re-run
    # inside each fold (nested CV), which can be slow.
    auc_scores = cross_val_score(pipe, X_train, y_train, scoring='roc_auc', cv=5)

    train_auc = roc_auc_score(y_train, y_train_pred_prob)
    test_auc = roc_auc_score(y_test, y_test_pred_prob)

    # zero_division=0 keeps the score at 0 when the model predicts no
    # positives, without emitting an UndefinedMetricWarning.
    results = {
        'model': mod,
        'train_auc_cv': auc_scores.mean(),
        'f1': f1_score(y_test, predictions, zero_division=0),
        # share of ACTUAL positives that were predicted positive
        'recall': recall_score(y_test, predictions, zero_division=0),
        # share of PREDICTED positives that are actually positive
        'precision': precision_score(y_test, predictions, zero_division=0),
        'train_auc': train_auc,
        'test_auc': test_auc,
        # a large positive gap indicates overfitting to the training split
        'auc_diff': train_auc - test_auc,
    }

    if grid_search:
        gs_list.append(results)
        print('### BEST PARAMS ###')
        display(pipe.best_params_)
    else:
        init_list.append(results)

    return pipe

In [4]:
# Float snapshot of the training split, used to restore the globals that
# run_model() reads after every pass.
# NOTE(review): the float cast presumably stops SMOTE's interpolated values
# being cast back to integer dtypes - confirm before removing it.
X_train_copy = X_train.astype(float).copy()
y_train_copy = y_train.astype(float).copy()

# Resamplers keyed by method name; 'No' has no entry and leaves the data as-is
methods = {'SMOTE': SMOTE(random_state=42)}

# Run the models with and without SMOTE oversampling and compare the outcomes
for k in ['No', 'SMOTE']:
    print('\nMethod Used: {}'.format(k + ' sampling'), "-" * 100)

    print('\nClass Balance BEFORE')
    display(y_train.value_counts(normalize=True))
    print('Number of rows: {}'.format(y_train.shape[0]))

    try:
        if k == 'SMOTE':
            # NOTE(review): the whole training split is oversampled before
            # run_model() cross-validates on it, so synthetic points can end
            # up in validation folds. train_auc_cv for this pass is likely
            # optimistic (saved output: ~0.98 CV vs ~0.75 test for the tree).
            mthd = methods[k]
            X_train, y_train = mthd.fit_resample(X_train, y_train)

        print('\nClass Balance AFTER')
        display(y_train.value_counts(normalize=True))
        print('Number of rows: {}'.format(y_train.shape[0]),'\n')

        # Instantiate models
        # NOTE(review): the saved output of this cell also lists Random Forest
        # and Logistic Regression, which are not in this dict.
        models = {
                  'Decision Tree': DecisionTreeClassifier(random_state=42),
                }

        # Fresh result stores for this pass; run_model() appends to them
        init_list = []
        gs_list = []

        for m in models:
            run_model(m)
        result_df = pd.DataFrame(init_list).sort_values(by=["f1"], ascending=False).reset_index(drop=True)
        display(result_df)
    finally:
        # Always put the un-resampled split back, even if a model fails or
        # the run is interrupted. Otherwise the oversampled data would stay
        # in the globals and become the baseline on the next run of this cell.
        X_train = X_train_copy
        y_train = y_train_copy


Method Used: No sampling ----------------------------------------------------------------------------------------------------

Class Balance BEFORE


WnvPresent
0    0.947674
1    0.052326
Name: proportion, dtype: float64

Number of rows: 6727

Class Balance AFTER


WnvPresent
0    0.947674
1    0.052326
Name: proportion, dtype: float64

Number of rows: 6727 



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,train_auc_cv,f1,recall,precision,train_auc,test_auc,auc_diff
0,Random Forest,0.750193,0.151515,0.099338,0.319149,0.944724,0.775695,0.169029
1,Decision Tree,0.69322,0.125,0.07947,0.292683,0.945522,0.745958,0.199564
2,Logistic Regression,0.832422,0.0,0.0,0.0,0.843642,0.84958,-0.005938



Method Used: SMOTE sampling ----------------------------------------------------------------------------------------------------

Class Balance BEFORE


WnvPresent
0.0    0.947674
1.0    0.052326
Name: proportion, dtype: float64

Number of rows: 6727

Class Balance AFTER


WnvPresent
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64

Number of rows: 12750 



Unnamed: 0,model,train_auc_cv,f1,recall,precision,train_auc,test_auc,auc_diff
0,Decision Tree,0.976491,0.281609,0.324503,0.248731,0.993374,0.745284,0.24809
1,Random Forest,0.98304,0.276836,0.324503,0.241379,0.992983,0.771212,0.221771
2,Logistic Regression,0.858345,0.243062,0.84106,0.142058,0.860787,0.847755,0.013032


### 5.1 Logistic Regression

### 5.1 Model Selection

#### Random Forest

#### Decision Trees

#### Logistic Regression

### Decision Tree