In [11]:
import numpy as np
import pandas as pd

In [12]:
df2 = pd.read_csv('PreprocessedData.csv')

In [13]:
df2.shape

(32537, 36)

### X,y split

In [14]:
X = df2.iloc[:,0:35]
Y = df2.iloc[:,35]

print(X.shape,Y.shape)

(32537, 35) (32537,)


### Train Test split

In [15]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,random_state=42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(26029, 35) (6508, 35) (26029,) (6508,)


## Random Over Sampling

In [16]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(0.9)

X_train_os,Y_train_os = os.fit_resample(X_train,Y_train)
print(f"the number of classes before fit {Counter(Y_train)}")
print(f"the number of classes after fit {Counter(Y_train_os)}")



the number of classes before fit Counter({0: 19793, 1: 6236})
the number of classes after fit Counter({0: 19793, 1: 17813})


# ----------------------------------------------------------------------------------------------------------------

# Model Building

In [17]:
Model_summary = pd.DataFrame(columns=['Models', 'Accuracy_Score', '0_precision', '0_recall', '0_f1_score', '1_precision', '1_recall', '1_f1_score','AUC_score'])

In [18]:
def model_report(model,model_name, X_test, Y_test, y_pred):
    """
    Record model performace data and return it.
    
    parameters:
    -------------------------------
    model: The ML model in use
    model_name: Name of the algorithm
    X_test: The validation features
    Y_test: The gound truth data
    y_pred: The predicted data
    """
    temp = pd.DataFrame()
    
    temp['Models'] = [model_name]
    temp['Accuracy_Score'] = [accuracy_score(Y_test,y_pred)]
    temp['0_precision'] = [classification_report(Y_test,y_pred,output_dict=True)['0']['precision']]
    temp['0_recall'] = [classification_report(Y_test,y_pred,output_dict=True)['0']['recall']]
    temp['0_f1_score'] = [classification_report(Y_test,y_pred,output_dict=True)['0']['f1-score']]
    temp['1_precision'] = [classification_report(Y_test,y_pred,output_dict=True)['1']['precision']]
    temp['1_recall'] = [classification_report(Y_test,y_pred,output_dict=True)['1']['recall']]
    temp['1_f1_score'] = [classification_report(Y_test,y_pred,output_dict=True)['1']['f1-score']]
    temp['AUC_score'] = [roc_auc_score(Y_test,model.predict_proba(X_test)[:, 1])]
    
    return temp

## Logistic Regression

In [19]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()

X_scaler = scaler.fit_transform(X_train)
X_scaler_test = scaler.transform(X_test)

In [20]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression(max_iter=10000,class_weight='balanced')

In [21]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

LR_model.fit(X_scaler,Y_train)
y_pred = LR_model.predict(X_scaler_test)

print('Accuracy:',accuracy_score(Y_test,y_pred))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred))
print('classification report:\n',classification_report(Y_test,y_pred))
print('model predictive power: ',roc_auc_score(Y_test,LR_model.predict_proba(X_scaler_test)[:, 1]))

Accuracy: 0.78779963122311
confusion matrix:
 [[3788 1117]
 [ 264 1339]]
classification report:
               precision    recall  f1-score   support

           0       0.93      0.77      0.85      4905
           1       0.55      0.84      0.66      1603

    accuracy                           0.79      6508
   macro avg       0.74      0.80      0.75      6508
weighted avg       0.84      0.79      0.80      6508

model predictive power:  0.8840395461364172


In [22]:
## Logging model report

temp = model_report(LR_model,'Logistic Regression', X_scaler_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

### Over Sample X_train and Y_train

In [23]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()

X_scaler_os = scaler.fit_transform(X_train_os)
X_scaler_test = scaler.transform(X_test)

In [24]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

LR_model.fit(X_scaler_os,Y_train_os)
y_pred_os = LR_model.predict(X_scaler_test)

print('Accuracy:',accuracy_score(Y_test,y_pred_os))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred_os))
print('classification report:\n',classification_report(Y_test,y_pred_os))
print('model predictive power: ',roc_auc_score(Y_test,LR_model.predict_proba(X_scaler_test)[:, 1]))

Accuracy: 0.785341118623233
confusion matrix:
 [[3773 1132]
 [ 265 1338]]
classification report:
               precision    recall  f1-score   support

           0       0.93      0.77      0.84      4905
           1       0.54      0.83      0.66      1603

    accuracy                           0.79      6508
   macro avg       0.74      0.80      0.75      6508
weighted avg       0.84      0.79      0.80      6508

model predictive power:  0.8838836203525119


In [25]:
## Logging model report

temp = model_report(LR_model,'Logistic Regression(os)', X_test, Y_test, y_pred_os)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

In [26]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_iter':(10000,20000,30000,40000,50000,100000)}

model = LogisticRegression()
clf = GridSearchCV(model,parameters,cv=5,verbose=1,n_jobs=-1)
clf.fit(X_scaler_os,Y_train_os)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'max_iter': (10000, 20000, 30000, 40000, 50000,
                                      100000)},
             verbose=1)

In [27]:
clf.best_params_

{'max_iter': 10000}

In [28]:
clf.best_score_

0.7966548154222506

In [29]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

bag_clf = BaggingClassifier(LogisticRegression(max_iter=10000),max_features = 0.1, n_estimators=500, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_scaler_os, Y_train_os)
Y_pred = bag_clf.predict(X_scaler_test)

print(accuracy_score(Y_test, Y_pred))

0.8080823601720959


In [30]:
## Logging model report

temp = model_report(bag_clf,'Bagging Classifier(Logistic Regression)', X_scaler_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

## Decision Tree

In [31]:
from sklearn.tree import DecisionTreeClassifier

DT_model = DecisionTreeClassifier(max_depth=8, min_samples_split=62)

In [32]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

DT_model.fit(X_train,Y_train)
y_pred = DT_model.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred))
print('classification report:\n',classification_report(Y_test,y_pred))
print('model predictive power: ',roc_auc_score(Y_test,DT_model.predict_proba(X_test)[:, 1]))

Accuracy: 0.8299016594960049
confusion matrix:
 [[4439  466]
 [ 641  962]]
classification report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.89      4905
           1       0.67      0.60      0.63      1603

    accuracy                           0.83      6508
   macro avg       0.77      0.75      0.76      6508
weighted avg       0.82      0.83      0.83      6508

model predictive power:  0.8754924348650561


In [33]:
## Logging model report

temp = model_report(DT_model,'Decision Tree', X_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

### Over Sample X_train and Y_train

In [34]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

DT_model.fit(X_train_os,Y_train_os)
y_pred_os = DT_model.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred_os))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred_os))
print('classification report:\n',classification_report(Y_test,y_pred_os))
print('model predictive power: ',roc_auc_score(Y_test,DT_model.predict_proba(X_test)[:, 1]))

Accuracy: 0.7679778733866011
confusion matrix:
 [[3658 1247]
 [ 263 1340]]
classification report:
               precision    recall  f1-score   support

           0       0.93      0.75      0.83      4905
           1       0.52      0.84      0.64      1603

    accuracy                           0.77      6508
   macro avg       0.73      0.79      0.73      6508
weighted avg       0.83      0.77      0.78      6508

model predictive power:  0.8728771168737517


In [35]:
## Logging model report

temp = model_report(DT_model,'Decision Tree(OS)', X_test, Y_test, y_pred_os)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[5,6,7,8,9,10],
              'min_samples_split':list(range(2,101,10))
}

DT_clf = GridSearchCV(DecisionTreeClassifier(),parameters,verbose=1,n_jobs=-1,cv=5)
DT_clf.fit(X,Y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 12, 22, 32, 42, 52, 62, 72,
                                               82, 92]},
             verbose=1)

In [37]:
DT_clf.best_estimator_

DecisionTreeClassifier(max_depth=8, min_samples_split=52)

In [38]:
DT_clf.best_score_

0.8296096520894853

In [39]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=8, min_samples_split=62,random_state=42),max_features = 0.1, n_estimators=500, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, Y_train)
Y_pred = bag_clf.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.7536877688998156


In [40]:
## Logging model report

temp = model_report(bag_clf,'Bagging Classifier(Decision Tree)', X_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

## Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, min_samples_split=10, n_jobs=-1, random_state=42,max_features='sqrt')

rnd_clf.fit(X_train, Y_train)

RandomForestClassifier(max_features='sqrt', min_samples_split=10,
                       n_estimators=500, n_jobs=-1, random_state=42)

In [42]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

rnd_clf.fit(X_train, Y_train)
y_pred = rnd_clf.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred))
print('classification report:\n',classification_report(Y_test,y_pred))
print('model predictive power: ',roc_auc_score(Y_test,rnd_clf.predict_proba(X_test)[:, 1]))

Accuracy: 0.8369698832206515
confusion matrix:
 [[4536  369]
 [ 692  911]]
classification report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.90      4905
           1       0.71      0.57      0.63      1603

    accuracy                           0.84      6508
   macro avg       0.79      0.75      0.76      6508
weighted avg       0.83      0.84      0.83      6508

model predictive power:  0.8860197781555099


In [43]:
## Logging model report

temp = model_report(rnd_clf,'Random Forest', X_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

### Over Sample X_train and Y_train

In [44]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

rnd_clf.fit(X_train_os, Y_train_os)
y_pred_os = rnd_clf.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred_os))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred_os))
print('classification report:\n',classification_report(Y_test,y_pred_os))
print('model predictive power: ',roc_auc_score(Y_test,rnd_clf.predict_proba(X_test)[:, 1]))

Accuracy: 0.8200676090964967
confusion matrix:
 [[4191  714]
 [ 457 1146]]
classification report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88      4905
           1       0.62      0.71      0.66      1603

    accuracy                           0.82      6508
   macro avg       0.76      0.78      0.77      6508
weighted avg       0.83      0.82      0.82      6508

model predictive power:  0.8818813857554293


In [45]:
## Logging model report

temp = model_report(rnd_clf,'Random Forest(OS)', X_test, Y_test, y_pred_os)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

In [46]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'min_samples_split':[2,4,6,8,10,12,14],
              'max_features':['auto', 'sqrt', 'log2'],
              'max_leaf_nodes':[2,4,6,8,10,12,14,16,18,20]
             }

rs_RF = RandomizedSearchCV(RandomForestClassifier(n_estimators=100),parameters,cv=5,n_iter=50,verbose=1,n_jobs=-1)
rs_RF.fit(X_train_os,Y_train_os)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'max_leaf_nodes': [2, 4, 6, 8, 10, 12,
                                                           14, 16, 18, 20],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14]},
                   verbose=1)

In [47]:
rs_RF.best_estimator_

RandomForestClassifier(max_leaf_nodes=20, min_samples_split=6)

In [48]:
rs_RF.best_score_

0.7991543981734596

## Gradient Boosting

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

Grad_clf = GradientBoostingClassifier(max_depth=5,random_state=42)

In [50]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

Grad_clf.fit(X_train, Y_train)
y_pred = rnd_clf.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred))
print('classification report:\n',classification_report(Y_test,y_pred))
print('model predictive power: ',roc_auc_score(Y_test,Grad_clf.predict_proba(X_test)[:, 1]))

Accuracy: 0.8200676090964967
confusion matrix:
 [[4191  714]
 [ 457 1146]]
classification report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88      4905
           1       0.62      0.71      0.66      1603

    accuracy                           0.82      6508
   macro avg       0.76      0.78      0.77      6508
weighted avg       0.83      0.82      0.82      6508

model predictive power:  0.8919581213359508


In [51]:
## Logging model report

temp = model_report(Grad_clf,'Gradient Boosting', X_test, Y_test, y_pred)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

### Over Sample X_train and Y_train

In [52]:
# from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score

Grad_clf.fit(X_train_os, Y_train_os)
y_pred_os = Grad_clf.predict(X_test)

print('Accuracy:',accuracy_score(Y_test,y_pred_os))
print('confusion matrix:\n',confusion_matrix(Y_test,y_pred_os))
print('classification report:\n',classification_report(Y_test,y_pred_os))
print('model predictive power: ',roc_auc_score(Y_test,Grad_clf.predict_proba(X_test)[:, 1]))

Accuracy: 0.8016287645974186
confusion matrix:
 [[3886 1019]
 [ 272 1331]]
classification report:
               precision    recall  f1-score   support

           0       0.93      0.79      0.86      4905
           1       0.57      0.83      0.67      1603

    accuracy                           0.80      6508
   macro avg       0.75      0.81      0.77      6508
weighted avg       0.84      0.80      0.81      6508

model predictive power:  0.8926863685126576


In [53]:
## Logging model report

temp = model_report(Grad_clf,'Gradient Boosting(OS)', X_test, Y_test, y_pred_os)
Model_summary = pd.concat([Model_summary,temp],ignore_index=True)

In [54]:
Model_summary

Unnamed: 0,Models,Accuracy_Score,0_precision,0_recall,0_f1_score,1_precision,1_recall,1_f1_score,AUC_score
0,Logistic Regression,0.7878,0.934847,0.772273,0.845819,0.545195,0.835309,0.659768,0.88404
1,Logistic Regression(os),0.785341,0.934373,0.769215,0.843788,0.5417,0.834685,0.65701,0.503874
2,Bagging Classifier(Logistic Regression),0.7878,0.934847,0.772273,0.845819,0.545195,0.835309,0.659768,0.869475
3,Decision Tree,0.829902,0.873819,0.904995,0.889134,0.673669,0.600125,0.634774,0.875492
4,Decision Tree(OS),0.767978,0.932925,0.74577,0.828915,0.517974,0.835933,0.639618,0.872877
5,Bagging Classifier(Decision Tree),0.829902,0.873819,0.904995,0.889134,0.673669,0.600125,0.634774,0.878336
6,Random Forest,0.83697,0.867636,0.924771,0.895293,0.711719,0.568309,0.631981,0.88602
7,Random Forest(OS),0.820068,0.901678,0.854434,0.877421,0.616129,0.71491,0.661854,0.881881
8,Gradient Boosting,0.820068,0.901678,0.854434,0.877421,0.616129,0.71491,0.661854,0.891958
9,Gradient Boosting(OS),0.801629,0.934584,0.792253,0.857553,0.566383,0.830318,0.673413,0.892686


# Model Selection

Random Forest with oversampling and Gradient Boosting works better on average for both precision and recall of both classes compared to other models. As Gradient Boosting is a complex model than Random Forest, So I decided to go with Random Forest.