In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedShuffleSplit

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Read Data

In [4]:
df = pd.read_csv('/content/drive/My Drive/Data Science/Capstone Project/Claim_Prediction_Modelling.csv')
df.shape

(595212, 59)

In [5]:
def get_top_feature(df, target_column, n):
    return df.corr().unstack().sort_values(ascending=False)[target_column][0:n]
get_top_feature(df, 'target', 20)

target           1.000000
ps_car_13        0.053899
ps_car_12        0.038790
ps_ind_17_bin    0.037053
ps_reg_02        0.034800
ps_ind_07_bin    0.034218
ps_car_04_cat    0.032900
ps_car_03_cat    0.032401
ps_reg_03        0.030888
ps_ind_05_cat    0.029165
ps_car_15        0.027667
ps_reg_01        0.022888
ps_car_05_cat    0.020754
ps_ind_01        0.018570
ps_car_01_cat    0.016256
ps_ind_08_bin    0.013147
ps_car_06_cat    0.011537
ps_ind_04_cat    0.009360
ps_ind_03        0.008360
ps_ind_12_bin    0.007810
dtype: float64

#### Split Train and Test Data

In [6]:
target_col = 'target'
feature_cols = [c for c in df.columns if c != target_col]
#feature_cols = ['ps_car_13','ps_car_12','ps_ind_17_bin','ps_reg_02','ps_reg_03','ps_ind_07_bin','ps_car_04_cat','ps_ind_05_cat','ps_car_03_cat',
#               'ps_car_15','ps_reg_01','ps_ind_01','ps_car_05_cat','ps_ind_08_bin','ps_car_01_cat','ps_ind_04_cat','ps_car_06_cat','ps_ind_03','ps_ind_12_bin']

scaler = StandardScaler()

df[feature_cols] = scaler.fit_transform(df[feature_cols])


X =  df[feature_cols]
y = df[target_col]

# OverSample to negate Class Imbalance
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)



#### Create a Model Pipeline

In [0]:
def model_evaluation(model_type):
    model = model_type
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pp = model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print('Accuracy =',metrics.accuracy_score(y_test, y_pred))
    print('Precision =',metrics.precision_score(y_test, y_pred))
    print('Recall =',metrics.recall_score(y_test, y_pred))
    print('F1 =',metrics.f1_score(y_test, y_pred))

#### Logistic Regression

In [13]:
model_evaluation(model_type=LogisticRegression(solver='lbfgs', class_weight='auto', random_state=0))

[[89886 53789]
 [63906 79178]]
Accuracy = 0.5895682437168493
Precision = 0.5954710567283612
Recall = 0.5533672528025495
F1 = 0.5736476230841402


#### Random Forest

In [14]:
model_evaluation(model_type=RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0))

[[82266 61409]
 [60847 82237]]
Accuracy = 0.5736629016002985
Precision = 0.5724976678779778
Recall = 0.5747463028710408
F1 = 0.5736197816761414


#### Naive Bayes

In [15]:
model_evaluation(model_type=GaussianNB())

[[112708  30967]
 [ 91334  51750]]
Accuracy = 0.5735059754009464
Precision = 0.6256271383149776
Recall = 0.36167565905342314
F1 = 0.45836820917533583


#### AdaBoost

In [18]:
model_evaluation(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=10, random_state=0))

[[84016 59659]
 [59460 83624]]
Accuracy = 0.584602401319575
Precision = 0.5836282043229134
Recall = 0.584439909423835
F1 = 0.5840337748413749


#### XGBoost

In [12]:
model_evaluation(model_type=XGBClassifier(max_depth=10, learning_rate=1, n_estimators=10, random_state=0))

[[103085  40590]
 [ 34015 109069]]
Accuracy = 0.7398337977186418
Precision = 0.7287834343407346
Recall = 0.7622725112521316
F1 = 0.7451518909077246
