In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedShuffleSplit

  from numpy.core.umath_tests import inner1d


#### Read Data

In [3]:
df = pd.read_csv('Claim_Prediction_Modelling.csv')
df.shape

(595212, 59)

In [0]:
def get_top_feature(df, target_column, n):
    return df.corr().unstack().sort_values(ascending=False)[target_column][0:n]
get_top_feature(df, 'target', 20)

target           1.000000
ps_car_13        0.053286
ps_car_12        0.037118
ps_ind_17_bin    0.035395
ps_reg_02        0.035061
ps_reg_03        0.033177
ps_ind_07_bin    0.032998
ps_car_04_cat    0.031879
ps_ind_05_cat    0.029722
ps_car_03_cat    0.027955
ps_car_15        0.026171
ps_reg_01        0.022609
ps_ind_01        0.020593
ps_car_05_cat    0.019276
ps_ind_08_bin    0.016403
ps_car_01_cat    0.016085
ps_ind_04_cat    0.012593
ps_car_06_cat    0.010526
ps_ind_03        0.009443
ps_ind_12_bin    0.008413
dtype: float64

#### Split Train and Test Data

In [0]:
target_col = 'target'
feature_cols = [c for c in df.columns if c != target_col]
#feature_cols = ['ps_car_13','ps_car_12','ps_ind_17_bin','ps_reg_02','ps_reg_03','ps_ind_07_bin','ps_car_04_cat','ps_ind_05_cat','ps_car_03_cat',
#               'ps_car_15','ps_reg_01','ps_ind_01','ps_car_05_cat','ps_ind_08_bin','ps_car_01_cat','ps_ind_04_cat','ps_car_06_cat','ps_ind_03','ps_ind_12_bin']

scaler = StandardScaler()

df[feature_cols] = scaler.fit_transform(df[feature_cols])


X =  df[feature_cols]
y = df[target_col]

# OverSample to negate Class Imbalance
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)

#### Create a Model Pipeline

In [0]:
def model_evaluation(model_type):
    model = model_type
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pp = model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print('Accuracy =',metrics.accuracy_score(y_test, y_pred))
    print('Precision =',metrics.precision_score(y_test, y_pred))
    print('Recall =',metrics.recall_score(y_test, y_pred))
    print('F1 =',metrics.f1_score(y_test, y_pred))

#### Logistic Regression

In [0]:
model_evaluation(model_type=LogisticRegression(solver='lbfgs', class_weight='auto', random_state=0))

[[38577 22387]
 [26937 33774]]
Accuracy = 0.5946250256831724
Precision = 0.6013781805879524
Recall = 0.5563077531254632
F1 = 0.5779656376206447


#### Random Forest

In [0]:
model_evaluation(model_type=RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0))

[[39401 21563]
 [29300 31411]]
Accuracy = 0.5819765769467845
Precision = 0.5929512591082418
Recall = 0.5173856467526478
F1 = 0.552597088446145


#### Naive Bayes

In [0]:
model_evaluation(model_type=GaussianNB())

[[49016 11948]
 [39891 20820]]
Accuracy = 0.5739552085473598
Precision = 0.6353759765625
Recall = 0.34293620595938135
F1 = 0.44544764064656234


#### AdaBoost

In [0]:
model_evaluation(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=10, random_state=0))

[[36927 24037]
 [25871 34840]]
Accuracy = 0.589825354427779
Precision = 0.5917421064252595
Recall = 0.5738663504142577
F1 = 0.582667157239857


#### XGBoost

In [70]:
model_evaluation(model_type=XGBClassifier(max_depth=10, learning_rate=0.25, n_estimators=10, random_state=0))

[[47614 13350]
 [11924 48787]]
Accuracy = 0.7922827203616191
Precision = 0.7851521637671597
Recall = 0.803594076855924
F1 = 0.7942660849179474
