In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_csv('Loss_Ratio_Modelling.csv')
df.shape

(9134, 29)

In [0]:
from google.colab import files
uploaded = files.upload()

Saving Auto_Insurance_Claims_Modelling.csv to Auto_Insurance_Claims_Modelling (2).csv


In [0]:
from pandas.compat import StringIO
from io import StringIO

df = pd.read_csv(StringIO(uploaded['Auto_Insurance_Claims_Modelling.csv'].decode('utf-8')))
df.shape

(9134, 29)

In [0]:
target_col = 'Loss_Ratio_Classified'
feature_cols = ['EmploymentStatus','Gender','Income','Location Code',
                'Months Since Last Claim','Vehicle Class']

scaler = MinMaxScaler()

df[feature_cols] = scaler.fit_transform(df[feature_cols])

X =  df[feature_cols]
y = df[target_col]


from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)




In [0]:
def model_evaluation(model_type):
    model = model_type
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pp = model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print('Accuracy =',metrics.accuracy_score(y_test, y_pred))
    print('Precision =',metrics.precision_score(y_test, y_pred))
    print('Recall =',metrics.recall_score(y_test, y_pred))
    print('F1 =',metrics.f1_score(y_test, y_pred))

#### Logistic Regression

In [0]:
model_evaluation(model_type=LogisticRegression(solver='lbfgs', class_weight='auto'))

[[ 908  983]
 [ 783 1074]]
Accuracy = 0.5288153681963714
Precision = 0.5221195916383082
Recall = 0.5783521809369951
F1 = 0.5487991824220746


#### Random Forest

In [0]:
model_evaluation(model_type=RandomForestClassifier(n_estimators=10, max_depth=2, random_state=0))

[[ 869 1022]
 [ 715 1142]]
Accuracy = 0.5365528281750267
Precision = 0.5277264325323475
Recall = 0.6149703823371029
F1 = 0.568017905993534


#### SVM

In [0]:
model_evaluation(model_type=svm.SVC(gamma='scale', probability=True, random_state=0))

[[ 638 1253]
 [ 458 1399]]
Accuracy = 0.5434898612593383
Precision = 0.527526395173454
Recall = 0.7533656435110393
F1 = 0.6205367043690396


#### Naive Bayes

In [0]:
model_evaluation(model_type=GaussianNB())

[[ 869 1022]
 [ 785 1072]]
Accuracy = 0.5178762006403416
Precision = 0.5119388729703916
Recall = 0.5772751750134626
F1 = 0.5426474310301189


#### AdaBoost

In [0]:
model_evaluation(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=10, random_state=0))

[[ 715 1176]
 [ 555 1302]]
Accuracy = 0.5381536819637139
Precision = 0.5254237288135594
Recall = 0.7011308562197092
F1 = 0.6006920415224912


#### XGBoost

In [0]:
model_evaluation(model_type=XGBClassifier(max_depth=20, learning_rate=0.25, n_estimators=10, random_state=0))

[[1438  453]
 [ 204 1653]]
Accuracy = 0.8247065101387406
Precision = 0.7849002849002849
Recall = 0.8901453957996769
F1 = 0.834216502649508
