In [17]:
!pip install imblearn

Collecting imblearnNote: you may need to restart the kernel to use updated packages.

  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [146]:
#import libraries
import pandas as pd
import numpy as np

#importing liabraries for graphical representation
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import matplotlib.patches as mpatches
from matplotlib import rcParams

#Packages for feature engineering
from sklearn.model_selection import train_test_split, StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from statsmodels.tools.eval_measures import rmse
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler,NearMiss

from sklearn.metrics import classification_report,f1_score

from imblearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')
 


In [147]:
df=pd.read_csv('data_travel_insurance.csv')

In [148]:
# Checking the distribution of the claims

(df.groupby('Claim').size()/df.shape[0]).round(4)

Claim
No     0.9847
Yes    0.0153
dtype: float64

In [149]:
df['Net Sales'] = np.abs(df['Net Sales'])
premium=df.copy()
cust_claim=df.copy()

In [150]:
# Identifikasi kemungkinan customer claim

def data_pre_processing(cust_claim):

    # Menghapus kolom Gender
    cust_claim.drop('Gender',axis =1, inplace=True)
    
    # Durasi yang memiliki nilai kurang dari nol akan diisi oleh angka median
    cust_claim['Duration'][cust_claim['Duration'] < 0] = cust_claim['Duration'].median()
    
    # Durasi untuk asuransi travelling sepantasnya tidak mencapai angka 4000 lebih, oleh karena itu kita akan mengambil 
    # maksimal 2 tahun dan menggangap tahun kedua adalah tahun kabisat
    cust_claim ['Duration'][cust_claim['Duration'] > 731] = 731
    
    #Mengubah angka 118 menggunakan angka rata-rata umur
    cust_claim['Age'][cust_claim['Age'] >100] = cust_claim['Age'].mean()
    
    cust_claim['Agency Type']=np.where(cust_claim['Agency Type']=='Airlines',1,0)
    cust_claim['Distribution Channel']=np.where(cust_claim['Distribution Channel']=='Online',1,0)
    cust_claim['Claim']=np.where(cust_claim['Claim']=='Yes',1,0)

In [151]:
data_pre_processing(cust_claim)

In [152]:
def feature_processing(encoding):
    # Converting all categorical columns into numeric using frequency encoding, label encoding and one-hot encoding.
    fe = encoding.groupby('Destination').size()/len(encoding)
    encoding.loc[:,'Dest_fe'] = encoding['Destination'].map(fe)
    fe_1 = encoding.groupby('Agency').size()/len(encoding)
    encoding.loc[:,'Agency_fe'] = encoding['Agency'].map(fe_1)
    fe_2 = encoding.groupby('Product Name').size()/len(encoding)    
    encoding.loc[:,'Product Name_fe'] = encoding['Product Name'].map(fe_2)
    encoding.drop(columns='Agency',axis=1,inplace=True)
    encoding.drop(columns='Destination',axis=1,inplace=True)
    encoding.drop(columns='Product Name',axis=1,inplace=True)

In [153]:
feature_processing(cust_claim)

In [158]:
X = cust_claim[['Agency Type','Duration','Net Sales','Commision (in value)']]
y = cust_claim['Claim']

In [159]:
X.shape

(44328, 4)

In [160]:
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y, 
    stratify=y,  
    train_size = 0.2,
    random_state = 2020
)

In [161]:
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval, 
    stratify=y_trainval,  
    test_size = 0.2,
    random_state = 2020
)

In [162]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

0.9797817443532696
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     34921
           1       0.04      0.01      0.02       542

    accuracy                           0.98     35463
   macro avg       0.51      0.50      0.51     35463
weighted avg       0.97      0.98      0.97     35463



In [163]:
logr = LogisticRegression()
logr.fit(X_train,y_train)
y_pred = logr.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9846600682401376
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     34921
           1       0.00      0.00      0.00       542

    accuracy                           0.98     35463
   macro avg       0.49      0.50      0.50     35463
weighted avg       0.97      0.98      0.98     35463



In [164]:
y_pred=logr.predict(X_val)
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1746
           1       0.00      0.00      0.00        27

    accuracy                           0.98      1773
   macro avg       0.49      0.50      0.50      1773
weighted avg       0.97      0.98      0.98      1773



# Penalized Logistic Regression(2)

In [167]:
model=LogisticRegression(class_weight='balanced')
skf=StratifiedKFold(n_splits=5)
model_cv=cross_val_score(rf, X_trainval, y_trainval,cv=skf, scoring='f1')

In [168]:
print(model_cv)
print(model_cv.mean())

[0. 0. 0. 0. 0.]
0.0


# Logistic Reggression with SMOTE

In [169]:
smote=SMOTE()
model=LogisticRegression()

pipe_model=Pipeline([
    ('balance',smote),
    ('cif',model)
])
skf=StratifiedKFold(n_splits=5)

In [172]:
model_smote_cv=cross_val_score(pipe_model,X_trainval,y_trainval, cv=skf, scoring='f1')

In [173]:
print(model_smote_cv)
print(model_smote_cv.mean())

[0.05970149 0.06185567 0.06219313 0.06101695 0.05638474]
0.0602303961535538


# Final Model Performance

In [174]:
model=LogisticRegression(class_weight='balanced')
model.fit(X_trainval,y_trainval)

In [175]:
y_pred=model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.70      0.82     34921
           1       0.04      0.72      0.07       542

    accuracy                           0.70     35463
   macro avg       0.51      0.71      0.45     35463
weighted avg       0.98      0.70      0.81     35463



# UnderSample

In [111]:
rus=RandomUnderSampler()
X_under,y_under=rus.fit_resample(X_train,y_train)
    

In [112]:
len(y_under)

216

In [113]:
len(y_train)

7092

In [114]:
model_rus=LogisticRegression()
model_rus.fit(X_under,y_under)

In [116]:
y_pred=model_rus.predict(X_val)
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.69      0.82      1746
           1       0.04      0.78      0.07        27

    accuracy                           0.69      1773
   macro avg       0.52      0.73      0.44      1773
weighted avg       0.98      0.69      0.80      1773



# Penalized Logistic Regression

In [118]:
model_blanced=LogisticRegression(class_weight='balanced')
model_blanced.fit(X_train,y_train)

In [119]:
y_pred=model_blanced.predict(X_val)
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84      1746
           1       0.04      0.78      0.08        27

    accuracy                           0.73      1773
   macro avg       0.52      0.75      0.46      1773
weighted avg       0.98      0.73      0.83      1773

