In [1]:
import numpy as np
import pandas as pd

#loading the training and test data
insurance_data  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/travel_insurance/Training_set_label.csv" )
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/travel_insurance/Testing_set_label.csv')

In [2]:
#taking a look at the dataset
insurance_data.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Gender,Age,Claim
0,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,61,UNITED KINGDOM,19.8,11.88,,29,0
1,EPX,Travel Agency,Online,Cancellation Plan,93,NEW ZEALAND,63.0,0.0,,36,0
2,EPX,Travel Agency,Online,2 way Comprehensive Plan,22,UNITED STATES,22.0,0.0,,25,0
3,C2B,Airlines,Online,Silver Plan,14,SINGAPORE,54.5,13.63,M,24,0
4,EPX,Travel Agency,Online,Cancellation Plan,90,VIET NAM,10.0,0.0,,23,0


In [3]:
#also the variable types and information distribution
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48260 entries, 0 to 48259
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                48260 non-null  object 
 1   Agency Type           48260 non-null  object 
 2   Distribution Channel  48260 non-null  object 
 3   Product Name          48260 non-null  object 
 4   Duration              48260 non-null  int64  
 5   Destination           48260 non-null  object 
 6   Net Sales             48260 non-null  float64
 7   Commision (in value)  48260 non-null  float64
 8   Gender                13899 non-null  object 
 9   Age                   48260 non-null  int64  
 10  Claim                 48260 non-null  int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 4.1+ MB


In [4]:
#checking the NaN values across the columns
insurance_data.isna().sum()

Agency                      0
Agency Type                 0
Distribution Channel        0
Product Name                0
Duration                    0
Destination                 0
Net Sales                   0
Commision (in value)        0
Gender                  34361
Age                         0
Claim                       0
dtype: int64

In [5]:
#checking the percentage of NaN values in the Gender column
round(insurance_data.Gender.isna().sum()/insurance_data.shape[0],2)

0.71

In [6]:
insurance_data.Gender.fillna('Not Specified',inplace=True)
insurance_data.Gender

0        Not Specified
1        Not Specified
2        Not Specified
3                    M
4        Not Specified
             ...      
48255    Not Specified
48256                F
48257    Not Specified
48258    Not Specified
48259    Not Specified
Name: Gender, Length: 48260, dtype: object

In [7]:
insurance_data.drop(columns=['Distribution Channel', 'Destination','Agency Type'],inplace=True)
insurance_data

Unnamed: 0,Agency,Product Name,Duration,Net Sales,Commision (in value),Gender,Age,Claim
0,CWT,Rental Vehicle Excess Insurance,61,19.8,11.88,Not Specified,29,0
1,EPX,Cancellation Plan,93,63.0,0.00,Not Specified,36,0
2,EPX,2 way Comprehensive Plan,22,22.0,0.00,Not Specified,25,0
3,C2B,Silver Plan,14,54.5,13.63,M,24,0
4,EPX,Cancellation Plan,90,10.0,0.00,Not Specified,23,0
...,...,...,...,...,...,...,...,...
48255,EPX,2 way Comprehensive Plan,39,44.0,0.00,Not Specified,36,0
48256,C2B,Bronze Plan,20,60.0,15.00,F,30,0
48257,CWT,Rental Vehicle Excess Insurance,19,29.7,17.82,Not Specified,43,0
48258,EPX,2 way Comprehensive Plan,7,20.0,0.00,Not Specified,61,0


In [8]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(columns=col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df;

dummy_na = False
dummy_cols =['Agency', 'Product Name','Gender']

#dummifying the columns above
insurance_data = create_dummy_df(insurance_data,dummy_cols,dummy_na)
insurance_data

Unnamed: 0,Duration,Net Sales,Commision (in value),Age,Claim,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,Agency_CSR,...,Product Name_Single Trip Travel Protect Gold,Product Name_Single Trip Travel Protect Platinum,Product Name_Single Trip Travel Protect Silver,Product Name_Spouse or Parents Comprehensive Plan,Product Name_Ticket Protector,Product Name_Travel Cruise Protect,Product Name_Travel Cruise Protect Family,Product Name_Value Plan,Gender_M,Gender_Not Specified
0,61,19.8,11.88,29,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,93,63.0,0.00,36,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,22,22.0,0.00,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,14,54.5,13.63,24,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,90,10.0,0.00,23,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48255,39,44.0,0.00,36,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
48256,20,60.0,15.00,30,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48257,19,29.7,17.82,43,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
48258,7,20.0,0.00,61,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
from sklearn.model_selection import train_test_split


y = insurance_data['Claim']
X = insurance_data.drop(['Claim'], axis = 1)


train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.30, random_state=101)

In [11]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=25,sampling_strategy=1)
train_X_smote, train_y_smote = sm.fit_resample(train_X, train_y)

In [12]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train_X_smote, train_y_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

y_pred_smote_test = clf.predict(test_X)


# evaluate predictions
accuracy_smote_train = accuracy_score(train_y_smote, clf.predict(train_X_smote))
f1_score_smote_train = f1_score(train_y_smote, clf.predict(train_X_smote))


accuracy_smote_test = accuracy_score(test_y, y_pred_smote_test)
f1_score_smote_test = f1_score(test_y, y_pred_smote_test)

print("Train: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_train * 100.0))
print("Train: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_train * 100.00))

print("Test: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_test * 100.0))
print("Test: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_test * 100.00))

Train: Accuracy Logistic Regression: 80.09%
Train: F1 Score Logistic Regression: 79.57
Test: Accuracy Logistic Regression: 82.56%
Test: F1 Score Logistic Regression: 7.07


In [16]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(random_state=1).fit(train_X_smote, train_y_smote)


y_pred_smote_test = DTC.predict(test_X)


# evaluate predictions
accuracy_smote_train = accuracy_score(train_y_smote, DTC.predict(train_X_smote))
f1_score_smote_train = f1_score(train_y_smote, DTC.predict(train_X_smote))


accuracy_smote_test = accuracy_score(test_y, y_pred_smote_test)
f1_score_smote_test = f1_score(test_y, y_pred_smote_test)

print("Train: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_train * 100.0))
print("Train: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_train * 100.00))

print("Test: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_test * 100.0))
print("Test: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_test * 100.00))

Train: Accuracy Logistic Regression: 99.31%
Train: F1 Score Logistic Regression: 99.31
Test: Accuracy Logistic Regression: 95.04%
Test: F1 Score Logistic Regression: 8.42


In [18]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=1).fit(train_X_smote, train_y_smote)


y_pred_smote_test = RFC.predict(test_X)


# evaluate predictions
accuracy_smote_train = accuracy_score(train_y_smote, RFC.predict(train_X_smote))
f1_score_smote_train = f1_score(train_y_smote, RFC.predict(train_X_smote))


accuracy_smote_test = accuracy_score(test_y, y_pred_smote_test)
f1_score_smote_test = f1_score(test_y, y_pred_smote_test)

print("Train: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_train * 100.0))
print("Train: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_train * 100.00))

print("Test: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_test * 100.0))
print("Test: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_test * 100.00))

Train: Accuracy Logistic Regression: 99.31%
Train: F1 Score Logistic Regression: 99.31
Test: Accuracy Logistic Regression: 95.53%
Test: F1 Score Logistic Regression: 7.44


In [22]:
from sklearn.model_selection import GridSearchCV
    

parameters = {'bootstrap': [True],
'max_depth': [10, 20],
'min_samples_leaf': [3, 4],
'min_samples_split': [4, 6],
'n_estimators': [100, 200],
}

grid_search_1 = GridSearchCV(RandomForestClassifier(random_state=1), 
                   param_grid=parameters,
                   cv = 3,
                   n_jobs = -1,
                   verbose = 2)
grid_search_1.fit(train_X_smote, train_y_smote)
grid_search_1.best_params_

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.2min finished


{'bootstrap': True,
 'max_depth': 20,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 200}

In [23]:
parameters = {'max_leaf_nodes': [5, 10, 15, 20],
    'min_samples_split': [4, 6, 8, 10],
}

grid_search_2 = GridSearchCV(DecisionTreeClassifier(random_state=1), 
                   param_grid=parameters,
                   cv = 3,
                   n_jobs = -1,
                   verbose = 2)
grid_search_2.fit(train_X_smote, train_y_smote)
grid_search_2.best_params_

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    1.9s finished


{'max_leaf_nodes': 20, 'min_samples_split': 4}

In [24]:
model_grid1 = grid_search_1.best_estimator_
model_grid2 = grid_search_2.best_estimator_

In [25]:
y_pred_grid1 = model_grid1.predict(test_X)
y_pred_grid2 = model_grid2.predict(test_X)

# evaluate predictions

f1_score_grid1= f1_score(test_y, y_pred_grid1)
f1_score_grid2= f1_score(test_y, y_pred_grid2)


print("Grid1: F1 Score: %.2f%%" % (f1_score_grid1 * 100.00))

print("Grid2: F1 Score: %.2f%%" % (f1_score_grid2 * 100.00))

Grid1: F1 Score: 10.84%
Grid2: F1 Score: 11.81%


In [26]:
from sklearn.feature_selection import RFE

rfe = RFE(estimator= RandomForestClassifier(random_state=1) , step = 1) 

fit = rfe.fit(train_X_smote, train_y_smote)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 23
Selected Features: [ True  True  True  True False  True False False False  True  True False
  True False  True False  True  True False  True False False  True False
 False False  True  True  True False  True False False False  True  True
 False False False False False  True False  True  True  True]
Feature Ranking: [ 1  1  1  1  3  1 15 11 17  1  1  8  1  2  1  6  1  1 20  1 12 19  1 22
 14 13  1  1  1 23  1  4 16 10  1  1  9 18  5 21  7  1 24  1  1  1]


In [28]:
selected_rfe_features = pd.DataFrame({'Feature':list(train_X_smote.columns),
                                      'Ranking':rfe.ranking_})
selected_rfe_features.sort_values(by='Ranking')

Unnamed: 0,Feature,Ranking
0,Duration,1
43,Product Name_Value Plan,1
41,Product Name_Travel Cruise Protect,1
35,Product Name_Silver Plan,1
34,Product Name_Rental Vehicle Excess Insurance,1
30,Product Name_Comprehensive Plan,1
28,Product Name_Cancellation Plan,1
27,Product Name_Bronze Plan,1
26,Product Name_Basic Plan,1
44,Gender_M,1


In [34]:
# Transforming the data
X_train_rfe = rfe.transform(train_X_smote)
X_test_rfe = rfe.transform(test_X)

# Fitting our baseline model with the transformed data
rfe_model = RandomForestClassifier(random_state=1).fit(X_train_rfe, train_y_smote)

y_pred_rfe = rfe_model.predict(X_test_rfe)


accuracy_smote_test = accuracy_score(test_y, y_pred_rfe)
f1_score_smote_test = f1_score(test_y, y_pred_rfe)



print("Test: Accuracy Logistic Regression: %.2f%%" % (accuracy_smote_test * 100.0))
print("Test: F1 Score Logistic Regression: %.2f%%" % (f1_score_smote_test * 100.00))

Test: Accuracy Logistic Regression: 95.28%
Test: F1 Score Logistic Regression: 7.57%
