In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Exploratory Data Analysis

In [3]:
train = pd.read_csv('../input/2021-uofmn-travelers-modeling-competition/train_2021.csv')

In [4]:
train.shape

In [5]:
train['fraud'].value_counts()
#We know that the class is heavily skewed, the fraud rate is 15.6% as stated by Travelers.

In [6]:
train.isna().sum()

In [7]:
#Dropping data with missing value because there's so little of them
train.dropna(inplace=True)

In [8]:
train.isna().sum()

In [9]:
train.info()

In [10]:
#Make sure the datatype are appropriate
train['witness_present_ind']=train['witness_present_ind'].astype(int)
train['marital_status']=train['marital_status'].astype(int)
train['claim_number']=train['claim_number'].astype(str)
train['fraud']=train['fraud'].astype(str)
train['annual_income']=train['annual_income'].astype(float)
train['zip_code']=train['zip_code'].astype(str)
train['gender'] = train['gender'].astype('category')
train['living_status'] = train['living_status'].astype('category')
train['accident_site'] = train['accident_site'].astype('category')
train['channel'] = train['channel'].astype('category')
train['vehicle_category'] = train['vehicle_category'].astype('category')

In [11]:
train.info()

In [12]:
#Check for correlation between variables
plt.figure(figsize = (18, 12))

# Correlation is only defined for numeric columns. The frame still holds
# object/category columns at this point, and pandas >= 2.0 raises on them
# instead of silently skipping them, so select the numeric columns explicitly.
corr = train.select_dtypes(include = ['number', 'bool']).corr()
# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype = bool))

sns.heatmap(data = corr, mask = mask, annot = True, fmt = '.2g', linewidth = 1)
plt.show()
#Observation: age_of_driver is highly correlated with annual_income

In [13]:
#Boxplots for outlier inspection need numeric data, so keep only int64/float64 columns
numeric_dtypes = ['int64', 'float64']
Z = train.select_dtypes(include = numeric_dtypes)

In [14]:
#Coefficient of variation: standard deviation relative to the mean, per column.
#(variance / mean is the index of dispersion, which is not unit-free.)
Z.std()/Z.mean()
#Features with a large spread will be dealt with later by scaling

In [15]:
#Boxplot to check for outlier and data range
N_ROWS, N_COLS = 5, 5
plt.figure(figsize = (20, 15))

# Draw at most as many panels as the grid can hold; asking plt.subplot for a
# position beyond N_ROWS * N_COLS raises ValueError.
for plotnumber, col in enumerate(Z.columns[:N_ROWS * N_COLS], start = 1):
    ax = plt.subplot(N_ROWS, N_COLS, plotnumber)
    # Pass the series as x= so the box is horizontal and matches the x label
    # regardless of how seaborn interprets a positional argument.
    sns.boxplot(x = Z[col], ax = ax)
    ax.set_xlabel(col, fontsize = 15)

plt.tight_layout()
plt.show()
#There are indeed outliers in the dataset

In [16]:
percentile = train['annual_income'].quantile(0.995)
train = train[train['annual_income'] < percentile]
train = train[train['annual_income'] > 25000]
train.shape

In [17]:
train = train[train['past_num_of_claims'] < 5]
train.shape

In [18]:
percentile = train['claim_est_payout'].quantile(0.995)
train = train[train['claim_est_payout'] < percentile]
train.shape

In [19]:
percentile = train['vehicle_price'].quantile(0.995)
train = train[train['vehicle_price'] < percentile]
train.shape

In [20]:
percentile = train['vehicle_weight'].quantile(0.995)
train = train[train['vehicle_weight'] < percentile]
train.shape

In [21]:
#Dropping the variable which we assumed unneeded in prediction.
#To improve algorithm running time, we decide to drop zip_code and claim_date, but will implement them in our R modelling
to_drop = ['claim_number','zip_code','claim_date','claim_day_of_week','vehicle_color']
train.drop(to_drop, inplace = True, axis = 1)

In [22]:
#We decided to drop age_of_driver as it is highly correlated with annual_income
#age_of_driver has some extremely high outlier e.g. as high as 200
train.drop(columns = ['age_of_driver'], inplace = True, axis = 1)

In [23]:
train.info()

# Data Preprocessing

In [24]:
#Separate predictors and response
X = train.drop('fraud', axis = 1)
y = train['fraud']

In [25]:
#Extracting categorical columns
cat_train = X.select_dtypes(include = ['object','category'])

In [26]:
cat_train.head()

In [27]:
for col in cat_train.columns:
    print(f"{col}: \n{cat_train[col].unique()}\n")

In [28]:
#So that machine can calculate numerical distance between data
cat_train = pd.get_dummies(cat_train, drop_first = True)

In [29]:
cat_train.head()

In [30]:
# extracting the numerical columns
num_train = X.select_dtypes(include = [('int64'),('float64')])

In [31]:
num_train.head()

In [32]:
# combining the Numerical and Categorical dataframes to get the final dataset

X = pd.concat([num_train, cat_train], axis = 1)

In [33]:
X.head()

In [34]:
X.info()

In [35]:
#Split data into train to model and test to evaluate
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The target is imbalanced, so the split
# is stratified on y to give both partitions the same fraud rate; an unstratified
# split lets the minority share drift between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.30, random_state = 123, stratify = y
)
# Shapes, in the order: X_train, y_train, X_test, y_test
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [36]:
#Scaling data by standardization to deal with high variance

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, then apply that same
# transformation to both partitions.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [37]:
# example of random oversampling to balance the class distribution
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
# Seeded random over-sampler targeting the minority class
oversample = RandomOverSampler(sampling_strategy='minority',random_state=123)
# Only the training partition is resampled; the held-out partition is left untouched
resampled = oversample.fit_resample(X_train, y_train)
X_train, y_train = resampled
# Class counts after resampling
print(Counter(y_train))

# Model 1 - Logistic Regression Model

In [38]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [39]:
# model predictions

y_pred = log_reg.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,f1_score
#F1 score
print(f1_score(y_train, log_reg.predict(X_train),pos_label='1'))
log_reg_f1 = f1_score(y_test, log_reg.predict(X_test),pos_label='1')
print(log_reg_f1)

In [41]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [42]:
# classification report

print(classification_report(y_test, y_pred))

# Model 2 - K-Nearest Neighbors

In [43]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [44]:
# model predictions 

y_pred = knn.predict(X_test)

In [45]:
#F1 score
print(f1_score(y_train, knn.predict(X_train),pos_label='1'))
knn_f1 = f1_score(y_test, knn.predict(X_test),pos_label='1')
print(knn_f1)

In [46]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [47]:
# classification report

print(classification_report(y_test, y_pred))

# Model 3 - Support Vector Machine

In [48]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#svc = SVC()
#parameters = {
#    'gamma' : [0.001],
#    'C' : [0.5]
#}

#grid_search = GridSearchCV(svc, parameters)
#grid_search.fit(X_train, y_train)

#parameters = {
#    'gamma' : [0.0001, 0.001, 0.01, 0.1],
#    'C' : [0.01, 0.05, 0.5, 0.1, 1, 10, 15, 20]
#}
# Remove to run faster after knowing the best parameter

In [49]:
# best parameters

# grid_search.best_params_

In [50]:
# best accuracy 

# grid_search.best_score_

In [51]:
svc = SVC(C = 0.5, gamma = 0.001)
svc.fit(X_train, y_train)

In [52]:
# model predictions 

y_pred = svc.predict(X_test)

In [53]:
#F1 score
print(f1_score(y_train, svc.predict(X_train),pos_label='1'))
svc_f1 = f1_score(y_test, svc.predict(X_test),pos_label='1')
print(svc_f1)

In [54]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [55]:
# classification report

print(classification_report(y_test, y_pred))

# Model 4 - Stochastic Gradient Descent

In [56]:
from sklearn.linear_model import SGDClassifier

#sgd = SGDClassifier()
#parameters = {
#    'alpha' : [0.01],
#    'loss' : ['hinge'],
#    'penalty' : ['l2']
#}

#grid_search = GridSearchCV(sgd, parameters, cv = 10, n_jobs = -1)
#grid_search.fit(X_train, y_train)

#parameters = {
#    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
#    'loss' : ['hinge', 'log'],
#    'penalty' : ['l1', 'l2']
#}
# Remove to run faster after knowing the best parameter

In [57]:
# best parameter 

# grid_search.best_params_

In [58]:
sgd = SGDClassifier(alpha = 0.05, loss = 'hinge', penalty = 'l2')
sgd.fit(X_train, y_train)

In [59]:
# model predictions 

y_pred = sgd.predict(X_test)

In [60]:
#F1 score
print(f1_score(y_train, sgd.predict(X_train),pos_label='1'))
sgd_f1 = f1_score(y_test, sgd.predict(X_test),pos_label='1')
print(sgd_f1)

In [61]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [62]:
# classification report

print(classification_report(y_test, y_pred))

# Model 5 - Decision Tree

In [63]:
from sklearn.tree import DecisionTreeClassifier

#dtc = DecisionTreeClassifier()

#parameters = {
#    'criterion' : ['gini'],
#    'max_depth' : range(2, 4, 1),
#    'min_samples_leaf' : range(1, 4, 1),
#    'min_samples_split' : range(2, 6, 1),
#    'splitter' : ['best']
#}

#grid_search_dt = GridSearchCV(dtc, parameters, cv = 5, n_jobs = -1, verbose = 1)
#grid_search_dt.fit(X_train, y_train)
#parameters = {
#'criterion' : ['gini', 'entropy'],
#    'max_depth' : range(2, 4, 1),
#    'min_samples_leaf' : range(1, 4, 1),
#    'min_samples_split' : range(2, 6, 1),
#    'splitter' : ['best', 'random']
#}
#Removed in running so that algorithm can run faster after knowing the best parameter

In [64]:
# best parameters

#grid_search_dt.best_params_

In [65]:
# best score

#grid_search_dt.best_score_

In [66]:
dtc = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, min_samples_leaf = 1, min_samples_split = 2, splitter = 'best')
dtc.fit(X_train, y_train)

In [67]:
y_pred = dtc.predict(X_test)

In [68]:
#F1 score
print(f1_score(y_train, dtc.predict(X_train),pos_label='1'))
dtc_f1 = f1_score(y_test, dtc.predict(X_test),pos_label='1')
print(dtc_f1)

In [69]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [70]:
# classification report

print(classification_report(y_test, y_pred))

# Model 6 - Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

rand_clf = RandomForestClassifier(criterion = 'gini', max_depth = 5, max_features = 'auto', min_samples_leaf = 3, min_samples_split = 3, n_estimators = 130)
rand_clf.fit(X_train, y_train)

In [72]:
y_pred = rand_clf.predict(X_test)

In [73]:
#F1 score
print(f1_score(y_train, rand_clf.predict(X_train),pos_label='1'))
rand_clf_f1 = f1_score(y_test, rand_clf.predict(X_test),pos_label='1')
print(rand_clf_f1)

In [74]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [75]:
# classification report

print(classification_report(y_test, y_pred))

# Model 7 - Voting Classifier

In [76]:
from sklearn.ensemble import VotingClassifier

classifiers = [('Logistic Regression', log_reg), ('K Nearest Neighbours', knn), ('Support Vector Classifier', svc),
               ('Decision Tree', dtc)]

vc = VotingClassifier(estimators = classifiers)

vc.fit(X_train, y_train)

In [77]:
y_pred = vc.predict(X_test)

In [78]:
#F1 score
print(f1_score(y_train, vc.predict(X_train),pos_label='1'))
vc_f1 = f1_score(y_test, vc.predict(X_test),pos_label='1')
print(vc_f1)

In [79]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [80]:
# classification report

print(classification_report(y_test, y_pred))

# Model 8 - Adaptive Boosting (AdaBoost)

In [81]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(base_estimator = log_reg)

ada = AdaBoostClassifier(log_reg, n_estimators = 200)
ada.fit(X_train, y_train)

In [82]:
y_pred = ada.predict(X_test)

In [83]:
#F1 score
print(f1_score(y_train, ada.predict(X_train),pos_label='1'))
ada_f1 = f1_score(y_test, ada.predict(X_test),pos_label='1')
print(ada_f1)

In [84]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [85]:
# classification report

print(classification_report(y_test, y_pred))

# Model 9 - Gradient Boosting

In [86]:
from sklearn.ensemble import GradientBoostingClassifier

#gbc = GradientBoostingClassifier()

#parameters = {
#    'loss': ['exponential'],
#    'learning_rate': [0.1],
#    'n_estimators': [150]
#}

#grid_search_gbc = GridSearchCV(gbc, parameters, cv = 5, n_jobs = -1, verbose = 1)
#grid_search_gbc.fit(X_train, y_train)

#parameters = {
#    'loss': ['deviance', 'exponential'],
#    'learning_rate': [0.001, 0.1, 1, 10],
#    'n_estimators': [100, 150, 180, 200]
#}

In [87]:
# best parameters 

#grid_search_gbc.best_params_

In [88]:
# best score

#grid_search_gbc.best_score_

In [89]:
#    'loss': ['deviance', 'exponential'],
#    'learning_rate': [0.001, 0.1, 1, 10],
#    'n_estimators': [100, 150, 180, 200]

In [90]:
gbc = GradientBoostingClassifier(learning_rate = 0.05, loss = 'deviance', n_estimators = 150)
gbc.fit(X_train, y_train)

In [91]:
y_pred = gbc.predict(X_test)

In [92]:
#F1 score
print(f1_score(y_train, gbc.predict(X_train),pos_label='1'))
gbc_f1 = f1_score(y_test, gbc.predict(X_test),pos_label='1')
print(gbc_f1)

In [93]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [94]:
# classification report

print(classification_report(y_test, y_pred))

# Model 10 - Stochastic Gradient Boosting

In [95]:
sgbc = GradientBoostingClassifier(max_depth=2, subsample=0.5, max_features=0.75, n_estimators=150, random_state=0)

sgbc.fit(X_train, y_train)

In [96]:
y_pred = sgbc.predict(X_test)

In [97]:
#F1 score
print(f1_score(y_train, sgbc.predict(X_train),pos_label='1'))
sgbc_f1 = f1_score(y_test, sgbc.predict(X_test),pos_label='1')
print(sgbc_f1)

In [98]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [99]:
# classification report

print(classification_report(y_test, y_pred))

# Model 11 - Extreme Gradient Boosting

In [100]:
from xgboost import XGBClassifier 

xgb = XGBClassifier(objective = 'binary:logistic', learning_rate = 0.25, max_depth = 3, n_estimators = 150)

xgb.fit(X_train, y_train)

In [101]:
y_pred = xgb.predict(X_test)

In [102]:
#F1 score
print(f1_score(y_train, xgb.predict(X_train),pos_label='1'))
xgb_f1 = f1_score(y_test, xgb.predict(X_test),pos_label='1')
print(xgb_f1)

In [103]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [104]:
# classification report

print(classification_report(y_test, y_pred))

# Model 12 - Gaussian Naive Bayes

In [105]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [106]:
print(f1_score(y_train, gnb.predict(X_train),pos_label='1'))
gnb_f1 = f1_score(y_test, gnb.predict(X_test),pos_label='1')
print(gnb_f1)

In [107]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [108]:
# classification report

print(classification_report(y_test, y_pred))

# F1 Scores of the 12 Classification Algorithms

In [109]:
# Held-out F1 score of every model, keyed by display name (insertion order is kept)
f1_by_model = {
    'Logistic Regression': log_reg_f1,
    'KNN': knn_f1,
    'SVC': svc_f1,
    'SGD Classifier': sgd_f1,
    'Decision Tree Classifier': dtc_f1,
    'Random Forest Classifier': rand_clf_f1,
    'Voting Classifier': vc_f1,
    'Ada Boost Classifier': ada_f1,
    'Gradient Boosting Classifier': gbc_f1,
    'Stochastic Gradient Boosting': sgbc_f1,
    'XgBoost': xgb_f1,
    'GaussianNB': gnb_f1,
}
models = pd.DataFrame({
    'Model': list(f1_by_model.keys()),
    'F1 Score': list(f1_by_model.values()),
})

# Best model first
models.sort_values(by = 'F1 Score', ascending = False)

# Predicting Test Set with Stochastic Gradient Boosting

In [110]:
test = pd.read_csv('../input/2021-uofmn-travelers-modeling-competition/test_2021.csv')

In [111]:
test['marital_status'] = test['marital_status'].fillna(test['marital_status'].mode()[0])
test['witness_present_ind'] = test['witness_present_ind'].fillna(test['witness_present_ind'].mode()[0])
test['claim_est_payout'] = test['claim_est_payout'].fillna(test['claim_est_payout'].mode()[0])
test['age_of_vehicle'] = test['age_of_vehicle'].fillna(test['age_of_vehicle'].mode()[0])

In [112]:
test['witness_present_ind']=test['witness_present_ind'].astype(int)
test['marital_status']=test['marital_status'].astype(int)
test['claim_number']=test['claim_number'].astype(str)
test['annual_income']=test['annual_income'].astype(float)
test['zip_code']=test['zip_code'].astype(str)
test['gender'] = test['gender'].astype('category')
test['living_status'] = test['living_status'].astype('category')
test['accident_site'] = test['accident_site'].astype('category')
test['channel'] = test['channel'].astype('category')
test['vehicle_category'] = test['vehicle_category'].astype('category')

In [113]:
test.info()

In [114]:
test.shape

In [115]:
to_drop = ['claim_number','zip_code','claim_date','claim_day_of_week','vehicle_color']

test.drop(to_drop, inplace = True, axis = 1)
#Dropping the variable which we assumed is pointless in prediction so we could run our algorithm faster.

In [116]:
#Check for multicollinearity in the test file
plt.figure(figsize = (18, 12))

# Correlation is only defined for numeric columns. The frame still holds category
# columns at this point, and pandas >= 2.0 raises on non-numeric columns instead of
# silently skipping them, so select the numeric columns explicitly.
corr = test.select_dtypes(include = ['number', 'bool']).corr()
# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype = bool))

sns.heatmap(data = corr, mask = mask, annot = True, fmt = '.2g', linewidth = 1)
plt.show()

In [117]:
test.drop(columns = ['age_of_driver'], inplace = True, axis = 1)

In [118]:
test.info()

In [119]:
cat_test=test.select_dtypes(include = ['object','category'])

In [120]:
cat_test.head()

In [121]:
for col in cat_test.columns:
    print(f"{col}: \n{cat_test[col].unique()}\n")

In [122]:
cat_test = pd.get_dummies(cat_test, drop_first = True)

In [123]:
cat_test.head()

In [124]:
num_test = test.select_dtypes(include = [('int64'),('float64')])

In [125]:
num_test.head()

In [126]:
test = pd.concat([num_test, cat_test], axis = 1)

In [127]:
test.head()

In [128]:
test.info()

In [129]:
# Put the test features into the same space the models were trained in.
# 1) Align the columns with the training design matrix X: get_dummies on the test
#    file can yield a different set or order of dummy columns. Levels absent from
#    the test file become all-zero columns; columns unknown to training are dropped.
test = test.reindex(columns = X.columns, fill_value = 0)
# 2) Reuse the scaler fitted on the training partition (sc). Fitting a fresh
#    StandardScaler on the test file would centre and scale it with different
#    statistics than the ones the models saw during training.
test = sc.transform(test)

In [130]:
test_pred = sgbc.predict(test)

In [131]:
test_pred = pd.DataFrame(test_pred, columns= ['fraud'])

In [132]:
df= test_pred[['fraud']]

In [133]:
df.head()

In [134]:
df.to_csv('Prediction - SGB.csv' , index=False)