In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


In [None]:
# Load the competition train/test CSVs.
# The data folder is defined once and can be overridden with the
# TRAVELERS_DATA_DIR environment variable, so the notebook is not tied to one
# machine; the fallback is the original local folder.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'TRAVELERS_DATA_DIR',
    'E:/prepare for graduate school/Python/Kaggel/Travelers Competition'))

train_df = pd.read_csv(DATA_DIR / 'uconn_comp_2018_train.csv')
test_df = pd.read_csv(DATA_DIR / 'uconn_comp_2018_test.csv')
# `combine` lets later cells apply the same transformation to both frames.
combine = [train_df, test_df]

#### See first 5 rows and all columns

In [None]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

#### Describe numeric columns

In [None]:
# Summary statistics for the training frame (default `describe` settings).
train_df.describe()

#### Describe categorical columns

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns.
train_df.describe(include=[object])

#### See column data types

In [None]:
# dtype of every column in the training frame.
train_df.dtypes

#### See how many rows and columns are in this dataset

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

#### See how many rows are left after we drop rows that contain missing values

In [None]:
# Shape after discarding every row that has at least one missing value
# (the result is not assigned, so `train_df` itself is untouched).
train_df.dropna(axis=0, how='any').shape

#### We lose 162 rows of data, 0.9% of the data. Not a big loss
#### We can see how many missing value we have in each column

In [None]:
# Number of missing values per training column.
train_df.isna().sum()

In [None]:
# Number of missing values per test column.
test_df.isna().sum()

#### For categorical variables, we can change their type to category to save memory (not necessary)

We can look at how much memory they are using now

In [None]:
# Bytes used by `living_status` (index included) before the category cast.
train_df['living_status'].memory_usage(index=True)

In [None]:
# Store `living_status` as a pandas categorical.
train_df['living_status'] = train_df['living_status'].astype('category')

In [None]:
# Bytes used by `claim_day_of_week` (index included) before the category cast.
train_df['claim_day_of_week'].memory_usage(index=True)

In [None]:
# Store `claim_day_of_week` as a pandas categorical.
train_df['claim_day_of_week'] = train_df['claim_day_of_week'].astype('category')

In [None]:
# Bytes used by `accident_site` (index included) before the category cast.
train_df['accident_site'].memory_usage(index=True)

In [None]:
# Store `accident_site` as a pandas categorical.
train_df['accident_site'] = train_df['accident_site'].astype('category')

#### Since the dataset is not huge, we will leave the following columns unchanged for now

In [None]:
# Memory footprint (index included) of the columns that are left as they are.
# Collected into one Series so that all three values are displayed: written as
# three bare expressions, only the last one would be rendered by the notebook.
pd.Series(
    {col: train_df[col].memory_usage()
     for col in ['channel', 'vehicle_category', 'vehicle_color']},
    name='memory_usage')

## Analyze by pivoting features and visualizing data

In [None]:
# Keep only the rows whose `fraud` value is not -1.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original (SettingWithCopyWarning).
train_df = train_df.loc[train_df['fraud'] != -1].copy()
# `train_df` now names a new object; refresh `combine` so loops over it
# transform the filtered frame rather than the unfiltered one.
combine = [train_df, test_df]

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently in `DataFrame.corr()` and raises on string data instead.
sns.heatmap(train_df.select_dtypes(include='number').corr())

In [None]:
# Histogram of driver age, one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'age_of_driver', bins=20)

In [None]:
# Mean `fraud` for each gender value.
train_df.groupby('gender', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each marital_status value.
train_df.groupby('marital_status', as_index=False)['fraud'].mean()

In [None]:
# Histogram of safty_rating, one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'safty_rating', bins=10)

In [None]:
# Histogram of annual_income (5000 bins), one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'annual_income', bins=5000)

In [None]:
# Mean `fraud` for each high_education_ind value.
train_df.groupby('high_education_ind', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each address_change_ind value.
train_df.groupby('address_change_ind', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each living_status value.
train_df.groupby('living_status', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each distinct claim_date.
train_df.groupby('claim_date', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each claim_day_of_week value.
train_df.groupby('claim_day_of_week', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each accident_site value.
train_df.groupby('accident_site', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each past_num_of_claims value, ordered by claim count.
fraud_by_past_claims = train_df.groupby('past_num_of_claims', as_index=False)['fraud'].mean()
fraud_by_past_claims.sort_values('past_num_of_claims')

In [None]:
# Mean `fraud` for each witness_present_ind value.
train_df.groupby('witness_present_ind', as_index=False)['fraud'].mean()

In [None]:
# Histogram of liab_prct (default binning), one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'liab_prct')

In [None]:
# Mean `fraud` for each channel value.
train_df.groupby('channel', as_index=False)['fraud'].mean()

#### Almost no difference, can be excluded

In [None]:
# Mean `fraud` for each policy_report_filed_ind value.
train_df.groupby('policy_report_filed_ind', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each age_of_vehicle value.
train_df.groupby('age_of_vehicle', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each vehicle_category value.
train_df.groupby('vehicle_category', as_index=False)['fraud'].mean()

In [None]:
# Histogram of vehicle_price (3000 bins), one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'vehicle_price', bins=3000)

In [None]:
# Mean `fraud` for each distinct vehicle_price.
train_df.groupby('vehicle_price', as_index=False)['fraud'].mean()

In [None]:
# Mean `fraud` for each vehicle_color value.
train_df.groupby('vehicle_color', as_index=False)['fraud'].mean()

In [None]:
# Histogram of vehicle_weight (3000 bins), one panel per `fraud` value.
grid = sns.FacetGrid(data=train_df, col='fraud')
grid.map(plt.hist, 'vehicle_weight', bins=3000)

In [None]:
# Mean `fraud` for each distinct vehicle_weight.
train_df.groupby('vehicle_weight', as_index=False)['fraud'].mean()

#### Difference is very small

## Wrangle Data

### age_of_driver
Replace ages over 100 with the mean age calculated from drivers aged 100 or under

In [None]:
# Mean driver age over the rows whose age is at most 100.
train_df.loc[train_df['age_of_driver'] <= 100, 'age_of_driver'].mean()

In [None]:
# Replacement value for ages above 100.
# NOTE(review): 43 is presumably the rounded training mean from the previous
# cell -- confirm if the data changes.
MEAN_DRIVER_AGE = 43

# Refresh `combine` before using it: `train_df` may have been re-bound since
# the list was created (the fraud != -1 filter produces a new frame), in which
# case the old list would band a stale frame and leave `train_df` unbanded.
combine = [train_df, test_df]

# Apply the same replacement to train and test so both are preprocessed alike.
for dataset in combine:
    dataset.loc[dataset['age_of_driver'] > 100, 'age_of_driver'] = MEAN_DRIVER_AGE

# Exploratory only: mean `fraud` across 7 equal-width age bands.
train_df['DriverAgeBand'] = pd.cut(train_df['age_of_driver'], 7)
train_df[['DriverAgeBand', 'fraud']].groupby(['DriverAgeBand'], as_index=False).mean().sort_values(by='DriverAgeBand', ascending=True)

# Encode age as an ordinal band 0-6. Intervals are right-closed:
# (-inf, 30] -> 0, (30, 41] -> 1, (41, 53] -> 2, (53, 64] -> 3,
# (64, 76] -> 4, (76, 88] -> 5, (88, inf) -> 6.
age_edges = [-np.inf, 30, 41, 53, 64, 76, 88, np.inf]
for dataset in combine:
    dataset['age_of_driver'] = pd.cut(dataset['age_of_driver'], bins=age_edges, labels=False)

# drop() returns a new frame, so `combine` must be rebuilt afterwards.
train_df = train_df.drop(['DriverAgeBand'], axis=1)
combine = [train_df, test_df]

In [None]:
# Preview the first five training rows after the age transformation.
train_df.head(n=5)

In [None]:
# Mean `fraud` for each age_of_driver value.
train_df.groupby('age_of_driver', as_index=False)['fraud'].mean()

### gender

In [None]:
# Encode gender as 1 for 'M' and 0 for 'F'; any other value becomes NaN.
gender_codes = {'M': 1, 'F': 0}
for dataset in (train_df, test_df):
    dataset['gender'] = dataset['gender'].map(gender_codes)

### marital_status

Because marital_status is a 1/0 indicator, it is hard for us to impute, and there are only 5 rows with missing values, so we will simply delete those rows

In [None]:
# Remove rows with a missing marital_status from both frames.
# The drop is done in place on purpose: `combine` holds references to these
# same objects, so re-binding `train_df`/`test_df` here would leave the loops
# over `combine` in later cells working on stale frames.
# NOTE(review): this also removes rows from the test set -- confirm that
# predictions are not required for those rows.
train_df.dropna(subset=['marital_status'],inplace=True)
test_df.dropna(subset=['marital_status'],inplace=True)

### Safty_rating, annual_income, high_education_ind, address_change_ind
These columns are good

In [None]:
# Summary statistics of safty_rating in the training data.
train_df['safty_rating'].describe()

In [None]:
# Exploratory only: mean `fraud` across 5 equal-width safty_rating bands.
train_df['SaftyBand'] = pd.cut(train_df['safty_rating'], bins=5)
train_df.groupby('SaftyBand', as_index=False)['fraud'].mean().sort_values(by='SaftyBand')

# Encode safty_rating as an ordinal band 0-4 in both frames.
# Each entry is (exclusive lower bound, inclusive upper bound, band code).
safty_inner_bands = [(20, 40, 1), (40, 60, 2), (60, 80, 3)]
for dataset in combine:
    dataset.loc[dataset['safty_rating'] <= 20, 'safty_rating'] = 0
    for low, high, band in safty_inner_bands:
        in_band = (dataset['safty_rating'] > low) & (dataset['safty_rating'] <= high)
        dataset.loc[in_band, 'safty_rating'] = band
    dataset.loc[dataset['safty_rating'] > 80, 'safty_rating'] = 4

# drop() returns a new frame, so `combine` is rebuilt afterwards.
train_df = train_df.drop(columns=['SaftyBand'])
combine = [train_df, test_df]

In [None]:
# Mean `fraud` for each safty_rating value.
train_df.groupby('safty_rating', as_index=False)['fraud'].mean()

### annual_income

In [None]:
# Mean annual income over the rows with a strictly positive income.
train_df.loc[train_df['annual_income'] > 0, 'annual_income'].mean()

In [None]:
# Replacement value for negative incomes.
# NOTE(review): 37398 is presumably the rounded training mean of the positive
# incomes from the previous cell -- confirm if the data changes.
MEAN_ANNUAL_INCOME = 37398

# Make sure `combine` refers to the current frames before transforming them.
combine = [train_df, test_df]

# Apply the same replacement to train and test; the original cell fixed the
# training frame only, so negative test incomes all ended up in band 0.
for dataset in combine:
    dataset.loc[dataset['annual_income'] < 0, 'annual_income'] = MEAN_ANNUAL_INCOME

# Exploratory only: mean `fraud` across 5 equal-width income bands.
train_df['annual_income_band'] = pd.cut(train_df['annual_income'], 5)
train_df[['annual_income_band', 'fraud']].groupby(['annual_income_band'], as_index=False).mean().sort_values(by='annual_income_band', ascending=True)

# Encode income as an ordinal band 0-5. Intervals are right-closed:
# (-inf, 28895] -> 0, (28895, 34972] -> 1, (34972, 36890] -> 2,
# (36890, 38282] -> 3, (38282, 39707] -> 4, (39707, inf) -> 5.
income_edges = [-np.inf, 28895, 34972, 36890, 38282, 39707, np.inf]
for dataset in combine:
    dataset['annual_income'] = pd.cut(
        dataset['annual_income'], bins=income_edges, labels=False).astype(int)

# drop() returns a new frame, so `combine` must be rebuilt afterwards.
train_df = train_df.drop(['annual_income_band'], axis=1)
combine = [train_df, test_df]

In [None]:
# Preview the first five training rows after the income transformation.
train_df.head(n=5)

### Living_status

In [None]:
# Encode living_status as 1 for 'Own' and 0 for 'Rent'.
living_status_codes = {'Own': 1, 'Rent': 0}
for dataset in (train_df, test_df):
    dataset['living_status'] = dataset['living_status'].map(living_status_codes)

### Zip_code is a little hard for me to use right now, so we drop it for now

In [None]:
# Remove zip_code from both frames. Done in place so the objects referenced
# by `combine` stay the ones being modified.
for dataset in (train_df, test_df):
    dataset.drop(columns='zip_code', inplace=True)

### Claim_date
Extract month

In [None]:
# Replace claim_date with the month number of the parsed date.
for dataset in (train_df, test_df):
    parsed_dates = pd.to_datetime(dataset['claim_date'])
    dataset['claim_date'] = parsed_dates.dt.month

### claim_day_of_week

In [None]:
# Encode the weekday as an integer, Monday=0 ... Sunday=6 (the numbering used
# by `Series.dt.dayofweek`).
# The weekday cannot be derived from `claim_date` here: that column was already
# reduced to a month number in the previous cell, and pd.to_datetime would read
# those integers as nanoseconds after the Unix epoch, giving the same weekday
# for every row. The existing weekday labels are mapped instead.
# NOTE(review): assumes claim_day_of_week holds full English day names
# ('Monday', ...); any other value becomes NaN -- confirm against the raw data.
day_codes = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
    'Friday': 4, 'Saturday': 5, 'Sunday': 6,
}
for dataset in (train_df, test_df):
    dataset['claim_day_of_week'] = dataset['claim_day_of_week'].map(day_codes)

### accident_site

In [None]:
# Encode accident_site as Local=1, Parking Lot=2, Highway=3.
accident_site_codes = {'Local': 1, 'Parking Lot': 2, 'Highway': 3}
for dataset in (train_df, test_df):
    dataset['accident_site'] = dataset['accident_site'].map(accident_site_codes)

### witness_present_ind

In [None]:
# Fill a missing witness indicator with 0 in both frames.
for dataset in (train_df, test_df):
    dataset['witness_present_ind'] = dataset['witness_present_ind'].fillna(0)

### liab_prct

In [None]:
# Exploratory only: mean `fraud` across 4 equal-width liab_prct bands.
train_df['LiabBand'] = pd.cut(train_df['liab_prct'], bins=4)
train_df.groupby('LiabBand', as_index=False)['fraud'].mean().sort_values(by='LiabBand')

# Encode liab_prct as an ordinal band 0-3 in both frames.
# Each entry is (exclusive lower bound, inclusive upper bound, band code).
liab_inner_bands = [(25, 50, 1), (50, 75, 2)]
for dataset in combine:
    dataset.loc[dataset['liab_prct'] <= 25, 'liab_prct'] = 0
    for low, high, band in liab_inner_bands:
        in_band = (dataset['liab_prct'] > low) & (dataset['liab_prct'] <= high)
        dataset.loc[in_band, 'liab_prct'] = band
    dataset.loc[dataset['liab_prct'] > 75, 'liab_prct'] = 3
    dataset['liab_prct'] = dataset['liab_prct'].astype(int)

# drop() returns a new frame, so `combine` is rebuilt afterwards.
train_df = train_df.drop(columns=['LiabBand'])
combine = [train_df, test_df]

### channel

In [None]:
# Encode channel as Broker=1, Phone=2, Online=3.
channel_codes = {'Broker': 1, 'Phone': 2, 'Online': 3}
for dataset in (train_df, test_df):
    dataset['channel'] = dataset['channel'].map(channel_codes)

### claim_est_payout

In [None]:
# Mean claim_est_payout over the rows with a payout above 100, for each frame.
# Returned as one Series so both means are displayed; as two bare expressions
# only the test-set value would be rendered by the notebook.
pd.Series(
    {
        'train': train_df.loc[train_df['claim_est_payout'] > 100, 'claim_est_payout'].mean(),
        'test': test_df.loc[test_df['claim_est_payout'] > 100, 'claim_est_payout'].mean(),
    },
    name='mean claim_est_payout (> 100)')

In [None]:
# Fill missing payouts. Assign the result back instead of calling
# `df.col.fillna(..., inplace=True)`: that is a chained in-place operation on an
# intermediate Series, which recent pandas versions warn about and, with
# copy-on-write, do not propagate to the frame.
# NOTE(review): train and test use different fill values (4976 vs 6758),
# kept as in the original -- confirm that a test-derived value is intended.
train_df['claim_est_payout'] = train_df['claim_est_payout'].fillna(4976)
test_df['claim_est_payout'] = test_df['claim_est_payout'].fillna(6758)

# Make sure `combine` refers to the current frames before transforming them.
combine = [train_df, test_df]

# Exploratory only: mean `fraud` across 5 equal-width payout bands.
train_df['claim_est_payout_band'] = pd.cut(train_df['claim_est_payout'], 5)
train_df[['claim_est_payout_band', 'fraud']].groupby(['claim_est_payout_band'], as_index=False).mean().sort_values(by='claim_est_payout_band', ascending=True)

# Encode the payout as an ordinal band 0-4. Intervals are right-closed:
# (-inf, 3065] -> 0, (3065, 4141] -> 1, (4141, 5226] -> 2,
# (5226, 6684] -> 3, (6684, inf) -> 4.
payout_edges = [-np.inf, 3065, 4141, 5226, 6684, np.inf]
for dataset in combine:
    dataset['claim_est_payout'] = pd.cut(
        dataset['claim_est_payout'], bins=payout_edges, labels=False).astype(int)

# drop() returns a new frame, so `combine` must be rebuilt afterwards.
train_df = train_df.drop(['claim_est_payout_band'], axis=1)
combine = [train_df, test_df]

### age_of_vehicle

In [None]:
# Summary statistics of age_of_vehicle for both frames, side by side.
# Combined into one table so both are displayed; as two bare expressions only
# the test-set summary would be rendered by the notebook.
pd.concat(
    [train_df['age_of_vehicle'].describe(), test_df['age_of_vehicle'].describe()],
    axis=1, keys=['train', 'test'])

In [None]:
# Fill a missing age_of_vehicle with 5 in both frames.
# The result is assigned back rather than using `df.col.fillna(..., inplace=True)`:
# that chained in-place call acts on an intermediate Series, which recent pandas
# versions warn about and, with copy-on-write, do not propagate to the frame.
for dataset in (train_df, test_df):
    dataset['age_of_vehicle'] = dataset['age_of_vehicle'].fillna(5)

### vehicle_category

In [None]:
# Encode vehicle_category as Compact=1, Large=2, Medium=3.
vehicle_category_codes = {'Compact': 1, 'Large': 2, 'Medium': 3}
for dataset in (train_df, test_df):
    dataset['vehicle_category'] = dataset['vehicle_category'].map(vehicle_category_codes)

### vehicle_price

In [None]:
# Exploratory only: mean `fraud` across 7 equal-width vehicle_price bands.
train_df['vehicle_price_band'] = pd.cut(train_df['vehicle_price'], bins=7)
train_df.groupby('vehicle_price_band', as_index=False)['fraud'].mean().sort_values(by='vehicle_price_band')

# Encode vehicle_price as an ordinal band 0-6 in both frames.
# Each entry is (exclusive lower bound, inclusive upper bound, band code).
price_inner_bands = [
    (20258, 38059, 1),
    (38059, 55859, 2),
    (55859, 73660, 3),
    (73660, 91461, 4),
    (91461, 109262, 5),
]
for dataset in combine:
    dataset.loc[dataset['vehicle_price'] <= 20258, 'vehicle_price'] = 0
    for low, high, band in price_inner_bands:
        in_band = (dataset['vehicle_price'] > low) & (dataset['vehicle_price'] <= high)
        dataset.loc[in_band, 'vehicle_price'] = band
    dataset.loc[dataset['vehicle_price'] > 109262, 'vehicle_price'] = 6
    dataset['vehicle_price'] = dataset['vehicle_price'].astype(int)

# drop() returns a new frame, so `combine` is rebuilt afterwards.
train_df = train_df.drop(columns=['vehicle_price_band'])
combine = [train_df, test_df]

### vehicle_color

In [None]:
# Encode vehicle_color as an integer code 1-7.
vehicle_color_codes = {
    'black': 1, 'silver': 2, 'white': 3, 'red': 4,
    'blue': 5, 'gray': 6, 'other': 7,
}
for dataset in (train_df, test_df):
    dataset['vehicle_color'] = dataset['vehicle_color'].map(vehicle_color_codes)

### vehicle_weight

In [None]:
# Exploratory only: mean `fraud` across 5 equal-width vehicle_weight bands.
train_df['vehicle_weight_band'] = pd.cut(train_df['vehicle_weight'], bins=5)
train_df.groupby('vehicle_weight_band', as_index=False)['fraud'].mean().sort_values(by='vehicle_weight_band')

# Encode vehicle_weight as an ordinal band 0-4 in both frames.
# Each entry is (exclusive lower bound, inclusive upper bound, band code).
weight_inner_bands = [(26546, 50664, 1), (50664, 74781, 2), (74781, 98899, 3)]
for dataset in combine:
    dataset.loc[dataset['vehicle_weight'] <= 26546, 'vehicle_weight'] = 0
    for low, high, band in weight_inner_bands:
        in_band = (dataset['vehicle_weight'] > low) & (dataset['vehicle_weight'] <= high)
        dataset.loc[in_band, 'vehicle_weight'] = band
    dataset.loc[dataset['vehicle_weight'] > 98899, 'vehicle_weight'] = 4
    dataset['vehicle_weight'] = dataset['vehicle_weight'].astype(int)

# drop() returns a new frame, so `combine` is rebuilt afterwards.
train_df = train_df.drop(columns=['vehicle_weight_band'])
combine = [train_df, test_df]

## Model, predict and solve

In [None]:
# Preview the first five rows of the prepared training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the prepared test frame.
test_df.head(n=5)

In [None]:
# Feature matrix: every training column except the target.
X = train_df.drop(columns='fraud')
# Target as a 1-D NumPy array (a copy of the `fraud` column).
y = np.array(train_df['fraud'])

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 29.
# NOTE(review): 29 is presumably the best value reported by the grid search
# in the following cells -- confirm after re-running it.
knn = KNeighborsClassifier(n_neighbors = 29)

In [None]:
# Candidate neighbour counts 1..30, scored by 10-fold cross-validated ROC AUC.
k_range = range(1, 31)
para_grid = {'n_neighbors': k_range}

grid = GridSearchCV(estimator=knn, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)

In [None]:
# Run the KNN grid search on the full training data.
grid.fit(X, y)

In [None]:
# Best mean cross-validated ROC AUC, then the parameters that achieved it.
print(grid.best_score_, grid.best_params_, sep='\n')

In [None]:
# Per-fold ROC AUC of the KNN model (10-fold CV) and their mean.
scores = cross_val_score(estimator=knn, X=X, y=y, scoring='roc_auc', cv=10)
print(scores)

scores.mean()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and C = 0.1.
# NOTE(review): C = 0.1 is presumably taken from the grid search in the
# following cells -- confirm after re-running it.
lr = LogisticRegression(C=0.1, solver='liblinear')

In [None]:
# Candidate C values on a log scale, scored by 10-fold cross-validated ROC AUC.
C_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
para_grid = {'C': C_range}

grid = GridSearchCV(estimator=lr, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)

In [None]:
# Run the logistic-regression grid search on the full training data.
grid.fit(X, y)

In [None]:
# Best mean cross-validated ROC AUC, then the parameters that achieved it.
print(grid.best_score_, grid.best_params_, sep='\n')

In [None]:
# Per-fold ROC AUC of the logistic regression (10-fold CV) and their mean.
scores2 = cross_val_score(estimator=lr, X=X, y=y, scoring='roc_auc', cv=10)
print(scores2)

scores2.mean()

### Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings.
gnb = GaussianNB()

In [None]:
# Per-fold ROC AUC of the naive Bayes model (10-fold CV) and their mean.
scores3 = cross_val_score(estimator=gnb, X=X, y=y, scoring='roc_auc', cv=10)
print(scores3)

scores3.mean()

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with hand-set hyper-parameters.
# NOTE(review): the values are presumably taken from the one-at-a-time grid
# searches in the following cells -- confirm after re-running them.
# random_state is fixed because max_features=13 makes the tree consider a random
# subset of features at each split; without a seed the cross-validation scores
# change from run to run.
dtc = DecisionTreeClassifier(max_depth=7, min_samples_split=0.1, min_samples_leaf=0.1,
                             max_features=13, random_state=42)

In [None]:
# Search max_depth over 1..30 for the decision tree (10-fold CV, ROC AUC).
md = range(1, 31)
para_grid = {'max_depth': md}

grid = GridSearchCV(estimator=dtc, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Search min_samples_split (as a fraction of the samples) for the decision tree.
mss = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
para_grid = {'min_samples_split': mss}

grid = GridSearchCV(estimator=dtc, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Search min_samples_leaf over five evenly spaced fractions from 0.1 to 0.5
# (both ends included) for the decision tree.
msl = np.linspace(0.1, 0.5, num=5)
para_grid = {'min_samples_leaf': msl}

grid = GridSearchCV(estimator=dtc, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Search max_features for the decision tree (10-fold CV, ROC AUC).
# The upper bound is X.shape[1] + 1 so that "use every feature" is part of the
# search; range(1, X.shape[1]) stopped one short of the number of columns.
mf = range(1, X.shape[1] + 1)
para_grid = dict(max_features=mf)

grid = GridSearchCV(dtc, para_grid, cv=10, scoring='roc_auc',n_jobs=-1)

grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Per-fold ROC AUC of the decision tree (10-fold CV) and their mean.
scores4 = cross_val_score(estimator=dtc, X=X, y=y, scoring='roc_auc', cv=10)
print(scores4)

scores4.mean()

### Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with hand-set max_features and max_depth.
# NOTE(review): the values are presumably taken from the grid searches in the
# following cells -- confirm after re-running them.
# random_state is fixed because the forest bootstraps rows and samples features
# at random; without a seed the cross-validation scores change on every run.
rfc = RandomForestClassifier(max_features=16, max_depth=6, random_state=42)

In [None]:
# Search the number of trees (200, 400, ..., 2000) for the random forest.
n_estimators = range(200, 2200, 200)
para_grid = {'n_estimators': n_estimators}

grid = GridSearchCV(estimator=rfc, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Search max_features for the random forest (10-fold CV, ROC AUC).
# The upper bound is X.shape[1] + 1 so that "use every feature" is part of the
# search; range(1, X.shape[1]) stopped one short of the number of columns.
mf = range(1, X.shape[1] + 1)
para_grid = dict(max_features=mf)

grid = GridSearchCV(rfc, para_grid, cv=10, scoring='roc_auc',n_jobs=-1)

grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Search max_depth over 1..30 for the random forest (10-fold CV, ROC AUC).
md = range(1, 31)
para_grid = {'max_depth': md}

grid = GridSearchCV(estimator=rfc, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Per-fold ROC AUC of the random forest (10-fold CV) and their mean.
scores5 = cross_val_score(estimator=rfc, X=X, y=y, scoring='roc_auc', cv=10)
print(scores5)

scores5.mean()

### Gradient Boosted Decision Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with learning_rate = 0.1.
# random_state is fixed so the fitted model, its cross-validation scores and
# its feature importances are reproducible across runs.
gbc = GradientBoostingClassifier(learning_rate=0.1, random_state=42)

In [None]:
# Search the learning rate for gradient boosting (10-fold CV, ROC AUC).
# The candidate list is named `learning_rates` rather than `lr`: `lr` is the
# LogisticRegression estimator defined earlier, and re-binding it to a list
# would break any later or re-run cell that uses that model.
learning_rates = [0.01, 0.1, 1, 10, 100]
para_grid = dict(learning_rate=learning_rates)

grid = GridSearchCV(gbc, para_grid, cv=10, scoring='roc_auc',n_jobs=-1)

grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Per-fold ROC AUC of gradient boosting (10-fold CV) and their mean.
scores6 = cross_val_score(estimator=gbc, X=X, y=y, scoring='roc_auc', cv=10)
print(scores6)

scores6.mean()

In [None]:
# Fit gradient boosting on the full training data and print the importances.
# The model is fitted on `X`: the original referenced `X_scaled`, which is not
# defined anywhere in the visible notebook (NameError on a fresh kernel), and
# the next cell labels the importances with `X.columns`.
gbc.fit(X, y)
print(gbc.feature_importances_)

In [None]:
# Bar chart of the gradient-boosting feature importances, largest first.
feat_importances = pd.Series(gbc.feature_importances_, index=X.columns)
feat_importances = feat_importances.sort_values(ascending=False)
feat_importances.plot.bar()

### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default architecture.
# random_state is fixed because the weights are initialised at random; without
# a seed the cross-validation scores change on every run.
mlp = MLPClassifier(random_state=42)

In [None]:
# Search the hidden-layer activation function for the MLP (10-fold CV, ROC AUC).
act = ['relu', 'logistic', 'tanh']
para_grid = {'activation': act}

grid = GridSearchCV(estimator=mlp, param_grid=para_grid, scoring='roc_auc', cv=10, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# Per-fold ROC AUC of the neural network (10-fold CV) and their mean.
# Scores `mlp`: the original passed `gbc` here, so the "Neural Network" row of
# the model comparison was a second gradient-boosting score.
scores7 = cross_val_score(mlp, X, y, cv=10, scoring='roc_auc')
print(scores7)

np.mean(scores7)

### XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

# Hyper-parameters for a small XGBoost model: 5 boosting rounds of depth-2
# trees with learning rate 1.0 and a binary logistic objective.
# NOTE(review): `silent` was replaced by `verbosity` in later XGBoost
# releases -- confirm against the installed version.
params = {
    'objective': 'binary:logistic',
    'max_depth': 2,
    'learning_rate': 1.0,
    'silent': 1.0,
    'n_estimators': 5
}

xgb = XGBClassifier(**params)

In [None]:
# Per-fold ROC AUC of the XGBoost model (10-fold CV) and their mean.
scores8 = cross_val_score(estimator=xgb, X=X, y=y, scoring='roc_auc', cv=10)
print(scores8)

scores8.mean()

## Model Comparison

In [None]:
# Mean 10-fold ROC AUC per model, best model first.
# Names and score arrays are paired in one dict so a label cannot drift out of
# step with its score; the "Boosted" label typo is also corrected.
cv_scores_by_model = {
    'KNN': scores,
    'Logistic Regression': scores2,
    'Naive Bayes': scores3,
    'Decision Tree': scores4,
    'Random Forest': scores5,
    'Gradient Boosted Decision Trees': scores6,
    'Neural Network': scores7,
    'XGBoost': scores8,
}
models = pd.DataFrame({
    'Model': list(cv_scores_by_model),
    'Score': [np.mean(fold_scores) for fold_scores in cv_scores_by_model.values()]})
models.sort_values(by='Score', ascending=False)