In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score,precision_score, balanced_accuracy_score, log_loss, confusion_matrix,classification_report

In [None]:
import warnings
# NOTE(review): with no category given, this suppresses every warning for the
# rest of the session, including any raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw CSV. Set the TRAVEL_INSURANCE_CSV environment variable to
# run the notebook on another machine; the fallback is the original path.
DATA_PATH = os.environ.get(
    'TRAVEL_INSURANCE_CSV',
    '/Users/priyankac/Downloads/travel insurance.csv',
)
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# Per-column non-null counts, dtypes and memory usage.
data.info()

### Define target Variable

In [None]:
# Row count per value of the raw 'Claim' label.
data['Claim'].value_counts()

# The dataset appears to be heavily imbalanced,
# so a rebalancing technique will be needed before modelling.

In [None]:
# Bar chart of the class counts of the raw 'Claim' label.
claim_counts = data['Claim'].value_counts()
claim_counts.plot.bar(color='tomato')
plt.title('Insurance Claimed')
plt.show()

In [None]:
# Binary target: 1 where Claim is 'Yes', 0 for everything else.
data['target'] = (data['Claim'] == 'Yes').astype(int)

### Drop the Claim feature to retain the target column

In [None]:
# 'Claim' is now encoded in 'target', so remove the original label column.
data = data.drop(columns=['Claim'])

In [None]:
# Confirm 'Claim' is gone and 'target' is present.
data.dtypes

### Defining the Target and Independent Features

In [None]:
# Features are every column except 'target'; Y stays a one-column DataFrame.
X = data.drop(columns=['target'])
Y = data.loc[:, ['target']]

### Get the claim rate 

In [None]:
# Mean of the 0/1 target, i.e. the share of rows whose Claim was 'Yes'.
Y.mean()

### Split features into Numerical and Categorical

In [None]:
# Partition the feature frame by dtype: numeric columns vs. object (string) columns.
df_num = X.select_dtypes('number')
df_char = X.select_dtypes('object')

In [None]:
def unique_levels(x):
    """Return the number of distinct non-null values in the Series ``x``."""
    return x.nunique()

# One row per numeric feature holding its number of distinct values.
df_value_counts = df_num.apply(unique_levels).to_frame()

In [None]:
# Give the single count column a readable name, then preview it.
df_value_counts.columns = ['feature_levels']
df_value_counts.head()

In [None]:
def univariateAnalysis_numeric(column, nbins):
    """Plot a histogram and a box plot for one column of the global ``data``.

    Parameters
    ----------
    column : str
        Name of the column in ``data`` to plot.
    nbins : int
        Number of histogram bins.

    Returns
    -------
    None
        The two figures are shown as a side effect.
    """
    plt.figure()
    print("Distribution of " + column)
    print("---------------------------------------------")
    # histplot replaces the deprecated distplot (needs seaborn >= 0.11) and
    # actually honours nbins, which the previous call ignored.
    sns.histplot(data[column], bins=nbins, kde=False, color='g')
    plt.show()

    plt.figure()
    print("BoxPlot of " + column)
    print("---------------------------------------------")
    sns.boxplot(x=data[column])
    plt.show()

In [None]:
# Histogram + box plot for every numeric feature, 20 bins each.
lstnumericcolumns = df_num.columns.tolist()
for column_name in lstnumericcolumns:
    univariateAnalysis_numeric(column_name, nbins=20)

In [None]:
# Pairwise scatter plots for every int64/float64 column of the dataset.
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
sns.pairplot(data[numeric_cols])
plt.show()

### Outlier Analysis

In [None]:
# Summary statistics with extra tail percentiles, to see how extreme the ends are.
df_num.describe(percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75,0.99])

In [None]:
# Box plots of every int64/float64 column to eyeball outliers.
continuous = data.select_dtypes(include=['int64', 'float64']).columns
data_plot = data.loc[:, continuous]

data_plot.boxplot(figsize=(15, 10), rot=90);

### Capping and Flooring of Outliers

In [None]:
def outlier_cap(x, lower_q=0.01, upper_q=0.99):
    """Floor and cap a numeric Series at two of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to limit.
    lower_q : float, default 0.01
        Quantile used as the floor.
    upper_q : float, default 0.99
        Quantile used as the cap.

    Returns
    -------
    pandas.Series
        Copy of ``x`` with values outside the two quantiles clipped to them.
    """
    # Take both bounds from the untouched data. Computing the cap after
    # flooring lets the floor shift the upper quantile on small samples.
    floor = x.quantile(lower_q)
    cap = x.quantile(upper_q)
    return x.clip(lower=floor, upper=cap)

In [None]:
# Cap and floor every numeric column independently.
df_num = df_num.apply(outlier_cap)

In [None]:
# Same summary after capping, to confirm the tails were pulled in.
df_num.describe(percentiles = [0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.90,0.99])

### Missing Values Check

In [None]:
# Fraction of missing values per numeric column.
df_num.isnull().mean()

In [None]:
# Fraction of missing values per categorical column.
df_char.isnull().mean()

# The Gender column has 71% missing values, so it will be dropped.

In [None]:
# Remove 'Gender' from the categorical feature frame.
df_char = df_char.drop(columns=['Gender'])

In [None]:
# Remaining categorical columns after dropping 'Gender'.
df_char.dtypes

## Feature Selection - Numerical Features

### Part 1 : Remove Features with 0 Variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit only: the transformed array is not needed, just the column mask.
varselector = VarianceThreshold(threshold = 0)
varselector.fit(df_num)

# Positions of the columns whose variance is above the threshold,
# and a frame restricted to those columns.
cols = varselector.get_support(indices = True)
num_1 = df_num[df_num.columns[cols]]

In [None]:
# First row of the capped numeric frame.
df_num.iloc[0]

### Part 2 : Bivariate Analysis

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Ordinal-encode each numeric feature into 10 quantile bins.
discrete = KBinsDiscretizer(n_bins = 10, encode = 'ordinal', strategy = 'quantile')
binned_values = discrete.fit_transform(df_num)
rank_columns = [f'{name}_Rank' for name in df_num.columns]
num_binned = pd.DataFrame(binned_values, index = df_num.index, columns = rank_columns)
num_binned.tail()

In [None]:
# Check whether each feature shows a slope in event rate across its bins.
# If it does, are some bins below the population average and others above it?
# If so, the slope is strong.

# Conclusion: a strong slope indicates the feature's ability to discriminate
#             events from non-events, making it a good predictor.

X_bin_combined = pd.concat([Y, num_binned], axis = 1, join = 'inner')
overall_event_rate = X_bin_combined['target'].mean()

from numpy import mean
for col in (num_binned.columns):
    plt.figure()
    ax = sns.barplot(x = col, y = 'target', data = X_bin_combined, estimator = mean)
    # Population average as a reference line. axhline draws the constant
    # directly instead of asking lineplot to aggregate a scalar over every row.
    ax.axhline(overall_event_rate, color = 'red')
plt.show()

In [None]:
# Since there are only a few features and all of them show some slope, all will be kept.

In [None]:
# Use the variance-filtered frame, so the zero-variance removal step is
# actually applied; every column that survived it is kept.
select_features_df_num = num_1
select_features_df_num.shape

In [None]:
# Correlation heatmap of the numeric features, rounded to two decimals.
corr_matrix = df_num.corr().round(2)
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, fmt='0.2f', cmap='YlGnBu')
plt.show()

## Feature Selection - Categorical Features

In [None]:
def unique_levels(x):
    """Count the distinct non-null values of the Series ``x``.

    Same behaviour as the ``unique_levels`` defined in the numeric
    feature section; this definition rebinds the name.
    """
    return x.nunique(dropna=True)

# One row per categorical feature holding its number of distinct values.
char_unique_levels = df_char.apply(unique_levels).to_frame()

In [None]:
# Give the single count column a readable name, then preview it.
char_unique_levels.columns = ['feature_levels']
char_unique_levels.head()

In [None]:
# Keep only categorical features that have more than one level.
has_multiple_levels = char_unique_levels['feature_levels'] > 1
slice1 = char_unique_levels.loc[has_multiple_levels]
cat_list = slice1.index
df_char = df_char[cat_list]

### Part 1: Bivariate Analysis

In [None]:
# Event rate per level of each categorical feature, with the population
# average drawn as a red reference line.
X_char_merged = pd.concat([Y, df_char], axis = 1, join = 'inner')
overall_event_rate = X_char_merged['target'].mean()

from numpy import mean
for col in (df_char.columns):
    plt.figure()
    ax = sns.barplot(x = col, y = 'target', data = X_char_merged, estimator = mean)
    # axhline draws the constant directly instead of asking lineplot to
    # aggregate a scalar over every row of every level.
    ax.axhline(overall_event_rate, color = 'red')
plt.show()

In [None]:
# Alias only (no copy): char_1 and df_char refer to the same DataFrame.
char_1 = df_char

In [None]:
# Categorical columns that go into dummy encoding.
char_1.dtypes

In [None]:
# Create dummy features with n-1 levels per categorical column
# (drop_first=True omits the first level of each).
X_char_dum = pd.get_dummies(char_1, drop_first = True)
X_char_dum.shape

### Part 2 : Select K Best

In [None]:
# Select K Best for Categorical Features
from sklearn.feature_selection import SelectKBest, chi2

# Never request more features than there are dummy columns; asking for 120
# when fewer exist is rejected or warned about, depending on the
# scikit-learn version.
k_best = min(120, X_char_dum.shape[1])
selector = SelectKBest(chi2, k = k_best)
# Fit only: the transformed array is not needed, and passing the target as a
# 1-D Series avoids the column-vector conversion.
selector.fit(X_char_dum, Y['target'])

# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices = True)
select_features_df_char = X_char_dum.iloc[:, cols]

In [None]:
# First row of the selected dummy columns.
select_features_df_char.iloc[0]

## Creating the Master Feature Set for Model Development

In [None]:
# Column-wise join of the selected categorical dummies and numeric features,
# keeping only index labels present in both frames.
X_all = pd.concat([select_features_df_char, select_features_df_num], axis = 1, join = 'inner')

In [None]:
!pip install imbalanced-learn

In [None]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE (fixed seed) and report the shapes and class balance.
# NOTE(review): Xb/yb are built from the full dataset; if they are split into
# train/test afterwards, synthetic rows derived from test rows can end up in
# training. Confirm the evaluation setup.
sm = SMOTE(random_state=42)
Xb, yb = sm.fit_resample(X_all,Y)
print(f'''Shape of X before SMOTE: {X_all.shape}
Shape of X after SMOTE: {Xb.shape}''')
print('\nBalance of positive and negative classes (%):')
yb.value_counts(normalize=True) * 100

## Train Test Split

In [None]:
# Split the original (un-resampled) data first, so the test set keeps the real
# class balance and contains no synthetic rows. Splitting an already
# oversampled set leaks information from test rows into training and
# inflates every metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_all, Y, test_size=0.3, random_state=42, shuffle=True, stratify=Y
)

# Oversample the minority class in the training partition only.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

## Model Building

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit each baseline classifier and collect its test-set metrics in `values`,
# one row per model:
# [name, f1, roc_auc, recall, precision, balanced_accuracy, log_loss].
values = []
models = [
    RandomForestClassifier(random_state=42),  # seeded so reruns give the same scores
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
]
for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # Positive-class probabilities: ROC-AUC and log-loss are ranking and
    # probability metrics, and are not meaningful on hard 0/1 predictions.
    y_proba = m.predict_proba(X_test)[:, 1]
    print(m)
    # Print the whole report; indexing it with [1] printed a single character.
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    values.append([
        type(m).__name__,  # full class name instead of a 10-character prefix
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_proba),
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        balanced_accuracy_score(y_test, y_pred),
        log_loss(y_test, y_proba),
    ])
    print('==========================================================')


In [None]:
# Build the results table without mutating `values`, so re-running this cell
# does not insert the header row into the data.
result_columns = ['Model','f1_score','roc_auc_score','recall_score','precision_score','balanced_accuracy_score','log_loss']
results = pd.DataFrame(values, columns=result_columns)

In [None]:
# First half of the metrics table: model name, F1, ROC-AUC and recall.
results[['Model','f1_score','roc_auc_score','recall_score']]


In [None]:
# Second half of the metrics table (rows are in the same order as above).
results[['precision_score','balanced_accuracy_score','log_loss']]
