# Prudential Life Insurance Dataset

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
#loading dataset
df = pd.read_csv('prudential-life-insurance-assessment/train.csv')

# Data Exploration 
### The following variables are all categorical (nominal):

Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41

### The following variables are continuous:

Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5

### The following variables are discrete:

Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32

### Medical_Keyword_1-48 are dummy variables.

### Understanding the data

In [None]:
df.shape

In [None]:
df.tail()

In [None]:
df.info()

## Determine which columns contain null values
### Summarise how many missing values are present in each column

In [None]:
# Labels of the columns that hold at least one missing value
cols_with_nan = df.columns[df.isnull().any()].tolist()

# Number of missing entries in each of those columns
df[cols_with_nan].isna().sum()

### Converting the number of null values in columns to percentage

In [None]:
# Percentage of missing entries per column
percent_missing = df.isnull().sum() * 100 / len(df)
# Columns whose missing share is above 30 percent (a value above 30 is never 0,
# so a single comparison is enough)
list_to_drop = percent_missing[percent_missing > 30].index.tolist()

### Dropping columns with more than 30% null values

In [None]:
print(list_to_drop)
df.drop(list_to_drop, axis = 1, inplace= True)

In [None]:
df

### Set Id as index and drop the column Id

In [None]:
# Passing the column label (rather than the Series) makes set_index move 'Id'
# into the index and remove it from the columns in a single step.
df.set_index('Id', inplace=True)

In [None]:

df

In [None]:
numOfCol = df.columns.size
print(numOfCol)

### Columns in new Dataframe with null values 

In [None]:
# Labels of the remaining columns that still hold at least one missing value
cols_with_nan = df.columns[df.isnull().any()].tolist()

# Number of missing entries in each of those columns
df[cols_with_nan].isna().sum()

### Imputing columns that contain null values

In [None]:
columns_to_be_imputed = ['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6', 'Medical_History_1']

In [None]:
df['Employment_Info_1'].unique()

In [None]:
# mean is highly influenced by outliers, so going with median to impute the missing data
# plt.figure (not plt.plot) is the call that creates a canvas of the requested size;
# plt.plot with only a figsize keyword draws nothing and leaves the size unchanged.
plt.figure(figsize=(20, 10))
sns.boxplot(df['Employment_Info_1'])

In [None]:
# Distribution plot of Employment_Info_1.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; the warning is
# hidden by warnings.filterwarnings('ignore') at the top — consider
# sns.histplot(..., kde=True) and confirm against the installed seaborn version.
sns.distplot(df['Employment_Info_1'])

In [None]:
(df['Employment_Info_4'].unique())

In [None]:
sns.boxplot(df['Employment_Info_4'])

In [None]:
sns.distplot(df['Employment_Info_4'])

In [None]:
df['Employment_Info_6'].unique()

In [None]:
sns.boxplot(df['Employment_Info_6'])

In [None]:
sns.distplot(df['Employment_Info_6'])

In [None]:
df['Medical_History_1'].unique() # -- discrete

In [None]:
sns.boxplot(df['Medical_History_1'])

In [None]:
sns.distplot(df['Medical_History_1'])

## Imputing with the median where outliers are present, and with the mean otherwise

In [None]:
# Fill the gaps column by column: Employment_Info_6 gets its mean,
# every other listed column gets its median.
for column in columns_to_be_imputed:
    if column == 'Employment_Info_6':
        fill_value = df[column].mean()
    else:
        fill_value = df[column].median()
    df[column] = df[column].fillna(fill_value)

In [None]:
# Sanity check after imputation: labels of columns that still contain missing values
cols_with_nan = df.columns[df.isnull().any()].tolist()

# Number of missing entries in each of those columns
df[cols_with_nan].isna().sum()

### Converting Remaining Categorical Columns Using One-Hot Encoding

In [None]:
# Labels of every column stored with the generic 'object' dtype
categorical_Col = [label for label, dtype in df.dtypes.items() if dtype == 'object']
print('categorical columns:::', categorical_Col)

In [None]:
print(df['Product_Info_2'].unique())

In [None]:
df = pd.get_dummies(df, columns=['Product_Info_2'])
df

### Determining Correlation and Multicollinearity Between Numerical Columns

In [None]:
df[['Ht', 'Wt', 'BMI', 'Product_Info_4', 'Ins_Age', 'Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6']].corr()

In [None]:
plt.scatter(x=df['BMI'], y=df['Wt'])

In [None]:
# can drop Weight column, which is highly correlated with BMI
df.drop(['Wt'], inplace=True, axis = 1)
df

### Reducing Columns - Dimensionality Reduction

In [None]:
df.columns.get_loc("Medical_Keyword_1")

In [None]:
df.columns.get_loc("Medical_Keyword_48")

## Summing all Medical_Keyword Columns into one column
### (Rather than dropping the sparsely populated keyword columns, they are combined into a single count so the information they carry is preserved.)

In [None]:
df['Medical_Keyword']=df.iloc[:,67:114].sum(axis=1)
df

In [None]:
# drop all Medical_Keyword columns
df.drop(df.iloc[:,67:114], inplace=True, axis = 1)
df

In [None]:
sns.set_color_codes()
sns.countplot(x=df['Response'])
print(df['Response'].value_counts())

### Feature Selection Using Mutual Information

In [None]:
# import the mutual_info_classif() 
from sklearn.feature_selection import mutual_info_classif

In [None]:
#function to calculate MI scores
def cal_mi_scores(X, y, random_state=None):
    """Return the mutual-information score of every column of X against y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Target values, passed straight to ``mutual_info_classif``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``mutual_info_classif``. Pass an int to get the same
        scores (and therefore the same threshold-based feature selection)
        on every run; None keeps the estimator's default behaviour.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column label and sorted from
        highest to lowest.
    """
    raw_scores = mutual_info_classif(X, y, random_state=random_state)
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

In [None]:
#function for plotting mi_scores
def viz_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores on the current figure.

    The scores are sorted in ascending order first, so bar position 0 holds
    the lowest score; each bar is labelled with its index entry.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores of Columns")

In [None]:
# Calculate MI scores of every non-target column against the Response column.
mi_scores = cal_mi_scores(df.loc[:,df.columns != 'Response'], df.loc[:,'Response'])

# Plot the MI scores computed above (they come from the full dataframe `df`,
# not from a separate validation split). A tall figure is used for the bar chart.
plt.figure(dpi=100, figsize=(20,50))
viz_mi_scores(mi_scores)

In [None]:
list(zip(mi_scores.index, mi_scores))

## Feature selection using the MI score with threshold of 0.01

In [None]:
# Keep the features whose MI score is above 0.01 and append the target column
selected_features = mi_scores.index[mi_scores > 0.01]
updated_cols = [*selected_features, 'Response']
print(updated_cols)
len(updated_cols)

In [None]:
# .copy() makes df an independent frame instead of a selection taken from the
# previous one, so columns assigned to df later are written unambiguously
# (no chained-assignment / SettingWithCopy situation).
df = df[updated_cols].copy()
df

In [None]:
X = df.loc[:, df.columns != 'Response']
X

### Subtracting 1 from response to convert it into classes 0-7

In [None]:
y = df.loc[:,'Response']-1
y

## Normalization of Data

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform returns a bare array, so both the column labels and the original
# index are passed back in; without index=X.index the frame would get a fresh
# 0..n-1 index and no longer line up with y, which keeps the index of df.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows influence the min/max used for scaling — consider fitting on X_train only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X

In [None]:
from sklearn.model_selection import train_test_split
# random_state fixes the shuffle so every model below is scored on the same rows
# from run to run; stratify=y keeps the class proportions of the 8 response
# classes the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='ovr')
model.fit(X_train, y_train)
pred= model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

disp.plot()

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(pred, y_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

In [None]:
list(zip(X_test.columns, model.coef_[0]))

### Important Features in Logistic Regression Model

In [None]:
# Pair each feature name with its coefficient taken from the first row of model.coef_.
# NOTE(review): only coef_[0] is used; for a multi-class one-vs-rest fit this is
# presumably the coefficient row of a single class, not an overall importance — confirm.
f_i = list(zip(X_test.columns, model.coef_[0]))
# Sort ascending by the signed coefficient value (not by absolute magnitude)
f_i.sort(key = lambda x : x[1])
# Horizontal bars: feature names on the y axis, coefficient values as bar lengths
plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# max_depth needs an explicit value: an empty `max_depth=` is a SyntaxError.
# A shallow tree keeps the graphviz rendering in the next cell readable.
# NOTE(review): 5 is a placeholder depth — tune or cross-validate as needed.
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(X_train,y_train)

In [None]:
from sklearn import tree
import graphviz
dot_data = tree.export_graphviz(dtc, out_file=None,
                      filled=True,feature_names=X_train.columns,class_names=['1','2', '3', '4', '5', '6', '7', '8'], rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  
graph

In [None]:
pred = dtc.predict(X_test)
confusion_matrix(y_test, pred)

In [None]:
pred = dtc.predict(X_test)
confusion_matrix(y_test, pred)

In [None]:
accuracy_score(pred, y_test)

In [None]:
accuracy_score(pred, y_test)

In [None]:
print(classification_report(y_test, pred))

In [None]:
dtc2 = DecisionTreeClassifier(criterion='entropy')
dtc2.fit(X_train,y_train)

In [None]:
pred2 = dtc2.predict(X_test)
confusion_matrix(y_test, pred2)

In [None]:
accuracy_score(pred2, y_test)

In [None]:
print(classification_report(y_test, pred2))

# Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train.values.ravel())

In [None]:
pred3 = rfc.predict(X_test)
confusion_matrix(y_test, pred3)

In [None]:
accuracy_score(pred3, y_test)

In [None]:
print(classification_report(y_test, pred3))

list(zip(X_test.columns,rfc.feature_importances_))

### Important Features in Random Forest Model

In [None]:
# Pair every feature with its random-forest importance, lowest importance first
f_i = sorted(zip(X_test.columns, rfc.feature_importances_), key=lambda pair: pair[1])
feature_names = [name for name, _ in f_i]
importances = [score for _, score in f_i]
# Horizontal bars: feature names on the y axis, importances as bar lengths
plt.barh(feature_names, importances)
plt.show()

# Regressors
# XGBoost

In [None]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

In [None]:
pred = xgb_model.predict(X_test)
pred

In [None]:
import math
# Turn the continuous regression output into class labels: round to the nearest
# integer and clip into the valid 0-7 range. Rounding up with ceil pushed every
# prediction towards the next class and could yield the non-existent class 8.
accuracy_score(y_test, np.clip(np.rint(pred), 0, 7).astype(int))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_test, pred)

In [None]:
r2_score(y_test, pred)

In [None]:
hpg = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
}

In [None]:
from sklearn.model_selection import GridSearchCV
gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = hpg,                        
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)
gsearch.fit(X_train,y_train)
gsearch.best_params_

In [None]:
xgb_model =  XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=500, objective='reg:squarederror')
xgb_model.fit(X_train, y_train)

In [None]:
pred = xgb_model.predict(X_test)
pred

In [None]:
# Turn the continuous regression output into class labels: round to the nearest
# integer and clip into the valid 0-7 range (ceil would bias every prediction upward).
accuracy_score(y_test, np.clip(np.rint(pred), 0, 7).astype(int))

In [None]:
mean_squared_error(y_test, pred)

In [None]:
r2_score(y_test, pred)

# SVM Regression

In [None]:
from sklearn import svm

# Hyper-parameter settings tried for the support-vector regressor, in order.
# The last entry stays bound to `model` / `pred` after the loop, which is what
# the MSE and R2 cells that follow evaluate.
svr_param_sets = [
    {'kernel': 'linear', 'C': 0.1},
    {'kernel': 'linear', 'C': 1},
    {'kernel': 'linear', 'C': 10},
    {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1},
    {'kernel': 'rbf', 'C': 0.1, 'gamma': 1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 0.1},
    {'kernel': 'rbf', 'C': 1, 'gamma': 1},
    {'kernel': 'rbf', 'C': 10, 'gamma': 1},
]

for svr_params in svr_param_sets:
    model = svm.SVR(**svr_params)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Round to the nearest class and clip into the valid 0-7 range; rounding up
    # with ceil biased every prediction towards the next class.
    pred_classes = np.clip(np.rint(pred), 0, 7).astype(int)
    print(svr_params, accuracy_score(y_test, pred_classes))

In [None]:
mean_squared_error(y_test, pred)

In [None]:
r2_score(y_test, pred)

# Converting into binary classification problem

In [None]:
df

In [None]:
# Binary target: 1 where Response equals 8, 0 everywhere else
df['Updated_response'] = (df['Response'] == 8).astype('int64')
df['Updated_response']

In [None]:
df = df.drop(['Response'], axis = 1)

In [None]:
df

In [None]:
# Select features and target by column name rather than by position, fix the seed
# so the split (and every score below) is repeatable, and stratify on the binary
# target so both partitions keep the same share of positive rows.
binary_features = df.drop(columns=['Updated_response'])
binary_target = df['Updated_response']
X_train, X_test, y_train, y_test = train_test_split(
    binary_features, binary_target, test_size=0.2, random_state=42, stratify=binary_target)

## Logistic Regression

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)
pred= model.predict(X_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)

## Decision Tree Classifier

In [None]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
pred= model.predict(X_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)

## RandomForest Classifier

In [None]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred= model.predict(X_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
confusion_matrix(y_test, pred)