# Root Insurance Erdos Data Camp Project

## Import packages 

In [None]:
# Import the package and data
import pandas as pd
import numpy as np 

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Set the seaborn plot style:
## a grid drawn on a white background
sns.set_style("whitegrid")

## 1. Summary statistics

In [None]:
## load the raw data and preview the first rows
data_path = '../0-data/Root_Insurance_data.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
## check the pairwise correlations
## Only numeric/boolean columns can be correlated. The string-valued columns
## ('Currently Insured', 'Marital Status') are left out explicitly, because
## DataFrame.corr() no longer drops them silently on pandas >= 2.0.
df.select_dtypes(include=["number", "bool"]).corr()

Descriptive statistics of the other variables, grouped by rank.

In [None]:
## summary statistics of all other variables based on Rank
## Aggregations are given by name ("min", "max", "sum") rather than as the
## Python builtins: passing builtins to .agg is deprecated in recent pandas,
## and the string names produce the same column labels.
df.groupby(['rank']).agg(
{'Currently Insured':"count",
 'Number of Vehicles':["min","max","sum"],
 'Number of Drivers':["min","max","sum"],
 'Marital Status':"count",
 'click':"count",
 'policies_sold':"sum"
}
)

In [None]:
## summary statistics of all other variables based on Rank and Click
## Aggregations are given by name ("min", "max", "sum") rather than as the
## Python builtins: passing builtins to .agg is deprecated in recent pandas,
## and the string names produce the same column labels.
df.groupby(['rank','click']).agg(
{'click':"sum",
 'Currently Insured':"count",
 'Number of Vehicles':["min","max","sum"],
 'Number of Drivers':["min","max","sum"],
 'Marital Status':"count",
 'policies_sold':"sum"
}
)

# any thoughts here? feel free to add yours 
# company can sell the policy only if the ad is clicked 
# increase the rank based on attributes  

In [None]:
## calculate click-through-rate (CTR)
### CTR is the number of clicks divided by the total number of observations.
### normalize=True turns the counts into proportions, so the share printed
### for the True value of `click` is the CTR.
### (The previous positional argument "True" only worked because a non-empty
### string is truthy; the keyword states the intent explicitly.)
ctr = df.click.value_counts(normalize=True)
print(ctr)

## 2. Simple logistic regression

In [None]:
## import the package
import statsmodels.api as sm

In [None]:
## Data clean
## Currently Insured: categorical (unknown, N, Y)
## Number of Vehicles: ordinal
## Number of Drivers: ordinal
## Marital Status: categorical (M, S)
## rank: ordinal

## There are two ways to run regressions when a dataset has categorical variables:
## 1. Drop the categorical variables.
## 2. Keep them, encoded as dummy variables.
## Since the dataset has few features, we keep them and create dummies.

## Create dummies for Currently Insured and Marital Status.
## 1. For Currently Insured, "unknown" is the baseline, so only Insured_N and Insured_Y are added.
## 2. For Marital Status, "S" is the baseline, so only Married is added.
## The remaining variables are treated as ordinal.

## dtype=int: pandas >= 2.0 returns boolean dummies by default; integer 0/1
## columns keep the design matrix purely numeric for the regressions below.
insured_dummies = pd.get_dummies(df['Currently Insured'], dtype=int)
df['Insured_N'] = insured_dummies['N']
df['Insured_Y'] = insured_dummies['Y']

df['Married'] = pd.get_dummies(df['Marital Status'], dtype=int)['M']

## 0/1 indicator of whether the ad was clicked
df['click_true'] = pd.get_dummies(df['click'], dtype=int)[True]

df.head()


### 2.1. click 

#### 2.1.1 click without interactions

In [None]:
## build the design matrix (with intercept) and the response

predictors = ['Insured_N', 'Insured_Y', 'Number of Vehicles', 'Number of Drivers', 'Married','rank']
X = sm.add_constant(df[predictors].copy())
y = df.loc[:, 'click_true'].copy()

## set up the logit model and estimate it with Newton's method
model = sm.Logit(endog=y, exog=X)
result = model.fit(method='newton')


In [None]:
## display the summary table of the fitted logit model for `click_true`
result.summary()

#### 2.1.2 click with interactions

### 2.2. policies_sold

#### 2.2.1. policies_sold without interactions

In [None]:
## 2.2 switch the response to "policies_sold"; the design matrix X is reused as is
y = df.loc[:, 'policies_sold'].copy()

## set up the logit model and estimate it with Newton's method
model = sm.Logit(endog=y, exog=X)
result = model.fit(method='newton')



In [None]:
## outcome variable: "policies_sold"
y = df['policies_sold'].copy()

## define and fit the model
model = sm.Logit(y, X)
## the closing parenthesis was missing here, which made the cell a SyntaxError
result = model.fit(method='newton')

## result summary (mirrors the summary cell of section 2.1.1)
result.summary()

#### 2.2.2 policies_sold with interactions

## 3. Classification

### 3.1 Logistic regression

In [None]:
## 3.1 Logistic regression
## import the package 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Accuracy
from sklearn.metrics import accuracy_score
## Precision
from sklearn.metrics import precision_score
## Recall
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
## hold out 20% of the rows as a test set, stratified on the outcome
split_options = dict(test_size=.2, shuffle=True, random_state=614, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
## fit a scikit-learn logistic regression (default settings) on the training split
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

In [None]:
def get_acc(y_pred,y_actual):
    """Return the fraction of predictions that equal the actual labels.

    Both inputs are converted to NumPy arrays before comparing, so the
    comparison is always element-wise and positional. This makes plain
    lists/tuples work (they would otherwise be compared as whole objects)
    and ignores any pandas index on either input.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_actual : array-like
        True labels, same shape as ``y_pred``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_actual)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    pred = np.asarray(y_pred)
    actual = np.asarray(y_actual)
    if pred.shape != actual.shape:
        raise ValueError(
            f"y_pred and y_actual must have the same shape, "
            f"got {pred.shape} and {actual.shape}"
        )
    return np.sum(pred == actual)/len(actual)

In [None]:
## cross validation
## StratifiedKFold and clone are not imported in the cells above,
## so they are imported here to keep this cell runnable on its own.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

n_splits = 5
kfold = StratifiedKFold(n_splits,shuffle=True,random_state=440)


## cut-offs: candidate probability thresholds 0.00, 0.01, ..., 0.99
cutoffs = np.arange(0,1,.01)

## accs[i, j] is the accuracy on fold i when using cutoffs[j]
accs = np.zeros((n_splits,len(cutoffs)))
log_reg = LogisticRegression()

for i,(train_index,test_index) in enumerate(kfold.split(X_train,y_train)):
    X_train_train,X_train_test = X_train.iloc[train_index],X_train.iloc[test_index]
    y_train_train,y_train_test = y_train.iloc[train_index],y_train.iloc[test_index]

    ## The fitted model does not depend on the cutoff, so fit and score
    ## once per fold instead of once per (fold, cutoff) pair.
    log_reg_clone = clone(log_reg)
    log_reg_clone.fit(X_train_train,y_train_train)
    probs = log_reg_clone.predict_proba(X_train_test)[:,1]

    for j,cutoff in enumerate(cutoffs):
        ## predict class 1 when the predicted probability exceeds the cutoff
        y_pred = 1*(probs > cutoff)
        accs[i,j] = get_acc(y_pred,y_train_test)

In [None]:
## plot the mean cross-validation accuracy against the probability cutoff

## average the per-fold accuracies for each cutoff
mean_cv_acc = accs.mean(axis=0)

plt.figure(figsize=(10,8))
plt.plot(cutoffs, mean_cv_acc)

plt.xlabel("Cutoff", fontsize=16)
plt.ylabel("Mean CV Accuracy", fontsize=16)
plt.show()

In [None]:
## report the cutoff whose accuracy, averaged over the folds, is largest
mean_cv_acc = np.mean(accs, axis=0)
best_cutoff = cutoffs[np.argmax(mean_cv_acc)]
print("The cutoff with highest mean CV accuracy was", best_cutoff)