## Logit models with policies sold as dependent variable
This is version 1, where we use the full sample (N = 10,000) and also include `click` as an explanatory variable. I treat `rank` as a continuous variable and all other explanatory variables as categorical.  

In [85]:
## For data handling
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns
## For model fitting: `smf` is the formula interface used for the logit models
## below, and Stargazer renders several fitted models side by side in one table.
## NOTE(review): `sm` is not referenced in the cells visible here -- confirm it is
## used elsewhere before removing it.
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer


## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [86]:
# Read the raw Root Insurance sample into a single frame.
# The categorical columns are kept as-is (no dummy columns are generated here,
# unlike the classification tree example); the model formulas further down
# wrap them in C(...) instead.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was written into the CSV -- it is not used by the models in this notebook.
DATA_PATH = '../0-data/Root_Insurance_data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows
df.head()

Unnamed: 0,Currently Insured,Number of Vehicles,Number of Drivers,Marital Status,bid,rank,click,policies_sold
0,unknown,2,1,M,10.0,2,False,0
1,Y,3,1,M,10.0,5,False,0
2,Y,3,2,M,10.0,5,False,0
3,N,3,2,S,10.0,4,False,0
4,unknown,2,2,S,10.0,2,False,0


In [87]:
# The statsmodels formula interface lets each model be written as an equation.
# C(...) marks a variable as categorical and Q("...") quotes column names that
# contain spaces. All five models use the full sample and include `click`.
#
# Reading the interaction models: in a formula, `a:b` adds ONLY the interaction
# term (here: one `rank` slope per level of the categorical variable); it does
# not add the main effects of `a` or `b` the way `a*b` would.
#
# NOTE(review): with the default iteration cap of Logit.fit(), several of these
# fits stop at the cap instead of converging. A likely cause is that `click`
# (quasi-)perfectly separates the outcome -- presumably no policy is sold without
# a click; confirm with pd.crosstab(df["click"], df["policies_sold"]).
# Estimates from a non-converged fit should not be interpreted, so this cell
# reports them explicitly instead of passing them to the table unnoticed.

# Model 1: No frills model with all the explanatory variables
policies_model_1=smf.logit(formula='policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))+rank+click', data=df)


# Model 2: Interacting marital status with rank
policies_model_2=smf.logit(formula='policies_sold~ C(Q("Marital Status")):rank+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))+click', data=df)

# Model 3: Interacting number of vehicles with rank
policies_model_3=smf.logit(formula='policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles")):rank+C(Q("Number of Drivers"))+C(Q("Currently Insured"))+click', data=df)

# Model 4: Interacting number of drivers with rank
policies_model_4=smf.logit(formula='policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers")):rank+C(Q("Currently Insured"))+click', data=df)

# Model 5: Interacting currently insured with rank
policies_model_5=smf.logit(formula='policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured")):rank+click', data=df)

# Fit every model exactly once, in table order, and keep the results so the
# convergence status can be inspected before the table is built.
policies_results_full = [
    model.fit()
    for model in (policies_model_1, policies_model_2, policies_model_3,
                  policies_model_4, policies_model_5)
]

# `mle_retvals` is the optimizer summary that fit() stores on the result;
# its "converged" flag is False when the iteration cap was reached.
policies_not_converged = [
    f"({column})"
    for column, result in enumerate(policies_results_full, start=1)
    if not result.mle_retvals.get("converged", True)
]
if policies_not_converged:
    print(
        "WARNING: the optimizer did not converge for model column(s) "
        + ", ".join(policies_not_converged)
        + "; the coefficients in those columns are not reliable."
    )

Stargazer(policies_results_full)


         Current function value: 0.125198
         Iterations: 35
         Current function value: 0.125181
         Iterations: 35
         Current function value: 0.125589
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.125396
         Iterations 30
         Current function value: 0.125000
         Iterations: 35


0,1,2,3,4,5
,,,,,
,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
"C(Q(""Currently Insured""))[N]:rank",,,,,0.104**
,,,,,(0.053)
"C(Q(""Currently Insured""))[T.Y]",-0.606***,-0.608***,-0.592***,-0.592***,
,(0.149),(0.149),(0.149),(0.148),
"C(Q(""Currently Insured""))[T.unknown]",0.052,0.051,-0.016,0.086,


In [88]:
# Same five specifications as the full-sample cell, written with the statsmodels
# formula interface, but estimated only on the rows where the listing was
# clicked -- so `click` is no longer part of the right-hand side.
df_clicks = df.loc[df["click"].eq(True)]

# One formula per table column, in the order (1)..(5).
clicked_formulas = (
    # Model 1: No frills model with all the explanatory variables
    'policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))+rank',
    # Model 2: Interacting marital status with rank
    'policies_sold~ C(Q("Marital Status")):rank+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))',
    # Model 3: Interacting number of vehicles with rank
    'policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles")):rank+C(Q("Number of Drivers"))+C(Q("Currently Insured"))',
    # Model 4: Interacting number of drivers with rank
    'policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers")):rank+C(Q("Currently Insured"))',
    # Model 5: Interacting currently insured with rank
    'policies_sold~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured")):rank',
)

# Build the (unfitted) models; the names match the full-sample cell and
# therefore replace the models defined there.
(policies_model_1,
 policies_model_2,
 policies_model_3,
 policies_model_4,
 policies_model_5) = (smf.logit(formula=spec, data=df_clicks) for spec in clicked_formulas)

# Fit in table order, then show all five fits side by side.
clicked_fits = []
for clicked_model in (policies_model_1, policies_model_2, policies_model_3,
                      policies_model_4, policies_model_5):
    clicked_fits.append(clicked_model.fit())

Stargazer(clicked_fits)


Optimization terminated successfully.
         Current function value: 0.666656
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.666565
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.668736
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.667711
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665600
         Iterations 5


0,1,2,3,4,5
,,,,,
,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold,Dependent variable:policies_sold
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
"C(Q(""Currently Insured""))[N]:rank",,,,,0.104**
,,,,,(0.053)
"C(Q(""Currently Insured""))[T.Y]",-0.606***,-0.608***,-0.592***,-0.592***,
,(0.149),(0.149),(0.149),(0.148),
"C(Q(""Currently Insured""))[T.unknown]",0.052,0.051,-0.016,0.086,


## Logit models with clicks as dependent variable

In [90]:
# The statsmodels formula interface lets each model be written as an equation.
# C(...) marks a variable as categorical and Q("...") quotes column names that
# contain spaces.

# Build a numeric 0/1 outcome from the boolean `click` column.
# A boolean left-hand side would be treated as categorical by the formula
# machinery, so the outcome has to be an integer column. The previous
# pd.get_dummies(df['click'])[True] returned integers only on older pandas
# (newer versions return booleans) and raised KeyError when no True was present.
# .eq(True) yields the same 0/1 values, including 0 for missing entries.
df['click_true'] = df['click'].eq(True).astype(int)

# Model 1: No frills model with all the explanatory variables
click_model_1=smf.logit(formula='click_true~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))+rank', data=df)


# Model 2: Interacting marital status with rank
# (`a:b` adds only the interaction, i.e. one rank slope per level, no main effects)
click_model_2=smf.logit(formula='click_true~ C(Q("Marital Status")):rank+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured"))', data=df)

# Model 3: Interacting number of vehicles with rank
click_model_3=smf.logit(formula='click_true~ C(Q("Marital Status"))+ C(Q("Number of Vehicles")):rank+C(Q("Number of Drivers"))+C(Q("Currently Insured"))', data=df)

# Model 4: Interacting number of drivers with rank
click_model_4=smf.logit(formula='click_true~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers")):rank+C(Q("Currently Insured"))', data=df)

# Model 5: Interacting currently insured with rank
click_model_5=smf.logit(formula='click_true~ C(Q("Marital Status"))+ C(Q("Number of Vehicles"))+C(Q("Number of Drivers"))+C(Q("Currently Insured")):rank', data=df)

# Fit each model once (in table order) and show the five fits side by side.
Stargazer([click_model_1.fit(), click_model_2.fit(), click_model_3.fit(), click_model_4.fit(), click_model_5.fit()])


Optimization terminated successfully.
         Current function value: 0.389531
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.389734
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.389574
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.389528
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.389537
         Iterations 7


0,1,2,3,4,5
,,,,,
,Dependent variable:click_true,Dependent variable:click_true,Dependent variable:click_true,Dependent variable:click_true,Dependent variable:click_true
,,,,,
,(1),(2),(3),(4),(5)
,,,,,
"C(Q(""Currently Insured""))[N]:rank",,,,,-0.922***
,,,,,(0.030)
"C(Q(""Currently Insured""))[T.Y]",0.111,0.113,0.117,0.112,
,(0.082),(0.081),(0.081),(0.081),
"C(Q(""Currently Insured""))[T.unknown]",0.067,0.062,0.053,0.069,
