In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
leads = pd.read_csv("C:/Users/PaulB/OneDrive/Desktop/First Capstone/clean_leads.csv", index_col = 'Lead Number')

In [3]:
leads.head()

Unnamed: 0_level_0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Quick Add Form,Lead Source_Blog,Lead Source_Click2call,Lead Source_Direct traffic,...,City_Other Cities,City_Other Cities of Maharashtra,City_Other Metro Cities,City_Thane & Outskirts,City_Tier II Cities,Asymmetrique Activity Index_02.Medium,Asymmetrique Activity Index_03.Low,Asymmetrique Profile Index_02.Medium,Asymmetrique Profile Index_03.Low,A free copy of Mastering The Interview_Yes
Lead Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
660728,0,5.0,674,2.5,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
660727,1,2.0,1532,2.0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
660719,0,1.0,305,1.0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
660681,1,2.0,1428,1.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
660673,1,2.0,1640,2.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
# Experimenting with removing all nonsignficant variables to see if that made a difference
#leads = leads.drop(['Last Activity_Form Submitted on Website','Specialization_Services Excellence', 
#                    'What is your current occupation_Student', 'Last Activity_Had a Phone Conversation',
#                    'Lead Origin_Lead Add Form', 'Last Activity_Email Bounced','Lead Source_Welingak website',
#                    'City_Tier II Cities','Lead Source_Olark chat','What is your current occupation_Unemployed',
#                   'Last Activity_Page Visited on Website'], axis =1)

In [5]:
#Splitting training and testing data
X= leads.drop('Converted', axis = 1)
y = leads['Converted']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.8, random_state = 42)

In [6]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family=sm.families.Binomial())
logm1.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,1076.0
Model:,GLM,Df Residuals:,1047.0
Model Family:,Binomial,Df Model:,28.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-429.81
Date:,"Thu, 28 May 2020",Deviance:,859.63
Time:,18:26:54,Pearson chi2:,1010.0
No. Iterations:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.5122,1.207,-3.738,0.000,-6.878,-2.146
TotalVisits,0.2225,0.111,2.007,0.045,0.005,0.440
Total Time Spent on Website,0.0021,0.000,12.788,0.000,0.002,0.002
Page Views Per Visit,-0.0678,0.112,-0.606,0.544,-0.287,0.151
Lead Origin_Landing Page Submission,-0.6907,0.285,-2.425,0.015,-1.249,-0.132
Lead Origin_Lead Add Form,2.7885,1.332,2.093,0.036,0.177,5.400
Lead Origin_Quick Add Form,-1.424e-12,5.73e-08,-2.49e-05,1.000,-1.12e-07,1.12e-07
Lead Source_Blog,7.003e-11,1.59e-07,0.000,1.000,-3.11e-07,3.11e-07
Lead Source_Click2call,-6.967e-11,1.51e-07,-0.000,1.000,-2.95e-07,2.95e-07


In [7]:
logreg = LogisticRegression()

In [8]:
from sklearn.feature_selection import RFE
rfe = RFE(logreg, 20)
rfe = rfe.fit(X_train,y_train)

In [9]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('TotalVisits', True, 1),
 ('Total Time Spent on Website', False, 8),
 ('Page Views Per Visit', False, 7),
 ('Lead Origin_Landing Page Submission', True, 1),
 ('Lead Origin_Lead Add Form', True, 1),
 ('Lead Origin_Quick Add Form', False, 18),
 ('Lead Source_Blog', False, 17),
 ('Lead Source_Click2call', False, 16),
 ('Lead Source_Direct traffic', True, 1),
 ('Lead Source_Google', False, 2),
 ('Lead Source_NaN', False, 21),
 ('Lead Source_Olark chat', True, 1),
 ('Lead Source_Organic search', True, 1),
 ('Lead Source_Pay per click ads', False, 15),
 ('Lead Source_Referral sites', True, 1),
 ('Lead Source_Social media', False, 12),
 ('Lead Source_Testone', False, 11),
 ('Lead Source_Welearn', False, 19),
 ('Lead Source_Welearnblog_home', False, 10),
 ('Lead Source_Welingak website', True, 1),
 ('Lead Source_Youtubechannel', False, 4),
 ('Do Not Email_Yes', True, 1),
 ('Country_Other', True, 1),
 ('Search_Yes', False, 20),
 ('Digital Advertisement_Yes', False, 13),
 ('Through Recommendat

In [10]:
col = X_train.columns[rfe.support_]

In [11]:
X_train.columns[~rfe.support_] 

Index(['Total Time Spent on Website', 'Page Views Per Visit',
       'Lead Origin_Quick Add Form', 'Lead Source_Blog',
       'Lead Source_Click2call', 'Lead Source_Google', 'Lead Source_NaN',
       'Lead Source_Pay per click ads', 'Lead Source_Social media',
       'Lead Source_Testone', 'Lead Source_Welearn',
       'Lead Source_Welearnblog_home', 'Lead Source_Youtubechannel',
       'Search_Yes', 'Digital Advertisement_Yes',
       'Through Recommendations_Yes', 'City_Other Cities',
       'City_Other Metro Cities', 'Asymmetrique Activity Index_02.Medium',
       'A free copy of Mastering The Interview_Yes'],
      dtype='object')

In [12]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,1076.0
Model:,GLM,Df Residuals:,1055.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-532.23
Date:,"Thu, 28 May 2020",Deviance:,1064.5
Time:,18:26:54,Pearson chi2:,1350.0
No. Iterations:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.8135,0.274,-2.965,0.003,-1.351,-0.276
TotalVisits,0.1218,0.069,1.756,0.079,-0.014,0.258
Lead Origin_Landing Page Submission,-0.6376,0.216,-2.953,0.003,-1.061,-0.214
Lead Origin_Lead Add Form,2.1616,0.867,2.493,0.013,0.462,3.861
Lead Source_Direct traffic,0.0956,0.190,0.504,0.614,-0.276,0.467
Lead Source_Olark chat,-0.4781,0.514,-0.930,0.352,-1.485,0.529
Lead Source_Organic search,0.3173,0.239,1.327,0.185,-0.151,0.786
Lead Source_Referral sites,-0.9475,0.651,-1.456,0.145,-2.223,0.328
Lead Source_Welingak website,20.0023,1.68e+04,0.001,0.999,-3.3e+04,3.3e+04


In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [14]:
#VIF looks ok
vif = pd.DataFrame()
vif['Columns'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Unnamed: 0,Columns,VIF
1,Lead Origin_Landing Page Submission,5.31
0,TotalVisits,4.83
3,Lead Source_Direct traffic,2.38
18,Asymmetrique Profile Index_02.Medium,1.54
2,Lead Origin_Lead Add Form,1.5
9,Country_Other,1.44
11,Lead Quality_Might be,1.37
5,Lead Source_Organic search,1.32
12,Lead Quality_Not Sure,1.28
15,City_Thane & Outskirts,1.2


In [15]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
predicted = model2.predict(X_test)
print(predicted)

[0 1 0 ... 0 0 1]


In [17]:
probs = model2.predict_proba(X_test)
print(probs)

[[0.93284354 0.06715646]
 [0.12959297 0.87040703]
 [0.84097015 0.15902985]
 ...
 [0.80812562 0.19187438]
 [0.6413846  0.3586154 ]
 [0.20176103 0.79823897]]


In [18]:
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, probs[:, 1]))

0.8015320334261838
0.8724860144232286


In [19]:
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[2389  366]
 [ 489 1064]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      2755
           1       0.74      0.69      0.71      1553

    accuracy                           0.80      4308
   macro avg       0.79      0.78      0.78      4308
weighted avg       0.80      0.80      0.80      4308

