In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load clean_leads.csv, using each row's 'Lead Number' as the index.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
leads_csv = "C:/Users/PaulB/OneDrive/Desktop/First Capstone/clean_leads.csv"
leads = pd.read_csv(leads_csv, index_col='Lead Number')

In [3]:
# Preview the first five rows to sanity-check the load.
leads.head(n=5)

Unnamed: 0_level_0,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Search,Newspaper Article,X Education Forums,Newspaper,...,Tags_Still Thinking,Tags_University not recognized,Tags_Want to take admission but has financial problems,Tags_Will revert after reading the email,Tags_in touch with EINS,Tags_invalid number,Tags_number not provided,Tags_opp hangup,Tags_switched off,Tags_wrong number given
Lead Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
660737,0,0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660728,0,0,0,5.0,674,2.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660727,0,0,1,2.0,1532,2.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
660719,0,0,0,1.0,305,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
660681,0,0,1,2.0,1428,1.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [4]:
# Separate the 'Converted' target from the predictors, then partition the rows.
# NOTE(review): test_size=0.8 holds out 80% of the rows for testing, so the
# models below are trained on only 20% of the data. Confirm this is intended
# and not a swapped train/test fraction.
y = leads['Converted']
X = leads.drop(columns='Converted')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    stratify=y,       # stratify the split on the target
    random_state=42,  # fixed seed so the split is reproducible
)

In [5]:
# Baseline: binomial GLM on every predictor plus an intercept column.
# NOTE(review): the recorded summary shows 100 IRLS iterations, an empty
# log-likelihood and coefficients around 1e14, so this fit did not converge to
# anything interpretable. It is useful only as motivation for feature selection.
X_train_full = sm.add_constant(X_train)
logm1 = sm.GLM(y_train, X_train_full, family=sm.families.Binomial())
logm1.fit().summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,1721.0
Model:,GLM,Df Residuals:,1609.0
Model Family:,Binomial,Df Model:,111.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,
Date:,"Sun, 07 Jun 2020",Deviance:,
Time:,17:34:14,Pearson chi2:,5.36e+17
No. Iterations:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.969e+14,2.12e+07,1.87e+07,0.000,3.97e+14,3.97e+14
Do Not Email,-5.861e+14,7.17e+06,-8.18e+07,0.000,-5.86e+14,-5.86e+14
Do Not Call,1.306e+04,0.000,1.13e+08,0.000,1.31e+04,1.31e+04
TotalVisits,-1.827e+13,1.54e+06,-1.18e+07,0.000,-1.83e+13,-1.83e+13
Total Time Spent on Website,2.859e+11,3780.418,7.56e+07,0.000,2.86e+11,2.86e+11
Page Views Per Visit,5.921e+12,1.82e+06,3.25e+06,0.000,5.92e+12,5.92e+12
Search,2.813e+15,4.06e+07,6.92e+07,0.000,2.81e+15,2.81e+15
Newspaper Article,-920.8391,8.07e-06,-1.14e+08,0.000,-920.839,-920.839
X Education Forums,495.2727,4.51e-06,1.1e+08,0.000,495.273,495.273


In [6]:
# Default-configured logistic regression; used below as the base estimator for RFE.
logreg = LogisticRegression()

In [7]:
from sklearn.feature_selection import RFE

# Recursive feature elimination with `logreg` as the base estimator, keeping
# the 20 top-ranked predictors.
# n_features_to_select is passed by keyword: newer scikit-learn releases make
# it keyword-only, so the positional form RFE(logreg, 20) raises a TypeError.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

In [8]:
# One (column name, selected?, elimination rank) tuple per predictor.
[
    (name, selected, rank)
    for name, selected, rank in zip(X_train.columns, rfe.support_, rfe.ranking_)
]

[('Do Not Email', True, 1),
 ('Do Not Call', False, 106),
 ('TotalVisits', False, 73),
 ('Total Time Spent on Website', False, 95),
 ('Page Views Per Visit', False, 86),
 ('Search', False, 5),
 ('Newspaper Article', False, 98),
 ('X Education Forums', False, 97),
 ('Newspaper', False, 96),
 ('Digital Advertisement', False, 105),
 ('Through Recommendations', False, 99),
 ('A free copy of Mastering The Interview', False, 54),
 ('Lead Origin_Landing Page Submission', False, 63),
 ('Lead Origin_Quick Add Form', False, 7),
 ('Lead Source_Direct Traffic', False, 38),
 ('Lead Source_Facebook', False, 55),
 ('Lead Source_Google', False, 40),
 ('Lead Source_Live Chat', False, 109),
 ('Lead Source_NC_EDM', False, 108),
 ('Lead Source_Olark Chat', False, 34),
 ('Lead Source_Organic Search', False, 39),
 ('Lead Source_Pay per Click Ads', False, 120),
 ('Lead Source_Press_Release', False, 117),
 ('Lead Source_Reference', False, 43),
 ('Lead Source_Referral Sites', False, 37),
 ('Lead Source_Social 

In [9]:
# Names of the predictors that RFE kept.
col = X_train.columns[rfe.get_support()]

In [10]:
# Names of the predictors that RFE rejected.
X_train.columns[np.logical_not(rfe.support_)]

Index(['Do Not Call', 'TotalVisits', 'Total Time Spent on Website',
       'Page Views Per Visit', 'Search', 'Newspaper Article',
       'X Education Forums', 'Newspaper', 'Digital Advertisement',
       'Through Recommendations',
       ...
       'Tags_Lost to Others', 'Tags_Recognition issue (DEC approval)',
       'Tags_Shall take in the next coming month', 'Tags_Still Thinking',
       'Tags_University not recognized',
       'Tags_Want to take admission but has financial problems',
       'Tags_in touch with EINS', 'Tags_number not provided',
       'Tags_opp hangup', 'Tags_wrong number given'],
      dtype='object', length=120)

In [11]:
# Refit the binomial GLM on the RFE-selected predictors plus an intercept, so
# coefficient p-values can be inspected.
binomial_family = sm.families.Binomial()
X_train_sm = sm.add_constant(X_train.loc[:, col])
logm2 = sm.GLM(y_train, X_train_sm, family=binomial_family)
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,1721.0
Model:,GLM,Df Residuals:,1700.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-311.86
Date:,"Sun, 07 Jun 2020",Deviance:,623.71
Time:,17:34:16,Pearson chi2:,1780.0
No. Iterations:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.3107,0.172,-13.449,0.000,-2.647,-1.974
Do Not Email,-2.0526,0.473,-4.342,0.000,-2.979,-1.126
Lead Source_Welingak Website,2.9595,1.122,2.637,0.008,0.760,5.159
Last Notable Activity_SMS Sent,2.7774,0.263,10.549,0.000,2.261,3.293
Asymmetrique Activity Score_10.0,-26.9533,7.48e+04,-0.000,1.000,-1.47e+05,1.47e+05
Asymmetrique Activity Score_11.0,-27.0631,8e+04,-0.000,1.000,-1.57e+05,1.57e+05
Asymmetrique Activity Score_13.0,-2.0299,0.542,-3.742,0.000,-3.093,-0.967
Asymmetrique Activity Score_17.0,-24.5368,4.33e+04,-0.001,1.000,-8.48e+04,8.48e+04
What is your current occupation_Unemployed,1.9250,0.233,8.257,0.000,1.468,2.382


In [12]:
# Remove the activity-score dummies whose estimates were degenerate in the
# previous summary (p-value 1.000, standard errors in the tens of thousands).
# Index.drop has no axis parameter: the former positional `1` was being bound
# to `errors`, so it is removed. Missing labels still raise KeyError.
col = col.drop([
    'Asymmetrique Activity Score_10.0',
    'Asymmetrique Activity Score_11.0',
    'Asymmetrique Activity Score_17.0',
])

In [13]:
# Remove six tag dummies before refitting.
# NOTE(review): the recorded summary is truncated, so the reason for dropping
# these is not visible here. Presumably non-significant p-values; confirm.
# Index.drop has no axis parameter: the former positional `1` was being bound
# to `errors`, so it is removed. Missing labels still raise KeyError.
col = col.drop([
    'Tags_Already a student',
    'Tags_Graduation in progress',
    'Tags_Lost to EINS',
    'Tags_Not doing further education',
    'Tags_invalid number',
    'Tags_switched off',
])

In [14]:
# Refit the binomial GLM on the reduced predictor list plus an intercept.
binomial_family = sm.families.Binomial()
X_train_sm = sm.add_constant(X_train.loc[:, col])
logm3 = sm.GLM(y_train, X_train_sm, family=binomial_family)
res = logm3.fit()
res.summary()

0,1,2,3
Dep. Variable:,Converted,No. Observations:,1721.0
Model:,GLM,Df Residuals:,1709.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-471.43
Date:,"Sun, 07 Jun 2020",Deviance:,942.85
Time:,17:34:17,Pearson chi2:,1790.0
No. Iterations:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.0370,0.140,-14.595,0.000,-2.310,-1.763
Do Not Email,-1.4949,0.390,-3.833,0.000,-2.259,-0.730
Lead Source_Welingak Website,3.4917,1.077,3.243,0.001,1.381,5.602
Last Notable Activity_SMS Sent,2.1163,0.205,10.300,0.000,1.714,2.519
Asymmetrique Activity Score_13.0,-1.8323,0.449,-4.078,0.000,-2.713,-0.952
What is your current occupation_Unemployed,1.0333,0.181,5.714,0.000,0.679,1.388
What is your current occupation_Working Professional,2.1899,0.596,3.672,0.000,1.021,3.359
Lead Quality_Worst,-2.0171,0.573,-3.523,0.000,-3.139,-0.895
Tags_Closed by Horizzon,4.2937,0.743,5.780,0.000,2.838,5.750


In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [16]:
# Variance inflation factor for each retained predictor, largest first.
# VIF looks ok: the largest value in the recorded output is 2.9.
# NOTE(review): the design matrix passed here has no constant column, so these
# are VIFs from regressions without an intercept. Confirm that is intended.
selected = X_train[col]
vif = pd.DataFrame({'Columns': selected.columns})
vif['VIF'] = [
    variance_inflation_factor(selected.values, position)
    for position in range(selected.shape[1])
]
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Unnamed: 0,Columns,VIF
4,What is your current occupation_Unemployed,2.9
10,Tags_Will revert after reading the email,2.17
5,What is your current occupation_Working Profes...,1.59
9,Tags_Ringing,1.57
2,Last Notable Activity_SMS Sent,1.48
7,Tags_Closed by Horizzon,1.2
6,Lead Quality_Worst,1.15
8,Tags_Interested in other courses,1.15
3,Asymmetrique Activity Score_13.0,1.11
0,Do Not Email,1.07


In [17]:
# Fit the scikit-learn classifier on the reduced predictor set; this is the
# model used for all predictions below.
model2 = LogisticRegression()
model2.fit(X_train.loc[:, col], y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# Class predictions on the training rows.
predicted_train = model2.predict(X_train.loc[:, col])

In [19]:
# Accuracy on the training rows.
print(accuracy_score(y_train, predicted_train))

0.8832074375363161


In [20]:
# Class predictions for the held-out rows.
predicted = model2.predict(X_test.loc[:, col])
print(predicted)

[1 0 0 ... 0 0 1]


In [21]:
# Per-class probabilities for the held-out rows; column 1 is used below as the
# conversion probability.
probs = model2.predict_proba(X_test.loc[:, col])
print(probs)

[[0.28610612 0.71389388]
 [0.87213933 0.12786067]
 [0.50671408 0.49328592]
 ...
 [0.87213933 0.12786067]
 [0.7268658  0.2731342 ]
 [0.28610612 0.71389388]]


In [22]:
# Held-out accuracy, then ROC AUC computed from the second probability column.
test_accuracy = accuracy_score(y_test, predicted)
test_auc = metrics.roc_auc_score(y_test, probs[:, 1])
print(test_accuracy)
print(test_auc)

0.8927119628339141
0.9458454466309926


In [23]:
# Confusion matrix, then the per-class precision/recall/F1 report, for the
# held-out rows.
test_confusion = metrics.confusion_matrix(y_test, predicted)
test_report = metrics.classification_report(y_test, predicted)
print(test_confusion)
print(test_report)

[[4128  147]
 [ 592 2021]]
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      4275
           1       0.93      0.77      0.85      2613

    accuracy                           0.89      6888
   macro avg       0.90      0.87      0.88      6888
weighted avg       0.90      0.89      0.89      6888



In [24]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training rows with the reduced predictor set.
# NOTE(review): this estimator sets solver='lbfgs' while model2 above was built
# with default arguments, so the two may not be the same model. Confirm.
clf = LogisticRegression(solver='lbfgs')

cv_accuracy = cross_val_score(clf, X_train.loc[:, col], y_train, cv=10)
print('Cross-Validation Accuracy Scores', cv_accuracy)

# Summarise the fold scores as (min, mean, max).
scores = pd.Series(cv_accuracy)
(scores.min(), scores.mean(), scores.max())

Cross-Validation Accuracy Scores [0.9132948  0.83815029 0.9017341  0.86627907 0.86046512 0.90697674
 0.87209302 0.90116279 0.84210526 0.90643275]


(0.838150289017341, 0.8808693946633397, 0.9132947976878613)

In [39]:
# Lead score: the second probability column as a whole-number percentage,
# shown beside the predicted and actual outcome for each held-out lead.
# NOTE(review): astype('int') truncates rather than rounds, so e.g. 12.79
# becomes 12. Confirm truncation is intended.
# NOTE(review): `scores` previously held the cross-validation results and is
# reused here for a different object.
lead_score = (probs[:, 1] * 100).astype('int')
scores = pd.DataFrame({
    'Lead Score': lead_score,
    'Predicted Outcome': predicted,
    'Actual Outcome': y_test,
})
scores.head()

Unnamed: 0_level_0,Lead Score,Predicted Outcome,Actual Outcome
Lead Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614229,71,1,0
608116,12,0,0
630773,49,0,0
640766,0,0,0
642590,12,0,0
