### Importing the necessary packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.set_option('display.max_rows',150)
pd.set_option('display.max_columns',150)

### Reading the data

In [None]:
lead = pd.read_csv('../input/leadscore/Leads.csv')
lead.head()

### Shape of the dataframe

In [None]:
lead.shape

### Dataframe Information

In [None]:
lead.info()

### Treating incorrect datatype

In [None]:
# 'Converted' is a binary categorical variable but with datatype as 'int64'
lead['Converted'] = lead['Converted'].astype('category')

In [None]:
lead['Converted'].dtype

### Checking for the missing values

#### A few categorical features have a label - 'Select', which is the default option of selecting the value of a feature and this means a lead has not selected any value; and hence we are good to treat this as a missing value

In [None]:
# Replacing 'Select' with NaN value
lead.replace({'Select' : np.nan},inplace=True)

In [None]:
# Validating whether all the 'Select' values got replaced or not
res = lead.isin(['Select']).any().any()
print(res)

In [None]:
# Checking the percentage of missing values
round(100*(lead.isnull().sum()/len(lead.index)), 2).sort_values(ascending=False)

In [None]:
# Keep only the columns whose percentage of missing values is below the threshold (45%)
max_null_percentage = 45
null_pct_by_col = lead.isnull().sum() * 100 / len(lead)
cols_to_keep = null_pct_by_col[null_pct_by_col < max_null_percentage].index
lead = lead.loc[:, cols_to_keep]
lead.shape

In [None]:
# Let us analyse the 'Country' column

lead['Country'].value_counts(normalize=True)*100

In [None]:
lead.drop(['Country'],axis=1, inplace=True)

In [None]:
# Now let us treat 'City'

lead['City'].value_counts(normalize=True)*100

In [None]:
# Replacing all the missing values for 'City' with 'Other Cities'.
# The filled column is assigned back to `lead`: calling fillna(inplace=True) on the
# selection `lead['City']` acts on an intermediate object and is not guaranteed to
# update `lead` itself (it is a silent no-op under pandas Copy-on-Write).

lead['City'] = lead['City'].fillna('Other Cities')
lead['City'].value_counts(normalize=True)*100

In [None]:
# 'Specialization' has around 36.5% of missing values.

lead['Specialization'].value_counts(normalize=True)*100

In [None]:
# Missing 'Specialization' becomes its own level, 'No Specialization'.
# Assigned back rather than filled in place on the column selection, because the
# chained in-place form is not guaranteed to modify `lead`.
lead['Specialization'] = lead['Specialization'].fillna('No Specialization')
lead['Specialization'].value_counts(normalize=True)*100

In [None]:
# The column 'Tags' has a little over 36% missing values.

lead['Tags'].nunique()

In [None]:
# Dropping the column 'Tags'

lead.drop(['Tags'],axis=1,inplace=True)
lead.shape

In [None]:
# Missing Values for 'What matters most to you in choosing a course'
lead['What matters most to you in choosing a course'].value_counts(normalize=True)*100

In [None]:
# Dropping the column 'What matters most to you in choosing a course'

lead.drop(['What matters most to you in choosing a course'],axis=1,inplace=True)
lead.shape

In [None]:
# Missing Values for 'What is your current occupation'
lead['What is your current occupation'].value_counts(normalize=True)*100

In [None]:
# Missing occupation becomes its own level, 'Unknown'.
# Assigned back rather than filled in place on the column selection, because the
# chained in-place form is not guaranteed to modify `lead`.
lead['What is your current occupation'] = lead['What is your current occupation'].fillna('Unknown')
lead['What is your current occupation'].value_counts(normalize=True)*100

In [None]:
# Let us again check the percentage of missing values in the dataset

round(100*(lead.isnull().sum()/len(lead.index)), 2).sort_values(ascending=False)

In [None]:
lead = lead.dropna(axis=0, subset=['TotalVisits','Page Views Per Visit','Last Activity','Lead Source'])

In [None]:
# Let us re-check the percentage of missing values in the dataset

round(100*(lead.isnull().sum()/len(lead.index)), 2).sort_values(ascending=False)

In [None]:
lead.shape #checking shape again

### Incorrect Label Treatment for the categorical columns

In [None]:
columns = lead.dtypes[lead.dtypes == 'object'].index.values
for col in columns : 
    print('Levels in ',col,' are ' , lead[col].unique(),'\n\n')

In [None]:
# Replacing 'google' with 'Google' in 'Lead Source'
lead['Lead Source'] = lead['Lead Source'].str.replace('google','Google')

In [None]:
# Dropping columns having only one label
col_label_drop = ['Magazine','Receive More Updates About Our Courses','Update me on Supply Chain Content','Get updates on DM Content',
                 'I agree to pay the amount through cheque']
lead.drop(columns=col_label_drop,axis=1,inplace=True)

In [None]:
lead.shape

In [None]:
lead['Last Activity'].value_counts(normalize=True)*100

In [None]:
lead['Last Notable Activity'].value_counts(normalize=True)*100

In [None]:
# Dropping the column - 'Last Notable Activity'

lead.drop(columns = ['Last Notable Activity'], inplace=True)

In [None]:
lead.shape

In [None]:
lead['Newspaper'].value_counts(normalize=True)*100

In [None]:
lead['Newspaper Article'].value_counts(normalize=True)*100

In [None]:
# Dropping the column - 'Newspaper Article'

lead.drop(columns = ['Newspaper Article'], inplace=True)

In [None]:
lead.shape

In [None]:
# Let us check the labels for 'Lead Origin'

lead['Lead Origin'].value_counts(normalize=True)*100

In [None]:
# Labeling those lead origins having very low lead counts (under 7% of leads) as 'Others'

lead_origin = lead['Lead Origin'].value_counts(normalize=True)
low_lead_origin = lead_origin[lead_origin < 0.07].index

# Write the relabelled column back explicitly; replace(..., inplace=True) on the
# selection `lead['Lead Origin']` is not guaranteed to update `lead`.
lead['Lead Origin'] = lead['Lead Origin'].mask(lead['Lead Origin'].isin(low_lead_origin), 'Others')
lead['Lead Origin'].value_counts(normalize=True)*100

In [None]:
# Let us check the labels for 'Lead Source'

lead['Lead Source'].value_counts(normalize=True)*100

In [None]:
# Lead sources accounting for less than 10% of the leads are pooled into 'Others'
lead_source = lead['Lead Source'].value_counts(normalize=True)
low_lead_source = lead_source[lead_source < 0.1].index

# Write the relabelled column back explicitly; replace(..., inplace=True) on the
# selection `lead['Lead Source']` is not guaranteed to update `lead`.
lead['Lead Source'] = lead['Lead Source'].mask(lead['Lead Source'].isin(low_lead_source), 'Others')
lead['Lead Source'].value_counts(normalize=True)*100

In [None]:
# Checking the labels for 'Last Activity'

lead['Last Activity'].value_counts(normalize=True)*100

In [None]:
# Last activities accounting for less than 10% of the leads are pooled into 'Others'
lead_activity = lead['Last Activity'].value_counts(normalize=True)
low_lead_activity = lead_activity[lead_activity < 0.1].index

# Write the relabelled column back explicitly; replace(..., inplace=True) on the
# selection `lead['Last Activity']` is not guaranteed to update `lead`.
lead['Last Activity'] = lead['Last Activity'].mask(lead['Last Activity'].isin(low_lead_activity), 'Others')
lead['Last Activity'].value_counts(normalize=True)*100

In [None]:
# Checking the labels for 'Specialization'

lead['Specialization'].value_counts(normalize=True)*100

In [None]:
# Specializations accounting for less than 5% of the leads are pooled into 'Others'
lead_spec = lead['Specialization'].value_counts(normalize=True)
low_lead_spec = lead_spec[lead_spec < 0.05].index

# Write the relabelled column back explicitly; replace(..., inplace=True) on the
# selection `lead['Specialization']` is not guaranteed to update `lead`.
lead['Specialization'] = lead['Specialization'].mask(lead['Specialization'].isin(low_lead_spec), 'Others')
lead['Specialization'].value_counts(normalize=True)*100

### Outlier Analysis

In [None]:
# Checking for the outliers for the continuous variables at 25%, 50%, 75%, 90%, 95% and 99%

lead[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']].describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:
# Let us cap the outliers at the 99th percentile.
# Every column is capped against its OWN 99th percentile. (Previously the mask built
# for 'TotalVisits' was reused for the other two columns, so their values were
# overwritten on the wrong rows and their own outliers were left uncapped.)

for capped_col in ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']:
    # clip() leaves values at or below the cap untouched and lowers the rest to the cap
    lead[capped_col] = lead[capped_col].clip(upper=lead[capped_col].quantile(0.99))

In [None]:
# Re-checking for the outliers for the continuous variables at 25%, 50%, 75%, 90%, 95% and 99%

lead[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']].describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:
lead.shape

#### Total % of records retained = (Total no. of records after data cleaning / Total no. of records in the original data) * 100
                               = (9074 / 9240) * 100
                               = 98.2 %

### Checking the balancing nature of the data

In [None]:
lead['Converted'].value_counts(normalize=True)*100

In [None]:
# Dividing the dataset w.r.t. successful or unsuccessful convertion of leads
conv = lead[lead['Converted']==1]
unconv = lead[lead['Converted']==0]

### Analysis of the features

In [None]:
def plot_bar(column_name, title_name, df1=lead, df2=conv, figsize=(7,4), kind='bar', normalize_df_appl=False, k=1):
    """Draw two side-by-side charts describing one categorical column.

    Left panel:  value counts of `column_name` in `df1` (normalised when
                 `normalize_df_appl` is True), multiplied by `k`.
    Right panel: percentage share of each level of `column_name` in `df2`.

    `title_name` is the label inserted into both panel titles, `kind` is the
    pandas plot kind and `figsize` the size of the whole figure.  The default
    frames are the `lead` / `conv` objects that exist when this function is
    defined.
    """
    fig, (ax_total, ax_converted) = plt.subplots(1, 2, figsize=figsize)

    total_counts = df1[column_name].value_counts(normalize=normalize_df_appl) * k
    total_counts.plot(kind=kind, ax=ax_total)
    ax_total.set_title('Total Leads by {0}'.format(title_name), fontsize=12)

    converted_share = df2[column_name].value_counts(normalize=True) * 100
    converted_share.plot(kind=kind, ax=ax_converted)
    ax_converted.set_title('Converted by {0} (%)'.format(title_name), fontsize=12)

    plt.show()

In [None]:
# Univariate Analysis for 'Lead Origin'

plot_bar('Lead Origin','Lead Origin')

In [None]:
# Univariate Analysis for 'Lead Source'

plot_bar('Lead Source','Lead Source')

In [None]:
# Univariate Analysis for 'Do Not Email'

plot_bar('Do Not Email','Do Not Email')

In [None]:
# Univariate Analysis for 'Last Activity'

plot_bar('Last Activity','Last Activity')

In [None]:
# Univariate Analysis for 'Specialization'

plot_bar('Specialization','Specialization')

In [None]:
# Univariate Analysis for 'What is your current occupation'

plot_bar('What is your current occupation','current occupation')

In [None]:
# Univariate Analysis for 'A free copy of Mastering The Interview'

plot_bar('A free copy of Mastering The Interview','Mastering The Interview')

In [None]:
# Defining function for continuous variable univariate analysis using a distribution (density) plot
def plot_distplot(column_name, title_name):
    """Overlay the density of `column_name` for all leads and for converted leads.

    Reads the notebook-level frames `lead` (all leads) and `conv` (converted
    leads).  `title_name` is the label inserted into the figure title.

    Uses `sns.kdeplot` instead of the deprecated `sns.distplot(hist=False)` and
    requests the legend explicitly so the 'All leads' / 'Converted' labels are
    actually shown on the figure.
    """
    plt.figure(figsize=(15,5))
    plt.title('{0} - All leads vs Converted'.format(title_name), fontsize=12)
    sns.kdeplot(lead[column_name].dropna(), label='All leads')
    sns.kdeplot(conv[column_name].dropna(), label='Converted')
    plt.xlabel(column_name)
    plt.legend()
    plt.show()

In [None]:
# Univariate Analysis for 'TotalVisits'
plot_distplot('TotalVisits','TotalVisits')

In [None]:
# Univariate Analysis for 'Total Time Spent on Website'
plot_distplot('Total Time Spent on Website','Total Time Spent on Website')

In [None]:
# Univariate Analysis for 'Page Views Per Visit '
plot_distplot('Page Views Per Visit','Page Views Per Visit')

## Data Preparation

### Variable Mapping & Creating Dummy Variables

In [None]:
# Let us check once more the labels of all the categorical features

columns = lead.dtypes[lead.dtypes == 'object'].index.values
for col in columns : 
    print('Levels in ',col,' are ' , lead[col].unique(),'\n\n')

In [None]:
# Mapping the binary categorical variables(yes/no) to 1/0

varlist =  ['Do Not Email', 'Do Not Call', 'Search', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'A free copy of Mastering The Interview']

# Mapping helper for the yes/no columns
def binary_map(x):
    """Return `x` with 'Yes' mapped to 1 and 'No' mapped to 0.

    Any value other than 'Yes' / 'No' has no entry in the lookup and comes
    back as NaN.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Applying the mapping to every yes/no column listed in `varlist`
lead[varlist] = lead[varlist].apply(binary_map)

In [None]:
lead.head()

In [None]:
# Creating dummy variables for the remaining categorical features and adding them to the
# original dataframe.
# dtype=int keeps every indicator a numeric 0/1 column; without it newer pandas versions
# produce boolean columns, which the statsmodels GLM / VIF steps further down cannot consume.

# Features whose pooled 'Others' level is dropped as the reference category, since it is
# not very intuitive from a model understanding perspective: (feature, dummy prefix)
others_as_reference = [
    ('Lead Origin', 'Origin'),
    ('Lead Source', 'Source'),
    ('Last Activity', 'Activity'),
    ('Specialization', 'Spec'),
]
for feature, prefix in others_as_reference:
    dummies = pd.get_dummies(lead[feature], prefix=prefix, dtype=int)
    # Raises KeyError if the feature has no 'Others' level, rather than silently
    # keeping every level (which would make the dummies perfectly collinear).
    dummies = dummies.drop(columns=[prefix + '_Others'])
    lead = pd.concat([lead, dummies], axis=1)

# Features without an 'Others' level: the first level is dropped as the reference category
first_level_as_reference = [
    ('What is your current occupation', 'Occupation'),
    ('City', 'City'),
]
for feature, prefix in first_level_as_reference:
    dummies = pd.get_dummies(lead[feature], drop_first=True, prefix=prefix, dtype=int)
    lead = pd.concat([lead, dummies], axis=1)

In [None]:
lead.head()

In [None]:
# Dropping the features whose dummy variables have been created.
# The axis is given by keyword (`columns=`): the positional form `drop(labels, 1)`
# is rejected by pandas 2.x.

lead = lead.drop(columns=['Lead Origin','Lead Source','Last Activity','Specialization','What is your current occupation','City'])

In [None]:
lead.info()

In [None]:
lead.shape

### Correlation

In [None]:
# Top Correlations
def correlation(dataframe) : 
    """Return the pairwise correlations of `dataframe`, strongest first.

    Only numeric and boolean columns take part; other columns (strings,
    categoricals) are ignored instead of making `corr()` fail.

    Returns a DataFrame with columns 'VAR1', 'VAR2' and 'CORR', where 'CORR'
    is the absolute correlation rounded to 2 decimals, sorted in descending
    order.  Self-pairs and undefined (NaN) correlations are removed.  Every
    pair of variables appears twice, once per ordering (A,B) and (B,A).
    """
    numeric_part = dataframe.select_dtypes(include=['number', 'bool'])
    # Flatten the square correlation matrix into one row per (VAR1, VAR2) pair
    cor0 = numeric_part.corr().unstack().reset_index()
    cor0.columns = ['VAR1','VAR2','CORR']
    cor0 = cor0.dropna(subset=['CORR'])
    # Strength only: sign is discarded after rounding
    cor0 = cor0.assign(CORR=cor0['CORR'].round(2).abs())
    cor0 = cor0[cor0['VAR1'] != cor0['VAR2']]
    return cor0.sort_values(by=['CORR'], ascending=False)

In [None]:
#Correlations for Converted Leads 

convertedCondition= lead['Converted']==1
converted_corr = correlation(lead[convertedCondition])
# correlation() lists every pair in both orders; keeping only the rows where
# VAR1 sorts before VAR2 leaves exactly one row per pair.  This does not depend on
# how the sort happened to interleave mirrored or tied rows, unlike taking every
# second row.  The top 15 pairs are shown with a clean 0..14 index.
converted_corr[converted_corr['VAR1'] < converted_corr['VAR2']].head(15).reset_index(drop=True)

In [None]:
#Correlations for Unconverted Leads 

unconvertedCondition= lead['Converted']==0
unconverted_corr = correlation(lead[unconvertedCondition])
# correlation() lists every pair in both orders; keeping only the rows where
# VAR1 sorts before VAR2 leaves exactly one row per pair.  This does not depend on
# how the sort happened to interleave mirrored or tied rows, unlike taking every
# second row.  The top 15 pairs are shown with a clean 0..14 index.
unconverted_corr[unconverted_corr['VAR1'] < unconverted_corr['VAR2']].head(15).reset_index(drop=True)

In [None]:
# Splitting the dataset into 2 separate dataframes: `lead_org` keeps only the first two
# columns (expected to be 'Prospect ID' and 'Lead Number') for the eventual lead scoring,
# while `lead` carries on without those 2 features for model building

lead_org = lead.iloc[:, :2].copy()
lead = lead.drop(columns=['Prospect ID','Lead Number'])

### Test-Train Split

In [None]:
X = lead
y = lead.pop('Converted')

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=100)

### Feature Scaling

In [None]:
scaler = StandardScaler()

# The scaler is fitted on the train set and applied to it in one step.
# Only the continuous columns are standardised; the 'yes-no' and 'dummy' variables are left as they are.

continuous_cols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])

### Feature Selection Using RFE

In [None]:
len(X_train.columns)

In [None]:
logreg = LogisticRegression()

In [None]:
# Recursive feature elimination down to 15 features.
# `n_features_to_select` is passed by keyword: scikit-learn accepts it only as a
# keyword argument from version 1.0 onwards (the positional form raises TypeError).
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# top 15 Columns selected by rfe

col = X_train.columns[rfe.support_]
col

### Building model using statsmodel, for the detailed statistics

#### Model 1

In [None]:
# Creating the first dataframe model with RFE selected variables
X_train_1 = X_train[col]

In [None]:
# Adding a constant variable
X_train_1 = sm.add_constant(X_train_1)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_1,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_1_ = X_train_1.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 1 (constant excluded), highest first

X = X_train_1_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

We will be following the below rule to eliminate the features one by one, as per the priorities mentioned by the following sequence:-

* We will first check the summary and VIF
* If a variable has both a high p-value (> 0.05) and a high VIF (> 5), drop it first
* If a variable has a high p-value (> 0.05) but a low VIF (< 5), drop it next
* If a variable has a low p-value (< 0.05) but a high VIF (> 5), drop it last, only after the variables above have been removed

#### Model 2

In [None]:
# Rebuilding the model without 'Newspaper'  

X_train_2 = X_train_1.drop(['const','Newspaper'], axis=1)

In [None]:
# Adding a constant variable
X_train_2 = sm.add_constant(X_train_2)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_2,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_2_ = X_train_2.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 2 (constant excluded), highest first

X = X_train_2_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

#### Model 3

In [None]:
# Rebuilding the model without 'Occupation_Housewife'

X_train_3 = X_train_2.drop(['const','Occupation_Housewife'], axis=1)

In [None]:
# Adding a constant variable
X_train_3 = sm.add_constant(X_train_3)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_3,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_3_ = X_train_3.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 3 (constant excluded), highest first

X = X_train_3_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

#### Model 4

In [None]:
# Rebuilding the model without 'City_Tier II Cities'

X_train_4 = X_train_3.drop(['const','City_Tier II Cities'], axis=1)

In [None]:
# Adding a constant variable
X_train_4 = sm.add_constant(X_train_4)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_4,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_4_ = X_train_4.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 4 (constant excluded), highest first

X = X_train_4_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

#### Model 5

In [None]:
# Rebuilding the model without 'Occupation_Unemployed'

X_train_5 = X_train_4.drop(['const','Occupation_Unemployed'], axis=1)

In [None]:
# Adding a constant variable
X_train_5 = sm.add_constant(X_train_5)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_5,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_5_ = X_train_5.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 5 (constant excluded), highest first

X = X_train_5_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

#### Model 6

In [None]:
# Rebuilding the model without 'Origin_API'

X_train_6 = X_train_5.drop(['const','Origin_API'], axis=1)

In [None]:
# Adding a constant variable
X_train_6 = sm.add_constant(X_train_6)

In [None]:
# Running the logistic model

lgm = sm.GLM(y_train,X_train_6,family = sm.families.Binomial()).fit()

In [None]:
# Summary of the new model
print(lgm.summary())

In [None]:
# Dropping the const variable

X_train_6_ = X_train_6.drop(['const'], axis=1)

In [None]:
# Variance inflation factor of every predictor of Model 6 (constant excluded), highest first

X = X_train_6_
vif_scores = [variance_inflation_factor(X.values, pos) for pos in range(len(X.columns))]
vif = (
    pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
    .round({'VIF': 2})
    .sort_values(by='VIF', ascending=False)
)
vif

In [None]:
# Final Features

Final_cols = X_train_6.columns.values
Final_col = np.delete(Final_cols,0)   # Removing the 'const' variable for future use
Final_col

### Prediction

In [None]:
lgm = sm.GLM(y_train,X_train_6,family = sm.families.Binomial()).fit()  #as obtained previously
y_train_pred = lgm.predict(X_train_6)

In [None]:
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conv_Prob':y_train_pred})
y_train_pred_final['LeadID'] = y_train.index
y_train_pred_final.head()

In [None]:
# Creating new column 'Lead_Pred' with 1 if Conv_Prob > 0.5 else 0

y_train_pred_final['Lead_Pred'] = y_train_pred_final['Conv_Prob'].map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

#### Confusion Matrix for Train Set

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final['Converted'], y_train_pred_final['Lead_Pred'])
print(confusion)

#### Accuracy for Train Model

In [None]:
accuracy = metrics.accuracy_score(y_train_pred_final['Converted'], y_train_pred_final['Lead_Pred'])
print('Accuracy on Train Data : ', round(100*accuracy,2),'%')

#### Metrics beyond simple accuracy

In [None]:
# Unpack the 2x2 confusion matrix: first row = (true negatives, false positives),
# second row = (false negatives, true positives)
(TN, FP), (FN, TP) = confusion

sensitivity = TP/(FN + TP)
specificity = TN/(FP + TN)
falsepositiverate = FP/(FP + TN)
positivepredictivevalue = TP/(TP +FP )
negativepredictivevalue = TN/(TN + FN)

# Report every rate as a percentage rounded to 2 decimals
for metric_label, metric_value in [
    ('Sensitivity : ', sensitivity),
    ('Specificity : ', specificity),
    ('False Positive Rate : ', falsepositiverate),
    ('Positive Predictive Power : ', positivepredictivevalue),
    ('Negative Predictive Power : ', negativepredictivevalue),
]:
    print(metric_label, round(100*metric_value,2),'%')

#### Plotting ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against the true labels `actual`.

    The curve is computed with `metrics.roc_curve` (no intermediate points
    dropped) and the area under it, from `metrics.roc_auc_score`, is shown in
    the legend.  The dashed diagonal joins (0, 0) and (1, 1).  The figure is
    displayed; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
    area_under_curve = metrics.roc_auc_score(actual, probs)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area_under_curve)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final['Converted'], y_train_pred_final['Conv_Prob'], drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final['Converted'], y_train_pred_final['Conv_Prob'])

#### Finding Optimal Cutoff Point

In [None]:
# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final['Conv_Prob'].map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
# Confusion-matrix layout used below: [0,0] true negatives, [0,1] false positives,
# [1,0] false negatives, [1,1] true positives.

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for i in num:
    # Column `i` of y_train_pred_final holds the 0/1 predictions at cutoff `i`
    cm1 = metrics.confusion_matrix(y_train_pred_final['Converted'], y_train_pred_final[i] )
    accuracy = (cm1[0,0]+cm1[1,1])/cm1.sum()
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_rows.append([ i ,accuracy,sensi,speci])

# Build the table in one go (indexed by cutoff) instead of enlarging an empty frame row by row
cutoff_df = pd.DataFrame(cutoff_rows, columns = ['prob','accuracy','sensi','speci'], index = num)
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final['Conv_Prob'].map( lambda x: 1 if x > 0.38 else 0)

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
accu = metrics.accuracy_score(y_train_pred_final['Converted'], y_train_pred_final['final_predicted'])
print('Accuracy on Train set at Optimum Cut Off : ', round(100*accu,2),'%')

In [None]:
# Confusion Matrix for the train set at the Optimum Cut-Off

confusion2 = metrics.confusion_matrix(y_train_pred_final['Converted'], y_train_pred_final['final_predicted'])
confusion2

In [None]:
# Unpack the 2x2 confusion matrix: first row = (true negatives, false positives),
# second row = (false negatives, true positives)
(TN, FP), (FN, TP) = confusion2

sensitivity = TP/(FN + TP)
specificity = TN/(FP + TN)
falsepositiverate = FP/(FP + TN)
positivepredictivevalue = TP/(TP +FP )
negativepredictivevalue = TN/(TN + FN)

# Report every rate as a percentage rounded to 2 decimals
for metric_label, metric_value in [
    ('Sensitivity : ', sensitivity),
    ('Specificity : ', specificity),
    ('False Positive Rate : ', falsepositiverate),
    ('Positive Predictive Power : ', positivepredictivevalue),
    ('Negative Predictive Power : ', negativepredictivevalue),
]:
    print(metric_label, round(100*metric_value,2),'%')

In [None]:
# ROC curve for the train set.
# A ROC curve is traced by sweeping the threshold over the predicted probabilities, so it
# is given 'Conv_Prob'. Passing the 0/1 labels obtained at the 0.38 cut-off would reduce
# the curve to a single operating point and misreport the area under it.
draw_roc(y_train_pred_final['Converted'], y_train_pred_final['Conv_Prob'])

#### Precision and Recall

In [None]:
precision_score(y_train_pred_final['Converted'], y_train_pred_final['Lead_Pred'])

In [None]:
recall_score(y_train_pred_final['Converted'], y_train_pred_final['Lead_Pred'])

#### Precision and Recall tradeoff

In [None]:
# Precision and recall as functions of the probability threshold.
# precision_recall_curve returns one more precision/recall value than thresholds,
# hence the [:-1] slices.
p, r, thresholds = precision_recall_curve(y_train_pred_final['Converted'], y_train_pred_final['Conv_Prob'])
plt.plot(thresholds, p[:-1], "g-", label='Precision')
plt.plot(thresholds, r[:-1], "r-", label='Recall')
plt.xlabel('Probability threshold')
plt.legend()
plt.show()

### Predictions on the Test Data

In [None]:
# Fit and transform operations are done on the training data but only transform operation will be done on the test data

X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.transform(X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])

In [None]:
# Aligning X_test with the final features

X_test = X_test[Final_col]

In [None]:
X_test_sm = sm.add_constant(X_test)

In [None]:
y_test_pred = lgm.predict(X_test_sm)

In [None]:
y_test_pred_final = pd.DataFrame({'Converted':y_test, 'Conv_Prob':y_test_pred, 'LeadID':y_test.index})
y_test_pred_final.head()

In [None]:
# Making predictions with optimal cut off = 0.38

y_test_pred_final['final_predicted'] = y_test_pred_final['Conv_Prob'].map(lambda x: 1 if x > 0.38 else 0)

In [None]:
y_test_pred_final.head()

#### Accuracy of the test data

In [None]:
accuracy = metrics.accuracy_score(y_test_pred_final['Converted'], y_test_pred_final['final_predicted'])
print('Accuracy on Test Data : ', round(100*accuracy,2),'%')

#### Confusion Matrix for Test Data

In [None]:
confusion3 = metrics.confusion_matrix(y_test_pred_final['Converted'], y_test_pred_final['final_predicted'])
print(confusion3)

In [None]:
# Unpack the 2x2 confusion matrix: first row = (true negatives, false positives),
# second row = (false negatives, true positives)
(TN, FP), (FN, TP) = confusion3

sensitivity = TP/(FN + TP)
specificity = TN/(FP + TN)
falsepositiverate = FP/(FP + TN)
positivepredictivevalue = TP/(TP +FP )
negativepredictivevalue = TN/(TN + FN)

# Report every rate as a percentage rounded to 2 decimals
for metric_label, metric_value in [
    ('Sensitivity : ', sensitivity),
    ('Specificity : ', specificity),
    ('False Positive Rate : ', falsepositiverate),
    ('Positive Predictive Power : ', positivepredictivevalue),
    ('Negative Predictive Power : ', negativepredictivevalue),
]:
    print(metric_label, round(100*metric_value,2),'%')

#### ROC Curve

In [None]:
# ROC curve for the test set, drawn from the predicted probabilities ('Conv_Prob').
# The thresholded 0/1 labels would reduce the curve to a single operating point and
# misreport the area under it.
draw_roc(y_test_pred_final['Converted'], y_test_pred_final['Conv_Prob'])

### Lead Scoring

In [None]:
conversionprob = pd.concat([y_test_pred_final['Conv_Prob'],y_train_pred_final['Conv_Prob']],axis=0)
convprob = pd.DataFrame(conversionprob)  #creating a dataframe instead of a pandas series
convprob.reset_index(inplace=True)
convprob.head(3)

In [None]:
lead_org.reset_index(inplace=True)
lead_org.head(3)

In [None]:
leads = pd.merge(lead_org,convprob,on='index',how='outer')
leads['Lead Score'] = round(leads['Conv_Prob']*100,2)

In [None]:
leads.sort_values('Lead Score',ascending=False).head(10)

### Interpretation

We have arrived at a very decent model for the conversion of leads using 10 variables. It can predict the potential leads (leads who eventually get converted) correctly up to 78% of the time. We also have the corresponding lead scores of conversion.

The final relationship between log Odds of Conversion Probability and lead features is    
  
`logOdds(Conversion Probability)` = 0.3199 - 1.2623 * `Do Not Email` + 0.8939 * `Total Time Spent on Website` - 1.893 * `Origin_Landing Page Submission` + 0.3538 * `Source_Olark Chat` + 0.6283 * `Activity_Email Opened` - 0.9875 * `Activity_Olark Chat Conversation` + 1.8478 * `Activity_SMS Sent` - 1.1953 * `Spec_No Specialization` - 1.3795 * `Occupation_Unknown` + 2.3786 * `Occupation_Working Professional`
  
where `Total Time Spent on Website` is standardized to $\mu=0,\sigma=1$


Interpreting the top features affecting Conversion Probability (a coefficient $\beta$ shifts the log odds by $\beta$, i.e. multiplies the odds of conversion by $e^{\beta}$, with all other features held constant) :   
- Leads who are `Working Professionals` have log odds of conversion higher by 2.38 (odds about 10.8 times as high) than those with other professions (such as student, housewife, unemployed etc.) combined
- Leads having `SMS Sent` as the last registered activity have log odds of conversion higher by 1.85 (odds about 6.3 times as high) than leads having all other last activity (such as email opened, page visited etc.) combined 
- Leads having `Email Opened` as the last registered activity have log odds of conversion higher by 0.63 (odds about 1.9 times as high) than leads having all other last activity (such as email marked spam, email link opened, page visited on website etc.) combined 
- Leads who landed on the company’s website through `Olark Chat` as the source, have log odds of conversion higher by 0.35 (odds about 1.4 times as high) compared to other lead sources (such as google, facebook etc.) combined
- Leads whose origin was a `Landing Page Submission` have log odds of conversion lower by 1.89 (odds about 0.15 times as high) than other lead origins (such as API etc.) combined
- Leads who have not provided details of their Occupation or in other words, have `Unknown Occupation` have log odds of conversion lower by 1.38 (odds about 0.25 times as high) compared to all the leads who have filled their occupation (as either unemployed, working etc.) combined
- Leads who chose not to receive email updates (`Do Not Email`) have log odds of conversion lower by 1.26 (odds about 0.28 times as high) compared to leads who would like email updates
- Leads with `No Specialization` have log odds of conversion lower by 1.20 (odds about 0.30 times as high) than all leads having some specialization (either Banking, Healthcare, Finance etc.) combined
- Leads with `Olark Chat conversation` as the last activity registered have log odds of conversion lower by 0.99 (odds about 0.37 times as high) compared to leads having all other last activity (such as email opened, sms sent etc.) combined
    

### Business Goals/ Action to be taken

* The sales team should give high priority to leads who are working professionals, who spend a significant amount of time on the website, who wish to communicate over email, and with whom the last method of contact was an SMS sent

* The sales team should give low priority to leads who have not mentioned either their occupation or their specialization, and who do not wish to communicate over email