In [1]:
# Suppress unnecessary warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing the NumPy and Pandas packages

import numpy as np
import pandas as pd

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

#import stats library
from scipy import stats
import statsmodels.api as sm

#import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from sklearn.metrics import classification_report,recall_score,roc_auc_score,roc_curve,accuracy_score,precision_score,precision_recall_curve,confusion_matrix
from sklearn.preprocessing import LabelEncoder

#import miscellaneous libraries
pd.set_option("display.max_columns",None)
pd.set_option("display.max_colwidth",200)

### Importing the "Leads" Dataset

In [3]:
# Read the dataset
leads = pd.read_csv("Leads.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Leads.csv'

In [None]:
leads.head()

In [None]:
#Checking the Shape of dataset
leads.shape

In [None]:
# Inspecting the different columns in the dataset

leads.columns

In [None]:
# Checking the summary of the dataset
leads.describe()

In [None]:
# Checking the info to see the types of the feature variables and the null values present
leads.info()

It seems that there are quite a few categorical variables present in this dataset, for which we will need to create dummy variables. There are also a lot of null values present, so we will need to treat them accordingly.

## Step 1: Data Cleaning and Preparation

In [None]:
# Checking the number of missing values in each column
leads.isnull().sum().sort_values(ascending=False)

In [None]:
## Numerous columns exhibit a high number of missing values, rendering them unhelpful. With 9000 data points in our data frame, we confidently eliminate columns with over 3000 missing values as they are of no use to us.

In [None]:
# Dropping all the columns in which more than 3000 values are missing.
# The offending columns are collected first and removed in a single call,
# instead of dropping in place while looping over the frame's own columns.
null_counts = leads.isnull().sum()
high_missing_cols = null_counts[null_counts > 3000].index
leads = leads.drop(columns=high_missing_cols)

In [None]:
leads.isnull().sum().sort_values(ascending=False)

In [None]:
#checking value counts of "City" column
leads['City'].value_counts(dropna=False)

`Mumbai` has highest numbers of leads

As you might be able to interpret, the variable `City` won't be of any use in our analysis. So it's best that we drop it.

In [None]:
# dropping the "City" feature
leads.drop(['City'], axis = 1, inplace = True)

In [None]:
#checking value counts of "Country" column
leads['Country'].value_counts(dropna=False)

Highest number of leads from `INDIA`

In [None]:
# dropping the "Country" feature
leads.drop(['Country'], axis = 1, inplace = True)

In [None]:
#Now checking the percentage of missing values in each column

round(100*(leads.isnull().sum()/len(leads.index)), 2)


In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

### Visualizing the features with `Select` values

In [None]:
def countplot(x, fig):
    """Draw a count plot of one `leads` column in a panel of a 2x2 subplot grid.

    Parameters
    ----------
    x : str
        Name of the `leads` column whose value counts are plotted.
    fig : int
        1-based position of the panel inside the 2x2 grid.
    """
    plt.subplot(2, 2, fig)
    # Pass the column by keyword: seaborn >= 0.12 treats the first positional
    # argument as `data`, so the positional form no longer plots the column.
    sns.countplot(x=leads[x])
    plt.title('Count across'+' '+ x, size = 16)
    plt.xlabel(x,size = 14)
    plt.xticks(rotation = 90)

plt.figure(figsize=(15,10))

# One panel per feature that contains the 'Select' level, in grid order
select_features = ['How did you hear about X Education', 'Lead Profile', 'Specialization']
for panel, feature in enumerate(select_features, start=1):
    countplot(feature, panel)

plt.tight_layout()


There are certain columns that contain a level named `'Select'`. This indicates that the student has not made a selection for that specific column, resulting in the display of 'Select'. These values are equivalent to missing values, and it is imperative that we determine the frequency of the 'Select' level in all columns where it appears.

In [None]:
# checking the value counts of all the columns

for c in leads:
    print(leads[c].astype('category').value_counts())
    print('___________________________________________________')

The following three columns now have the level 'Select'. Let's check them once again.

In [None]:
leads['Lead Profile'].astype('category').value_counts()

In [None]:
leads['How did you hear about X Education'].value_counts()

In [None]:
leads['Specialization'].value_counts()

### Visualizing the features

In [None]:
def countplot(x, fig):
    """Draw a count plot of one `leads` column in a panel of a 4x2 subplot grid.

    Parameters
    ----------
    x : str
        Name of the `leads` column whose value counts are plotted.
    fig : int
        1-based position of the panel inside the 4x2 grid.
    """
    plt.subplot(4, 2, fig)
    # Pass the column by keyword: seaborn >= 0.12 treats the first positional
    # argument as `data`, so the positional form no longer plots the column.
    sns.countplot(x=leads[x])
    plt.title('Count across'+' '+ x, size = 16)
    plt.xlabel(x,size = 14)
    plt.xticks(rotation = 90)

plt.figure(figsize=(18,25))

# One panel per remaining categorical feature, in grid order
features_to_plot = [
    'What matters most to you in choosing a course',
    'What is your current occupation',
    'Specialization',
]
for panel, feature in enumerate(features_to_plot, start=1):
    countplot(feature, panel)

plt.tight_layout()

As can be seen, the columns `"Lead Profile"` and `"How did you hear about X Education"` have a lot of rows with the value `Select`, which is of no use to the analysis.

So it's best that we drop them.

In [None]:
# dropping Lead Profile and How did you hear about X Education cols
leads.drop(['Lead Profile', 'How did you hear about X Education'], axis = 1, inplace = True)

We also noticed, while checking the value counts of all the columns, that a few of them are dominated by a single value, `No`. So we can drop the following columns as well: Do Not Call, Search, Magazine, Newspaper Article, X Education Forums, Newspaper, Digital Advertisement, Through Recommendations, Receive More Updates About Our Courses, Update me on Supply Chain Content, Get updates on DM Content, I agree to pay the amount through cheque.

In [None]:
# Pairwise relationships between the columns of `leads`, coloured by the target.
# `plt` and `sns` are already imported in the imports cell at the top of the
# notebook, so they are not re-imported here.
sns.pairplot(leads, diag_kind='kde', hue='Converted')
plt.show()

In [None]:
# Restrict the pair plot to the three visit-related numeric columns plus the target
visit_cols = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit','Converted']
x_edu = leads[visit_cols]
sns.pairplot(data=x_edu, hue='Converted', diag_kind='kde')
plt.show()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Power-transform only the numeric predictors. 'Converted' is the binary target
# used as the plot hue, so it must keep its original 0/1 values.
feature_cols = [c for c in x_edu.columns if c != 'Converted']

pt = PowerTransformer()
# Work on a copy so `x_edu` stays untouched and the original row index is kept
transformedx_edu = x_edu.copy()
transformedx_edu[feature_cols] = pt.fit_transform(x_edu[feature_cols])
transformedx_edu.head()

In [None]:
sns.pairplot(transformedx_edu,diag_kind='kde',hue='Converted')
plt.show()

In [None]:
# Dropping the above columns

In [None]:
# Columns listed above as being dominated by a single value
single_level_cols = [
    'Do Not Call',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]
leads = leads.drop(columns=single_level_cols)

In [None]:
leads['What matters most to you in choosing a course'].value_counts()

The variable `What matters most to you in choosing a course` has the level `Better Career Prospects` 6528 times, while the other two levels appear only twice and once respectively.

So we should drop this column as well.

In [None]:
leads.drop(['What matters most to you in choosing a course'], axis = 1, inplace=True)

In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

The column `What is your current occupation` contains numerous null values. Although dropping the entire row is an option, we have already lost several feature variables and do not want to risk losing potentially significant data. Therefore, we will only drop the null rows for the `What is your current occupation` column.

In [None]:
# Keep only the rows where 'What is your current occupation' is present

leads = leads.dropna(subset=['What is your current occupation'])

In [None]:
# Observing correlation between the numeric columns
plt.figure(figsize=(10,8))

# `leads` still holds object-typed columns at this point, and pandas >= 2.0
# raises when `corr()` meets them, so restrict to numeric columns explicitly.
# `linewidths` is heatmap's own parameter; passing the alias `linewidth`
# alongside it can conflict inside matplotlib.
sns.heatmap(leads.select_dtypes(include='number').corr(), annot=True, cmap="BrBG",
            robust=True, linewidths=0.1, vmin=-1)
plt.show()

### Analysing Categorical features

In [None]:
# Count plot of every object-typed feature, split by the target 'Converted'.
# NOTE(review): 'Prospect ID' looks like a per-lead identifier (it is dropped
# later as not useful), so it is excluded here to avoid a bar per lead - confirm.
conv = leads.select_dtypes(include="object").columns.drop('Prospect ID', errors='ignore')
for i in conv:

    plt.figure(figsize =(15,5))
    # Keyword arguments: seaborn >= 0.12 treats the first positional argument as `data`
    sns.countplot(data=leads, x=i, hue='Converted')
    plt.xticks(rotation = 90)
    plt.title('Target variable in'+' '+ i)
    plt.xlabel(i)
    plt.show()

In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

Since now the number of null values present in the columns are quite small we can simply drop the rows in which these null values are present.

In [None]:
# Keep only the rows where 'TotalVisits' is present

leads = leads.dropna(subset=['TotalVisits'])

In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

In [None]:
# Keep only the rows where 'Lead Source' is present

leads = leads.dropna(subset=['Lead Source'])

In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

In [None]:
# Keep only the rows where 'Specialization' is present

leads = leads.dropna(subset=['Specialization'])

In [None]:
# Checking the number of null values again
leads.isnull().sum().sort_values(ascending=False)

Now your data doesn't have any null values. Let's now check the percentage of rows that we have retained.

In [None]:
print(len(leads.index))
print(len(leads.index)/9240)

We still have around 69% of the rows which seems good enough.

In [None]:
# Let's look at the dataset again

leads.head()

Now, clearly the variables `Prospect ID` and `Lead Number` won't be of any use in the analysis, so it's best that we drop these two variables.

In [None]:
# Dropping the "Prospect ID" and "Lead Number"
# (`columns=` replaces the positional axis argument, which pandas >= 2.0 rejects)
leads = leads.drop(columns=['Prospect ID', 'Lead Number'])

In [None]:
leads.head()

### Dummy variable creation

The next step is to deal with the categorical variables present in the dataset. So first, take a look at which variables are actually categorical.

In [None]:
# Checking the columns which are of type 'object'

temp = leads.loc[:, leads.dtypes == 'object']
temp.columns

In [None]:
# Demo Cell
df = pd.DataFrame({'P': ['p', 'q', 'p']})
df

In [None]:
pd.get_dummies(df)

In [None]:
pd.get_dummies(df, prefix=['col1'])

In [None]:
# Creating dummy variables using the 'get_dummies' command.
# The source frame must be `leads` (not the small demo frame `df` built above,
# which has none of these columns). `dtype=int` keeps the dummies numeric so the
# design matrix is not cast to object when handed to statsmodels later.
dummy = pd.get_dummies(leads[['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                              'What is your current occupation', 'A free copy of Mastering The Interview',
                              'Last Notable Activity']], drop_first=True, dtype=int)

# Add the results to the master DataFrame
leads = pd.concat([leads, dummy], axis=1)

# Display the updated DataFrame
leads.head()

In [None]:
# Creating dummy variables separately for 'Specialization': its level 'Select'
# carries no information, so that level is dropped explicitly instead of
# relying on `drop_first`.
# `dtype=int` keeps the dummies numeric for the later statsmodels fit, and
# `columns=` replaces the positional axis argument that pandas >= 2.0 rejects.

dummy_spl = pd.get_dummies(leads['Specialization'], prefix = 'Specialization', dtype=int)
dummy_spl = dummy_spl.drop(columns=['Specialization_Select'])
leads = pd.concat([leads, dummy_spl], axis = 1)

In [None]:
# Dropping the variables for which the dummy variables have been created
# (`columns=` replaces the positional axis argument, which pandas >= 2.0 rejects)

leads = leads.drop(columns=['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                            'Specialization', 'What is your current occupation',
                            'A free copy of Mastering The Interview', 'Last Notable Activity'])

In [None]:
# Let's take a look at the dataset again

leads.head()

### Test-Train Split

The next step is to split the dataset into training and testing sets.

In [None]:
# Importing the `train_test_split` library

In [None]:
# Put all the feature variables in X (everything except the target 'Converted').
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.

X = leads.drop(columns=['Converted'])
X.head()

In [None]:
y = leads['Converted']

y.head()

In [None]:
# Spliting the dataset into 70% train and 30% test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
#lets check the shape
print("X_train Size", X_train.shape)
print("y_train Size", y_train.shape)

### Scaling

Now there are a few numeric variables present in the dataset which have different scales. So let's go ahead and scale these variables.

In [None]:
# Importing the 'MinMax scaler' Library

In [None]:
# Fit a MinMax scaler on the three numeric features of the training set
# and overwrite those columns with their scaled values

scaler = MinMaxScaler()

numeric_features = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])

X_train.head()

### Looking at the correlations

Let's now look at the correlations. Since the number of variables is pretty high, we plot a heatmap without annotations to get an overall picture rather than reading individual values.

In [None]:
# Looking at the correlation table
plt.figure(figsize = (25,15))
sns.heatmap(leads.corr())
plt.show()


## Step 2: Model Building

Now, it's time to move on to model building. With the dataset containing numerous variables that we cannot handle, the most effective approach is to choose a small set of features from this pool of variables using RFE.

In [None]:
# Importing the 'LogisticRegression' and creating a LogisticRegression object
logreg = LogisticRegression()

In [None]:
# Run RFE with the logistic regression estimator and keep 15 variables.
# `n_features_to_select` must be passed by keyword: current scikit-learn
# releases no longer accept it positionally.

rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Let's take a look at which features have been selected by RFE

list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Putting all the columns selected by RFE in the variable 'col'

col = X_train.columns[rfe.support_]

We have successfully selected all the variables through RFE. As we prioritize the statistical aspect, specifically the p-values and VIFs, we shall utilize these variables to construct a robust logistic regression model with the aid of statsmodels.

In [None]:
# Select only the columns selected by RFE

X_train = X_train[col]

In [None]:
# Importing 'statsmodels'

### Model 1

In [None]:
# Fit a logistic Regression model on X_train after adding a constant and output the summary

X_train_sm = sm.add_constant(X_train)
logm2 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

There are quite a few variable which have a p-value greater than 0.05. We will need to take care of them. But first, let's also look at the VIFs.

### Checking `VIF`

In [None]:
# Importing the 'variance_inflation_factor' library

In [None]:
# Build a VIF table for every feature in X_train, highest VIF first

vif_values = [variance_inflation_factor(X_train.values, idx) for idx in range(X_train.shape[1])]
vif = (
    pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

VIFs seem to be in a decent range except for three variables.

Let's first drop the variable `Lead Source_Reference` since it has a high p-value as well as a high VIF.

In [None]:
X_train.drop('Lead Source_Reference', axis = 1, inplace = True)

### Model 2

In [None]:
# Refit the model with the new set of features

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

#### Checking VIF

In [None]:
# Rebuild the VIF table for the reduced feature set, highest VIF first

vif_values = [variance_inflation_factor(X_train.values, idx) for idx in range(X_train.shape[1])]
vif = (
    pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

The VIFs are now all less than 5. So let's drop the ones with the high p-values beginning with `Last Notable Activity_Had a Phone Conversation`.

In [None]:
X_train.drop('Last Notable Activity_Had a Phone Conversation', axis = 1, inplace = True)

### Model 3

In [None]:
# Refit the model with the new set of features

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

Dropping `What is your current occupation_Housewife` as it has a high p-value.

In [None]:
X_train.drop('What is your current occupation_Housewife', axis = 1, inplace = True)

### Model 4

In [None]:
# Refit the model with the new set of features

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

Dropping `What is your current occupation_Working Professional` as it has a high p-value.

In [None]:
X_train.drop('What is your current occupation_Working Professional', axis = 1, inplace = True)

### Model 5

In [None]:
# Refit the model with the new set of features

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
res = logm1.fit()
res.summary()

#### Checking final VIF

In [None]:
# Final VIF table for the features kept in the model, highest VIF first

vif_values = [variance_inflation_factor(X_train.values, idx) for idx in range(X_train.shape[1])]
vif = (
    pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

## Step 3: Model Evaluation

The p-values and VIFs for all variables are satisfactory. Therefore, we can confidently proceed with making predictions using this final set of features.

In [None]:
# Use 'predict' to predict the probabilities on the train set

y_train_pred = res.predict(sm.add_constant(X_train))
y_train_pred[:10]

In [None]:
# Reshaping it into an array

y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

### Creating a dataframe with the actual conversion flag and the predicted probabilities

In [None]:
# Creating a new dataframe containing the actual conversion flag and the probabilities predicted by the model

y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final.head()

### Creating new column 'Predicted' with 1 if Conversion_Prob > 0.5 else 0

In [None]:
y_train_pred_final['Predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

Now that you have the probabilities and have also made conversion predictions using them, it's time to evaluate the model.

In [None]:
# Importing the 'metrics' library from sklearn for evaluation

### Creating the `Confusion matrix`


In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy

print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Let's evaluate the other metrics as well

TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Calculating the 'sensitivity'

TP/(TP+FN)

In [None]:
# Calculating the 'specificity'

TN/(TN+FP)

### Finding the Optimal Cutoff
0.5 was merely a rough estimate to evaluate the model's performance. To achieve optimal results, it is imperative to optimize the threshold. Therefore, let us begin by plotting a ROC curve to determine the AUC we can attain.

In [None]:
# ROC function

def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against `actual`, with the AUC in the legend.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
    area = metrics.roc_auc_score(actual, probs)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line: a classifier with no discriminative power
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_train_pred_final.Converted,
                    y_train_pred_final.Conversion_Prob, 
                                         drop_intermediate=False)

In [None]:
# Importing the 'matplotlib'  to plot the ROC curve`

In [None]:
# Calling the ROC function

draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

With an impressive area under the curve of 0.86, it's clear that our model is performing exceptionally well. Now, let's confidently explore the sensitivity and specificity tradeoff to determine the optimal cutoff point.

In [None]:
# Add one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9;
# each column is named after its cutoff value

numbers = [tenth / 10 for tenth in range(10)]
for cutoff in numbers:
    y_train_pred_final[cutoff] = (y_train_pred_final.Conversion_Prob > cutoff).astype(int)
y_train_pred_final.head()

In [None]:
# Build a table of accuracy, sensitivity and specificity at each probability cutoff.
# The rows are collected in a list and the frame is built once, rather than
# growing an empty DataFrame one row at a time.
# confusion_matrix layout: [[TN, FP], [FN, TP]]

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i])
    tn, fp, fn, tp = cm1.ravel()
    total1 = cm1.sum()

    accuracy = (tn + tp) / total1
    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_rows.append([i, accuracy, sensi, speci])

# Index by cutoff value, matching the per-cutoff rows of the original table
cutoff_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
cutoff_df

In [None]:
# Let's plot it as well

cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

As you can see that around `0.42`, you get the optimal values of the three metrics. So let's choose 0.42 as our cutoff now.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map( lambda x: 1 if x > 0.42 else 0)

y_train_pred_final.head()

In [None]:
# Let's checking the `accuracy` now

metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
# Let's create the confusion matrix once again

confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Let's evaluate the other metrics as well

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Calculating the 'Sensitivity'

TP/(TP+FN)

In [None]:
# Calculating the 'Specificity'

TN/(TN+FP)

This cutoff point seems good to go!

## Step 4: Making Predictions on the Test Set
Let's now make predictions on the test set.

In [None]:
# Apply the scaler fitted on the training data to the test set ('transform' only, no refit)

scaled_cols = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

In [None]:
# Selecting the columns in X_train for X_test as well

X_test = X_test[col]
X_test.head()

In [None]:
# Adding a constant to X_test

X_test_sm = sm.add_constant(X_test[col])

In [None]:
# Checking X_test_sm (first rows only, instead of dumping the whole frame)

X_test_sm.head()

In [None]:
# Keep exactly the features the final model was fitted on, in the same order.
# Selecting by `X_train.columns` avoids re-listing the eliminated features by
# hand (and the positional axis argument that pandas >= 2.0 rejects), and the
# cell can be re-run safely.

X_test = X_test[X_train.columns]

In [None]:
# Make predictions on the test set and store it in the variable 'y_test_pred'

y_test_pred = res.predict(sm.add_constant(X_test))

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe

y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head

y_pred_1.head()

In [None]:
# Converting y_test to dataframe

y_test_df = pd.DataFrame(y_test)

In [None]:
# Remove index for both dataframes to append them side by side 

y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Append y_test_df and y_pred_1

y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
# Check 'y_pred_final'

y_pred_final.head()

In [None]:
# Rename the column 

y_pred_final= y_pred_final.rename(columns = {0 : 'Conversion_Prob'})

In [None]:
# Let's see the head of y_pred_final

y_pred_final.head()

In [None]:
# Make predictions on the test set using 0.42 as the cutoff
# (the same cutoff that is applied to the training predictions above)

y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.42 else 0)

In [None]:
# Check y_pred_final

y_pred_final.head()

In [None]:
# Let's check the overall accuracy

metrics.accuracy_score(y_pred_final['Converted'], y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final['Converted'], y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Calculating the 'sensitivity'
TP / float(TP+FN)

In [None]:
# Calculating the 'specificity'
TN / float(TN+FP)

### Precision-Recall View
Let's now also build the training model using the precision-recall view

In [None]:
#Looking at the confusion matrix again

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted )
confusion

#### Precision = 
         TP / (TP + FP)

In [None]:
confusion[1,1]/(confusion[0,1]+confusion[1,1])

#### Recall = 
          TP / (TP + FN)

In [None]:
confusion[1,1]/(confusion[1,0]+confusion[1,1])

### Precision and recall tradeoff

Importing the `Precision recall curve` library

In [None]:
y_train_pred_final.Converted, y_train_pred_final.Predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

In [None]:
# Precision (green) and recall (red) as a function of the probability cutoff.
# `p` and `r` have one more element than `thresholds`, hence the [:-1] slices.
plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
plt.xlabel("Probability cutoff")
plt.ylabel("Score")
plt.title("Precision and recall vs. probability cutoff")
plt.legend()
plt.show()

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.44 else 0)

y_train_pred_final.head()

In [None]:
# Let's checking the `accuracy` now

metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
# Let's creating the confusion matrix once again

confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Let's evaluate the other metrics as well

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

### Precision

In [None]:
TP/(TP+FP)

### Recall

In [None]:
TP/(TP+FN)

This cutoff point seems good to go!

## Step 5: Making Predictions on the Test Set
Let's now make predictions on the test set.

In [None]:
# Making predictions on the test set and store it in the variable 'y_test_pred'

y_test_pred = res.predict(sm.add_constant(X_test))

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe

y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head

y_pred_1.head()

In [None]:
# Converting y_test to dataframe

y_test_df = pd.DataFrame(y_test)

In [None]:
# Removing index for both dataframes to append them side by side 

y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Append y_test_df and y_pred_1

y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
# Checking the 'y_pred_final'

y_pred_final.head()

In [None]:
# Rename the column 

y_pred_final= y_pred_final.rename(columns = {0 : 'Conversion_Prob'})

In [None]:
# Let's see the head of y_pred_final

y_pred_final.head()

In [None]:
# Making predictions on the test set using 0.44 as the cutoff

y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.44 else 0)

In [None]:
# Checking y_pred_final

y_pred_final.head()

In [None]:
# Let's checking the overall accuracy

metrics.accuracy_score(y_pred_final['Converted'], y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final['Converted'], y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Calculating the Precision

TP/(TP+FP)

In [None]:
# Calculating Recall

TP/(TP+FN)