In [None]:
# import to supress unnecessary warnings

import warnings
warnings.filterwarnings('ignore')

# Importing the NumPy and Pandas packages

import numpy as np
import pandas as pd

import time, warnings
import datetime as dt

In [None]:
#import sklearn libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import classification_report,recall_score,roc_auc_score,roc_curve,accuracy_score,precision_score,precision_recall_curve,confusion_matrix
from sklearn.preprocessing import LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

#import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

#import stats library
from scipy import stats
import statsmodels.api as sm

from IPython.display import display
pd.options.display.max_columns = None

## Step-1 :  Reading and Understanding the Data

In [None]:
# Reading data from Leads.csv file
# NOTE(review): this is an absolute path on one specific Windows user profile, so the
# notebook cannot be re-run on any other machine as-is. Replace it with a relative or
# configurable location (e.g. a DATA_DIR constant) - TODO confirm where Leads.csv lives.

lead_score_data = pd.read_csv(r'C:\Users\saksh\Downloads\Leads.csv')

In [None]:
lead_score_data.head()

In [None]:
# Lets check shape of data

lead_score_data.shape

#### The dataset has 9240 rows and 37 columns

In [None]:
# Lets see the types of the feature variables and the null values present using info()

lead_score_data.info()

##### Looking at the info above, there are quite a few categorical variables in the data, so we will need to create dummy variables for them.

In [None]:
# Lets check the summary of the dataset

lead_score_data.describe()

#### Data Cleaning :
#####            1. Check missing or null values in dataset
#####            2. Drop unwanted columns from dataset

In [None]:
# check if any null values present in dataset columns

lead_score_data.isnull().sum()

In [None]:
# Checking the percentage of missing values

null_col = round(100*(lead_score_data.isnull().sum()/len(lead_score_data.index)), 2)
null_col

In [None]:
# Find the columns of dataset whose percentage of null values is 30% or more.
# The percentages are recomputed under their own name (`null_pct`) so that `null_col`
# is never rebound from a Series to a list: with the old rebinding, running this cell
# a second time failed because a list has no `.values` attribute.

null_pct = round(100*(lead_score_data.isnull().sum()/len(lead_score_data.index)), 2)
null_col = list(null_pct[null_pct >= 30.0].index)
null_col

#### -- Drop Irrelevant Columns 

In [None]:
# Dropping the unwanted columns having more than 30% null values in Dataset

lead_score_data.drop(labels = null_col , axis = 1 , inplace = True)

print(len(null_col))  # to print number of columns deleted from dataset

In [None]:
# Checking updated info of dataset

lead_score_data.info()

In [None]:
# Display the updated shape of Dataset

lead_score_data.shape

In [None]:
# print the value counts of all the columns

for col in lead_score_data:
    print(lead_score_data[col].astype('category').value_counts())
    print('___________________________________________________')

Some columns contain the value 'Select', which means that the student had not selected any option for that particular column; therefore it shows 'Select'.

In [None]:
# Select all non-numeric columns
lead_obj = lead_score_data.select_dtypes(include='object')

# Find out columns that have value "Select"

s = lambda x: x.str.contains('Select', na=False)
l = lead_obj.columns[lead_obj.apply(s).any()].tolist()
print (l)

#### Visualizing the columns with 'Select' values

In [None]:
# Lets figure out the total 'Select' value in columns using plot

def countplot(x, fig):
    """Draw a count plot of one `lead_score_data` column in a 2x2 subplot grid.

    Parameters
    ----------
    x : str
        Name of the column of `lead_score_data` whose values are counted.
    fig : int
        1-based position of the subplot, passed to `plt.subplot(2, 2, fig)`.
    """
    plt.subplot(2,2, fig)
    # Pass the column through the `x` keyword. Given positionally, current seaborn
    # releases read the first argument as `data`, not as the variable to count.
    sns.countplot(x=lead_score_data[x])
    plt.title('Count across'+' ('+ x + ')', size = 16)
    plt.xlabel(x,size = 14)
    plt.xticks(rotation = 90)

plt.figure(figsize=(15,10))

countplot('How did you hear about X Education',1)
countplot('Lead Profile',2)
countplot('Specialization',3)
countplot('City',4)


plt.tight_layout()

The 4 columns plotted above all contain the level 'Select'.

In [None]:
# Check value count of "Select" in column 'Lead Profile'

lead_score_data['Lead Profile'].astype('category').value_counts()

In [None]:
# Check value count of "Select" in column 'How did you hear about X Education'

lead_score_data['How did you hear about X Education'].value_counts()

In [None]:
# Check value count of "Select" in column 'Specialization'

lead_score_data['Specialization'].value_counts()

From the above analysis, we can see that the columns 'Lead Profile' and 'How did you hear about X Education' have the highest number of rows with 'Select' values, which can be considered the same as missing values; therefore we drop them, as they are of no use for further analysis.

In [None]:
# Dropping 'Lead Profile' and 'How did you hear about X Education' column from dataset

lead_score_data.drop(['Lead Profile', 'How did you hear about X Education'], axis = 1, inplace = True)

In [None]:
# Checking the number of null values again

lead_score_data.isnull().sum().sort_values(ascending=False)

There are still 4 columns with huge null values. As they are important columns, removing them will cost us a lot of data. So, instead we are going to replace the NaN values with 'no data'. This way we have all the data with almost no null values. We can drop them off later, if they are of no use further.

In [None]:
lead_score_data['Specialization'] = lead_score_data['Specialization'].fillna('no data') 
lead_score_data['What matters most to you in choosing a course'] = lead_score_data['What matters most to you in choosing a course'].fillna('no data')
lead_score_data['Country'] = lead_score_data['Country'].fillna('no data')
lead_score_data['What is your current occupation'] = lead_score_data['What is your current occupation'].fillna('no data')
lead_score_data.info()

In [None]:
# Checking the number of null values again

lead_score_data.isnull().sum().sort_values(ascending=False)

##### As we can see above that "City" column still has maximum number of null values. So lets handle the null values of 'City'.

In [None]:
lead_score_data['City'].value_counts(normalize = True, dropna = False) * 100

In [None]:
# categorize all non-mumbai, but Maharashtra cities
lead_score_data.loc[(lead_score_data.City == 'Thane & Outskirts') | (lead_score_data.City == 'Other Cities of Maharashtra'), 'City'] = 'Non-Mumbai MH Cities'

# categorize all other cities
lead_score_data.loc[(lead_score_data.City == 'Other Cities') | (lead_score_data.City == 'Other Metro Cities') | (lead_score_data.City == 'Tier II Cities') , 'City'] = 'Non-MH Cities'

In [None]:
# replace 'Select' values to null values

lead_score_data['City'] = lead_score_data['City'].replace('Select', np.NaN)
lead_score_data['City'].head()

In [None]:
# Let check the updated value counts in column 'City'

lead_score_data.City.value_counts(normalize = True) * 100

In [None]:
# Lets impute proportionately
# - the sampling weights come from the observed (non-null) distribution instead of
#   hand-copied percentages, so they cannot drift out of sync with the data;
# - the generator is seeded, so Restart & Run All gives the same imputed values;
# - the filler carries the frame's own index, so every missing row finds a match
#   even if the frame does not have a default 0..n-1 index.

city_dist = lead_score_data['City'].value_counts(normalize = True)
city_fill = pd.Series(np.random.default_rng(100).choice(city_dist.index.to_numpy(), p = city_dist.to_numpy(),
                                                         size = len(lead_score_data)),
                      index = lead_score_data.index)
lead_score_data['City'] = lead_score_data['City'].fillna(city_fill)

In [None]:
# Lets check final updated column 'City'

lead_score_data['City'].value_counts()

#### 'What is your current occupation' Column

In [None]:
# Check value count of column 'What is your current occupation'

lead_score_data['What is your current occupation'].value_counts(normalize = True, dropna = False) * 100

##### Note : For occupation, lets combine categories first, then impute proportionally to maintain the distribution and not introduce bias

In [None]:
# combine low representing categories
lead_score_data.loc[(lead_score_data['What is your current occupation'] == 'Student') | (lead_score_data['What is your current occupation'] == 'Other') 
                            | (lead_score_data['What is your current occupation'] == 'Housewife') | 
                (lead_score_data['What is your current occupation'] == 'Businessman') , 'What is your current occupation'] = 'Student and Others'

In [None]:
# Lets check again the value count of updated column

lead_score_data['What is your current occupation'].value_counts(normalize = True) * 100

In [None]:
# replace 'no data' values to null values

lead_score_data['What is your current occupation'] = lead_score_data['What is your current occupation'].replace('no data', np.NaN)
lead_score_data['What is your current occupation']

In [None]:
# Lets check again the value count of updated column

lead_score_data['What is your current occupation'].value_counts(normalize = True) * 100

In [None]:
# Lets impute proportionately
# - the sampling weights come from the observed (non-null) distribution instead of
#   hand-copied percentages;
# - the generator is seeded, so the imputation is reproducible across runs;
# - the filler carries the frame's own index, so every missing row finds a match.

occupation_dist = lead_score_data['What is your current occupation'].value_counts(normalize = True)
occupation_fill = pd.Series(np.random.default_rng(101).choice(occupation_dist.index.to_numpy(), p = occupation_dist.to_numpy(),
                                                               size = len(lead_score_data)),
                            index = lead_score_data.index)
lead_score_data['What is your current occupation'] = lead_score_data['What is your current occupation'].fillna(occupation_fill)

In [None]:
# Lets check final value count of updated column

lead_score_data['What is your current occupation'].value_counts()

#### 'Specialization' Column

In [None]:
lead_score_data.Specialization.value_counts(normalize = True, dropna = False) * 100

##### Note : For specialization, combine categories based on the course type, and then impute proportionally to maintain the distribution and not introduce bias

In [None]:
# categorize all industry courses
lead_score_data.loc[(lead_score_data.Specialization == 'Banking, Investment And Insurance') | (lead_score_data.Specialization == 'Media and Advertising') |
       (lead_score_data.Specialization == 'Travel and Tourism') | (lead_score_data.Specialization == 'Services Excellence') |
       (lead_score_data.Specialization == 'E-COMMERCE'), 'Specialization'] = 'Industry Specializations'

# categorize all management courses
lead_score_data.loc[(lead_score_data.Specialization == 'Finance Management') | (lead_score_data.Specialization == 'Human Resource Management') | 
       (lead_score_data.Specialization == 'Marketing Management') |  (lead_score_data.Specialization == 'Operations Management') |
       (lead_score_data.Specialization == 'IT Projects Management') | (lead_score_data.Specialization == 'Supply Chain Management') |
       (lead_score_data.Specialization == 'Healthcare Management') | (lead_score_data.Specialization == 'Hospitality Management') |
       (lead_score_data.Specialization == 'Retail Management') , 'Specialization'] = 'Management Specializations'

# categorize all busines courses
lead_score_data.loc[(lead_score_data.Specialization == 'Business Administration') | (lead_score_data.Specialization == 'International Business') | 
       (lead_score_data.Specialization == 'Rural and Agribusiness') | (lead_score_data.Specialization == 'E-Business') 
        , 'Specialization'] = 'Business Specializations'

In [None]:
# Lets check the value count now

lead_score_data.Specialization.value_counts(normalize = True) * 100

In [None]:
# replace 'no data' and select values to null values

lead_score_data.Specialization = lead_score_data.Specialization.replace('no data', np.NaN)
lead_score_data.Specialization = lead_score_data.Specialization.replace('Select', np.NaN)
lead_score_data.Specialization

In [None]:
# Lets check again the value count of updated column

lead_score_data.Specialization.value_counts(normalize = True) * 100

In [None]:
# impute proportionately
# - the sampling weights come from the observed (non-null) distribution instead of
#   hand-copied percentages;
# - the generator is seeded, so the imputation is reproducible across runs;
# - the filler carries the frame's own index, so every missing row finds a match.

specialization_dist = lead_score_data['Specialization'].value_counts(normalize = True)
specialization_fill = pd.Series(np.random.default_rng(102).choice(specialization_dist.index.to_numpy(), p = specialization_dist.to_numpy(),
                                                                   size = len(lead_score_data)),
                                index = lead_score_data.index)
lead_score_data['Specialization'] = lead_score_data['Specialization'].fillna(specialization_fill)

In [None]:
# Lets check final value count of updated column

lead_score_data['Specialization'].value_counts()

#### 'What matters most to you in choosing a course' Column

In [None]:
lead_score_data['What matters most to you in choosing a course'].value_counts(normalize = True, dropna = False) * 100

##### Note : The distribution of the data is very heavily skewed, with Better career prospects + no data values = approx 100% of the total. It is safe to drop this column.

In [None]:
# Dropping the column 'What matters most to you in choosing a course'

lead_score_data.drop('What matters most to you in choosing a course', axis = 1, inplace = True)

#### 'Country' Column

In [None]:
# Lets find value count of 'Country' column

lead_score_data['Country'].value_counts(normalize = True, dropna = False) * 100

##### Note : The distribution of the data is very heavily skewed, with India + no data values = 97% of the total. It is safe to drop this column.

In [None]:
# dropping the column 'Country'

lead_score_data.drop('Country', axis = 1, inplace = True)

In [None]:
# Checking the number of null values again

lead_score_data.isnull().sum().sort_values(ascending=False)

There are 4 columns 'TotalVisits' , 'Page Views Per Visit' , 'Last Activity' and 'Lead Source' in dataset which still has some null values. Lets take care of them in further steps.

#### Handling the categorical columns with less number of missing values:
#####      1. Merge categories that have low representation of categories
#####      2. Impute the missing values

In [None]:
# lets find out the unique values for all object datatype columns

for a, b in lead_score_data.select_dtypes(include='object').nunique().to_dict().items():
    print('{} = {}'.format(a,b))

##### Note : From the above result, we can see that, the categorical columns with (number of unique values > 2) are:
#####     1.  'Lead Origin'
#####     2. 'Lead Source'

#### 'Lead Origin' Column

In [None]:
# Lets find value count of 'Lead Origin' column

lead_score_data['Lead Origin'].value_counts(normalize = True, dropna = False) * 100

##### Note : There are a lots of smaller values which won't be used as definitive factors, therefore, lets group them together

In [None]:
# combine low representing categories

lead_score_data.loc[(lead_score_data['Lead Origin'] == 'Lead Import') | (lead_score_data['Lead Origin'] == 'Quick Add Form') 
                    | (lead_score_data['Lead Origin'] == 'Lead Add Form'), 'Lead Origin'] = 'Lead Add Forms and Others'

# Lets check final value count of updated column

lead_score_data['Lead Origin'].value_counts()

#### 'Lead Source' Column

In [None]:
# Lets find value count of 'Lead Source' column

lead_score_data['Lead Source'].value_counts(normalize = True, dropna = False) * 100

In [None]:
# Lets impute the missing values with the mode of data i.e. clearly 'Google'
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# `df[col]` is a chained in-place operation: it may act on a temporary object and
# leave the frame unchanged, and the warning about it is hidden because warnings
# are globally ignored at the top of this notebook.

lead_score_data['Lead Source'] = lead_score_data['Lead Source'].fillna(lead_score_data['Lead Source'].mode()[0])

##### Note : There are a lots of smaller values which won't be used as definitive factors, therefore, lets group them together

In [None]:
# Keep the six dominant lead sources and fold every other value into one bucket

major_sources = ['Google', 'Direct Traffic', 'Olark Chat', 'Organic Search', 'Reference', 'Welingak Website']
is_major_source = lead_score_data['Lead Source'].isin(major_sources)
lead_score_data['Lead Source'] = lead_score_data['Lead Source'].where(is_major_source, 'Other Social Sites')

# Frequency of each lead source after grouping

lead_score_data['Lead Source'].value_counts()

#### 'Last Activity' Column

In [None]:
# Percentage share of each 'Last Activity' value, with missing values counted as their own level

lead_score_data['Last Activity'].value_counts(normalize = True, dropna = False) * 100

In [None]:
# Lets impute the missing values with the mode of data i.e. clearly 'Email Opened'
# The filled column is assigned back explicitly. Calling fillna(inplace=True) on
# `df[col]` is a chained in-place operation that may act on a temporary object and
# leave the frame unchanged.

lead_score_data['Last Activity'] = lead_score_data['Last Activity'].fillna(lead_score_data['Last Activity'].mode()[0])

In [None]:
lead_score_data['Last Activity'].value_counts()

#### Handling columns with Binary values :

#####  1. Drop those columns with significant data imbalance
#####  2. Drop the columns having only 1 unique entry

In [None]:
# lets find out the unique values for all object datatype columns

for a, b in lead_score_data.select_dtypes(include='object').nunique().to_dict().items():
    print('{} = {}'.format(a,b))

As per the above result, the columns having just 1 unique value are :

1. Magazine
2. Receive More Updates About Our Courses
3. Update me on Supply Chain Content
4. Get updates on DM Content
5. I agree to pay the amount through cheque

##### Note : Above listed columns are having only one value majorly present for all the data points. Practically all of the values for these variables are 'No', therefore, we can drop these columns as they won't help with our analysis.

#### Data Imbalance Check :

In [None]:
# Lets take rest of the binary columns in a new dataframe

lead_score_bin = lead_score_data[['Do Not Email', 'Do Not Call', 'Search', 'Newspaper Article', 'X Education Forums', 
           'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'A free copy of Mastering The Interview']]

# Lets see value counts for each of the above listed columns

for i in lead_score_bin.columns:
    x = (lead_score_bin[i].value_counts(normalize = True)) * 100
    print(x)
    print()

##### Note : Except the column 'A free copy of Mastering The Interview' and 'Do Not Email', all above columns shows heavy data imbalance.

Because of heavy data imbalance, we are dropping the following columns:

1. Do Not Call
2. Search
3. Newspaper Article
4. X Education Forums
5. Newspaper
6. Digital Advertisement
7. Through Recommendations

In [None]:
# Binary (Yes/No) columns removed from the dataset: the imbalanced ones plus the five
# columns that hold a single unique value.
# NOTE(review): this list also contains 'Do Not Email' and 'A free copy of Mastering
# The Interview', which the note above describes as NOT heavily imbalanced and which
# are absent from the stated drop list. Either the list or the note is wrong - confirm
# which is intended before relying on the resulting feature set.
lead_drop_bin = ['Do Not Email', 'Do Not Call', 'Search', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 
            'Through Recommendations', 'A free copy of Mastering The Interview', 'Magazine','Receive More Updates About Our Courses',
            'Update me on Supply Chain Content','Get updates on DM Content','I agree to pay the amount through cheque']

lead_score_data.drop(lead_drop_bin, axis = 1, inplace = True)

In [None]:
# Dropping the "Prospect ID" and "Lead Number" columns

lead_score_data.drop(['Prospect ID', 'Lead Number'], axis = 1, inplace = True)

##### Note : Above columns 'Prospect ID' and 'Lead Number' won't be of any use in the analysis as they had unique values for each rows, therefore, we drop them.

#### Handling Columns with Numerical Values:

#### 'TotalVisits' Column

In [None]:
# fill null values with median of data
# The result is assigned back to the column. fillna(inplace=True) on an
# attribute-accessed column is a chained in-place call that may leave the frame
# unchanged, in which case the integer cast below would fail on the remaining NaNs.

lead_score_data['TotalVisits'] = lead_score_data['TotalVisits'].fillna(lead_score_data['TotalVisits'].median())

# converting the datatype to integer as column 'TotalVisits' can't be decimal

lead_score_data['TotalVisits'] = lead_score_data['TotalVisits'].astype('int')

In [None]:
lead_score_data.TotalVisits

#### 'Page Views Per Visit' Column

In [None]:
# fill null values with median of data
# The result is assigned back to the column. fillna(inplace=True) on `df[col]` is a
# chained in-place call that may leave the frame unchanged.

lead_score_data['Page Views Per Visit'] = lead_score_data['Page Views Per Visit'].fillna(lead_score_data['Page Views Per Visit'].median())

# Lets look at the updated column

lead_score_data['Page Views Per Visit']

In [None]:
# Checking the number of null values again

lead_score_data.isnull().sum().sort_values(ascending=False)

##### Note : Now there are no missing values left in columns of dataset. Lets check percentage of rows retained.

## Step-2 : EDA - Visualising the Data

In [None]:
# Lets print updated info of lead_score_data detaset

lead_score_data.info()

### 2.1 Numerical Variable Analysis

#### ~ Visualising the linear relationship of numerical variables in dataset :

In [None]:
# Plotting all the variables of dataset using pairplot
# sns.pairplot is a figure-level function that builds its own figure, so a preceding
# plt.figure(figsize=...) only produced an extra empty figure and did not size the
# grid. Use pairplot's `height` argument if the facets need resizing.

sns.pairplot(lead_score_data)
plt.show()

In [None]:
# Plotting numeric variables of dataset using pairplot w.r.t "Converted" 

lead = lead_score_data[['TotalVisits','Total Time Spent on Website','Page Views Per Visit','Converted']]
sns.pairplot(lead,diag_kind='kde',hue='Converted')
plt.show()

In [None]:
# Plotting the correlation for numerical variables using Heatmap

plt.figure(figsize=(7,5))
sns.heatmap(lead_score_data[['Converted','TotalVisits','Total Time Spent on Website','Page Views Per Visit']].corr(),cmap="RdYlGn",annot=True)
plt.title("Correlation between Numerical Variables")
plt.show()

##### Note : We can't see any correlation among the above variables.

In [None]:
# describe the numeric variables to find outliers

num = lead_score_data[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']]
num.describe(percentiles=[0.25,0.5,0.75,0.9,0.99])

In [None]:
# capping at 99 percentile
# Series.clip(upper=...) replaces the chained `df[col].loc[mask] = value` assignment,
# which writes into an intermediate object and may leave the frame unchanged. Values
# at or above the 99th percentile end up equal to it, as before.

for capped_col in ['TotalVisits', 'Page Views Per Visit']:
    lead_score_data[capped_col] = lead_score_data[capped_col].clip(upper = lead_score_data[capped_col].quantile(0.99))

##### Note : We can't find any major outliers in numeric variables of dataset.

### 2.2 Categorical Variable Analysis

#### ~ Plotting the relationship of categorical variables w.r.t 'Converted':

In [None]:
# Plot 'Lead Origin' and 'Lead Source' vs 'Converted'

plt.figure(figsize = (12,3))

plt.subplot(1,2,1)
sns.countplot(x='Lead Origin', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('Lead Origin')

plt.subplot(1,2,2)
sns.countplot(x='Lead Source', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('Lead Source')
plt.show()

In [None]:
# Plot 'Last Activity'  and  'Specialization'  vs  'Converted'

plt.figure(figsize = (12,3))

plt.subplot(1,2,1)
sns.countplot(x='Last Activity', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('Last Activity')

plt.subplot(1,2,2)
sns.countplot(x='Specialization', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('Specialization')
plt.show()

In [None]:
# Plot 'What is your current occupation'  and  'Last Notable Activity'  vs  'Converted'

plt.figure(figsize = (12,3))

plt.subplot(1,2,1)
sns.countplot(x='What is your current occupation', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('What is your current occupation')


plt.subplot(1,2,2)
sns.countplot(x='Last Notable Activity', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('Last Notable Activity')
plt.show()


In [None]:
# Plot 'City'  vs 'Converted'

plt.figure(figsize = (7,3))

sns.countplot(x='City', hue='Converted', data= lead_score_data).tick_params(axis='x', rotation = 90)
plt.title('City')
plt.show()

## Step-3 : Data Preparation

#### ~ Creating the dummies variables for categorical variables :

In [None]:
# Creating dummy variables for all categorical variables using 'get_dummies'.

dum_var = pd.get_dummies(lead_score_data[['Lead Origin' ,'Lead Source','Last Activity','Specialization', 'What is your current occupation',
                                          'City', 'Last Notable Activity']], drop_first=True)

#### 3.1 Data Formatting:

In [None]:
# print dataset before adding dummies

lead_score_data.head()

In [None]:
# Merging the 'lead_score_data' dataframe, with the dummy variables dataset and creating new updated dataframe

lead_score_new = pd.concat([lead_score_data, dum_var], axis=1)
lead_score_new

In [None]:
# print updated info for new dataset 'lead_score_new'

lead_score_new.info()

#### ~ Converting Binary (False/True) to 0/1

In [None]:
lead_bin = ['Lead Origin_Landing Page Submission','Lead Origin_Lead Add Forms and Others','Lead Source_Google','Lead Source_Olark Chat',
            'Lead Source_Organic Search','Lead Source_Other Social Sites','Lead Source_Reference','Lead Source_Welingak Website',
            'Last Activity_Converted to Lead','Last Activity_Email Bounced','Last Activity_Email Link Clicked','Last Activity_Email Marked Spam',
            'Last Activity_Email Opened','Last Activity_Email Received','Last Activity_Form Submitted on Website',
            'Last Activity_Had a Phone Conversation','Last Activity_Olark Chat Conversation','Last Activity_Page Visited on Website',
            'Last Activity_Resubscribed to emails','Last Activity_SMS Sent','Last Activity_Unreachable','Last Activity_Unsubscribed',
            'Last Activity_View in browser link Clicked','Last Activity_Visited Booth in Tradeshow','Specialization_Industry Specializations',
            'Specialization_Management Specializations','What is your current occupation_Unemployed','What is your current occupation_Working Professional',
            'City_Non-MH Cities','City_Non-Mumbai MH Cities','Last Notable Activity_Email Bounced','Last Notable Activity_Email Link Clicked',
            'Last Notable Activity_Email Marked Spam','Last Notable Activity_Email Opened','Last Notable Activity_Email Received',
            'Last Notable Activity_Form Submitted on Website','Last Notable Activity_Had a Phone Conversation','Last Notable Activity_Modified',
            'Last Notable Activity_Olark Chat Conversation','Last Notable Activity_Page Visited on Website','Last Notable Activity_Resubscribed to emails',
            'Last Notable Activity_SMS Sent','Last Notable Activity_Unreachable','Last Notable Activity_Unsubscribed',
            'Last Notable Activity_View in browser link Clicked']

In [None]:
# Helper that recodes boolean dummy values as integers

def bin_map(x):
    """Return Series `x` with True mapped to 1 and False mapped to 0."""
    bool_to_int = {True: 1, False: 0}
    return x.map(bool_to_int)

# Recode every dummy column named in lead_bin, one column at a time
dummy_block = lead_score_new[lead_bin]
lead_score_new[lead_bin] = dummy_block.apply(bin_map)

# check the updated data
lead_score_new.head()

In [None]:
lead_score_new.info()

In [None]:
# Drop unnecessary columns as we have already created dummy variable for them.

lead_score_new.drop(['Lead Origin','Lead Source','Last Activity','Specialization', 'What is your current occupation',
                                          'City', 'Last Notable Activity'], axis = 1 , inplace = True)

In [None]:
# display updated data 

lead_score_new.info()

In [None]:
lead_score_new.shape

##  Step-4 : Splitting Data into train and test 

In [None]:
# dropping "Converted" from X_train 

X = lead_score_new.drop(['Converted'], axis = 1)
X.head()

In [None]:
# Putting the target variable in Y

Y = lead_score_new['Converted']
Y.head()

In [None]:
# Spliting the dataset into 70% train and 30% test

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=100)

#### ~ Rescaling the Features using MinMaxScaler

In [None]:
# Scale the three numeric features

scaler = MinMaxScaler()
X_train[['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']] = scaler.fit_transform(X_train[['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']])
X_train.head()

In [None]:
# Checking the correlation coefficients to find out which variables are highly correlated

plt.figure(figsize = (40,40))
sns.heatmap(X_train.corr(), annot = True, cmap="RdYlGn")
plt.show()

#### Note:
1. 'Last Notable Activity_Resubscribed to emails' and 'Last Activity_Resubscribed to emails' have strong correlation.
2. 'Last Notable Activity_Email Marked Spam' and 'Last Activity_Email Marked Spam' is highly correlated .

Lets perform RFE to list out the significant columns

## Step-5  :  Building the logistic regression Models for Train set

#### Building Model using RFE(Recursive Feature Elimination)

In [None]:
# Perform Recursive Feature Elimination 

logreg = LogisticRegression()

In [None]:
# Running RFE with 15 variables as output

rfe = RFE(logreg,n_features_to_select= 15)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Features that have been selected by RFE

list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Put all the columns selected by RFE in the variable 'col'
col = X_train.columns[rfe.support_]

In [None]:
# Display list of features rejected by RFE

X_train.columns[~rfe.support_]

In [None]:
# Selecting columns selected by RFE

X_train = X_train[col]

### Building Model - 1

In [None]:
X_train_sm = sm.add_constant(X_train)
logm1 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
res = logm1.fit()
res.summary()

#### Calculate VIF - Model 1

In [None]:
# Variance inflation factor of every feature currently in X_train (Model 1)

vif_values = [variance_inflation_factor(X_train.values, idx) for idx in range(X_train.shape[1])]
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_values})
vif['VIF'] = round(vif['VIF'], 2)
# Highest VIF first, so the most collinear features are at the top
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Note :
The above table shows that the 'Lead Origin_Lead Add Forms and Others' variable has a very high VIF value, which indicates multicollinearity, so it needs to be dropped.

In [None]:
X_train.drop('Lead Origin_Lead Add Forms and Others', axis = 1, inplace = True)

### Building Model - 2

In [None]:
# Refit the model with the new set of features

logm2 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm2.fit().summary()

#### Calculate VIF - Model 2

In [None]:
# Calculate VIF for 2nd model

vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Note :
The VIFs are now all less than 5. So let's drop the ones with the high p-values beginning with Last Notable Activity_Had a Phone Conversation.

In [None]:
# Dropping 'Last Notable Activity_Had a phone conversation' column from dataset

X_train.drop('Last Notable Activity_Had a Phone Conversation', axis = 1, inplace = True)

### Building Model - 3

In [None]:
# Lets build model-3

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Dropping 'Last Notable Activity_Unreachable' column from dataset

X_train.drop('Last Notable Activity_Unreachable', axis = 1, inplace = True)

### Building Model - 4

In [None]:
# Lets build model-4 on the features that remain in X_train after the drops above
# NOTE(review): the model is stored under the name `logm1`, which was already used for
# Model 1 and Model 3, and the fitted result is only used for .summary(). `res` is not
# updated here, so it still holds the Model-1 fit created earlier.

logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

#### Calculate VIF - Model 4

In [None]:
# Calculate VIF for model - 4

vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### Note : Now we can finalize this logistic regression model (Model 4) for further analysis due to the following points:

1. VIFs for all features are less than 5.
2. p-values are almost zero for all variables.
3. Overall linear model has become significant now.

#### Now let's make predictions using this final set of features.

## Step-7  :  Predictions and Model Evaluation 

### 7.1 Prediction on Train dataset

In [None]:
# predict the probabilities on the train set using 'predict'
# `res` and `X_train_sm` were created for Model 1 (all 15 RFE features) and are never
# refreshed by the later models, so predicting with them would evaluate Model 1 rather
# than the final model. Refit on the final feature set held in X_train and predict
# from that fit. New names are used so the Model-1 objects stay untouched for any
# other cell that relies on them.
# NOTE(review): any test-set scoring should use `res_final` with the same columns.

X_train_final_sm = sm.add_constant(X_train)
res_final = sm.GLM(y_train, X_train_final_sm, family = sm.families.Binomial()).fit()
y_train_pred = res_final.predict(X_train_final_sm)
y_train_pred[:10]

In [None]:
# Reshape it into an array

y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

#### ~ Creating a dataframe with the predicted probabilities and the actual conversion flag

In [None]:
# Lets create a new dataframe containing the actual conversion flag and the probabilities predicted by the model

y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final.head()

#### ~ Create New column 'Predicted' with 1 if Conversion_Prob > 0.5 else 0

In [None]:
y_train_pred_final['Predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

##### Now that you have made conversion predictions and also have the probabilities using them,next step is to evaluate the model.

#### ~ Create Confusion Matrix

In [None]:
# Creating confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted )
confusion

#### ~ Accuracy

In [None]:
# Check the overall accuracy
metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted)

##### Note :
We got Accuracy around 81% which is a good value. Let's evaluate the other metrics as well.

In [None]:
# true positive
TP = confusion[1,1]
# true negatives
TN = confusion[0,0]
# false positives
FP = confusion[0,1] 
# false negatives
FN = confusion[1,0]

#### ~ Sensitivity

In [None]:
# Calculating the sensitivity
TP/(TP+FN)

#### ~ Specificity

In [None]:
# Calculating the specificity
TN/(TN+FP)

The current cut-off to loosely check the model performance was 0.5, with this we have around:
##### 81% Accuracy
##### 70% Sensitivity
##### 87% Specificity

### Create Optimal Cut-Off (ROC) curve

In [None]:
# Create ROC function

def plot_roc( actual, probs ):
    """Draw the ROC curve for a set of predicted scores and show it.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores / probabilities passed to ``roc_curve``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    _, ax = plt.subplots(figsize=(5, 5))
    ax.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line: the ROC of a random classifier
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='False Positive Rate or [1 - True Negative Rate]',
        ylabel='True Positive Rate',
        title='Receiver operating characteristic example',
    )
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# Compute the ROC points (false/true positive rate per threshold) on the training predictions.
# NOTE(review): plot_roc recomputes these itself and `thresholds` is reassigned later by
# precision_recall_curve, so these three values appear to be unused.
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob, drop_intermediate = False )

In [None]:
# Plot the ROC curve (with its AUC in the legend) for the training-set probabilities

plot_roc(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

#### Note :

The area under the ROC curve is 0.87, which indicates that the model separates the two classes well.
Let's also check the sensitivity and specificity tradeoff to find the optimal cutoff point.

In [None]:
# Add one 0/1 prediction column per probability cut-off (0.0, 0.1, ..., 0.9);
# each column is keyed by its cut-off value

num = [x / 10 for x in range(10)]
for cutoff in num:
    exceeds_cutoff = y_train_pred_final['Conversion_Prob'] > cutoff
    y_train_pred_final[cutoff] = exceeds_cutoff.astype('int64')
y_train_pred_final.head()

#### ~ Accuracy, Sensitivity and Specificity with different probability cut-off values

In [None]:
# Summarise accuracy, sensitivity and specificity at each probability cut-off.
# Relies on the 0/1 prediction columns keyed 0.0 ... 0.9 that already exist
# in y_train_pred_final.

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

rows = []
for i in num:
    # 2x2 confusion matrix; row-major order is TN, FP, FN, TP
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i])
    tn, fp, fn, tp = cm1.ravel()
    rows.append({
        'prob': i,
        'accuracy': (tn + tp) / cm1.sum(),
        'sensi': tp / (tp + fn),
        'speci': tn / (tn + fp),
    })

# Build the frame once instead of enlarging an empty frame row by row with .loc,
# so the columns are numeric from the start. The index is the cut-off value, as before.
cutoff_df = pd.DataFrame(rows, columns=['prob','accuracy','sensi','speci'], index=num)
cutoff_df

In [None]:
# Plot accuracy, sensitivity and specificity against the probability cut-off;
# the region where the three curves meet suggests a balanced cut-off

cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### Note :

From the above graph, it is visible that the optimal cutoff is around 0.35.

#### ~ Create New column 'final_predicted' with 1 if Conversion_Prob > 0.35 else 0

In [None]:
# Re-label the training leads with the 0.35 cut-off (1 when strictly above it)
above_cutoff = y_train_pred_final['Conversion_Prob'] > 0.35
y_train_pred_final['final_predicted'] = above_cutoff.astype('int64')
y_train_pred_final.head()

#### ~ Accuracy for 'final_predicted'

In [None]:
# Overall training accuracy of the 'final_predicted' labels

accuracy_score(
    y_true=y_train_pred_final['Converted'],
    y_pred=y_train_pred_final['final_predicted'],
)

In [None]:
# Confusion matrix of actual vs. 'final_predicted' on the training set

confusion2 = confusion_matrix(
    y_true=y_train_pred_final['Converted'],
    y_pred=y_train_pred_final['final_predicted'],
)
confusion2

In [None]:
# Unpack the 2x2 matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion2.ravel()

#### ~ Sensitivity

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)
TP/(TP+FN)

#### ~ Specificity

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)
TN/(TN+FP)

#### Note :

With the current cutoff of 0.35, we get metrics of around:

##### Accuracy - 80%
##### Sensitivity - 80%
##### Specificity - 79%

### 7.2 Prediction on Test set

In [None]:
# Scale the numeric test columns with the existing scaler, using 'transform' only (no re-fit).
# The result is written back into X_test, so running this cell a second time
# would scale the already-scaled values again.

scaled_cols = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

In [None]:
# Restrict X_test to the entries of `col`.
# NOTE(review): `col` is presumably the list of features kept for the final model — confirm.
X_test = X_test[col]


In [None]:
# Add the constant (intercept) column to the test features
X_test_sm = sm.add_constant(X_test[col])
# Preview the first rows only; the frame was previously displayed in full, twice
X_test_sm.head()

In [None]:
# Predicted conversion probabilities for the test set
y_test_pred = res.predict(X_test_sm)

# Put predictions and actual labels into frames with a fresh 0..n-1 index,
# so that they line up row by row when placed side by side
y_pred_df = pd.DataFrame(y_test_pred).reset_index(drop=True)
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)

# Join column-wise and give the unnamed prediction column (label 0) a proper name
y_pred_final = pd.concat([y_test_df, y_pred_df], axis=1).rename(
    columns={0: 'Conversion_Prob'}
)
y_pred_final.head()

#### ~ Prediction using cut off 0.35

In [None]:
# Flag a test lead as converted (1) when its predicted probability exceeds 0.35

y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.35 else 0)
# Show a preview rather than dumping the entire frame
y_pred_final.head()

#### ~ Accuracy

In [None]:
# Overall test-set accuracy of the 'final_predicted' labels

accuracy_score(
    y_true=y_pred_final['Converted'],
    y_pred=y_pred_final['final_predicted'],
)

#### ~ Create Confusion Matrix

In [None]:
# Confusion matrix of actual vs. 'final_predicted' on the test set

conf2 = confusion_matrix(
    y_true=y_pred_final['Converted'],
    y_pred=y_pred_final['final_predicted'],
)
conf2

In [None]:
# Unpack the 2x2 matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = conf2.ravel()

#### ~ Sensitivity

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN)

TP/(TP+FN)

#### ~ Specificity

In [None]:
# Specificity (true negative rate) = TN / (TN + FP)

TN/(TN+FP)

##### Note : With the current cut off as 0.35 we have accuracy, sensitivity and specificity of around 80%

## Step-8  :  Precision-Recall 

### 8.1 Prediction on Train set using Precision-Recall View

#### ~ Create Confusion Matrix

In [None]:
# Confusion matrix of the 0.5-cut-off 'Predicted' labels on the training set
confusion = confusion_matrix(
    y_true=y_train_pred_final['Converted'],
    y_pred=y_train_pred_final['Predicted'],
)
confusion

#### ~ Precision

In [None]:
# Precision = TP / (TP + FP), where TP = confusion[1,1] and FP = confusion[0,1]

confusion[1,1]/(confusion[0,1]+confusion[1,1])

#### ~ Recall

In [None]:
# Recall = TP / (TP + FN), where TP = confusion[1,1] and FN = confusion[1,0]

confusion[1,1]/(confusion[1,0]+confusion[1,1])

### 8.1.1 Precision and recall tradeoff

In [None]:
# Preview the actual labels next to the 0.5-cut-off predictions
# (replaces a bare expression that displayed both full Series as a tuple)
y_train_pred_final[['Converted', 'Predicted']].head()

In [None]:
# Precision (p) and recall (r) at every candidate threshold of the training probabilities.
# Note: this reassigns `thresholds`, which was previously set by metrics.roc_curve.
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

In [None]:
# Plot precision and recall against the decision threshold.
# precision_recall_curve returns one more precision/recall value than
# thresholds, hence the [:-1] slices.

plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
# Labels and legend so the figure can be read on its own
plt.xlabel("Probability threshold")
plt.ylabel("Score")
plt.title("Precision-Recall trade-off")
plt.legend(loc="best")
plt.show()

#### Note :

From the above graph, it is visible that the optimal cutoff is around 0.43.

#### ~ Create New column 'final_predicted' with 1 if Conversion_Prob > 0.43 else 0

In [None]:
# Apply the 0.43 cut-off chosen from the precision-recall plot.
# The cell previously used 0.41, which contradicted the 0.43 stated in the
# surrounding notes and used for the test set.
# This overwrites the 'final_predicted' column created earlier with the 0.35 cut-off.
y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.43 else 0)
y_train_pred_final.head()

#### ~ Create Confusion Matrix

In [None]:
# Confusion matrix of actual vs. the re-computed 'final_predicted' on the training set

conf2 = confusion_matrix(
    y_true=y_train_pred_final['Converted'],
    y_pred=y_train_pred_final['final_predicted'],
)
conf2

#### ~ Accuracy

In [None]:
# Overall training accuracy of the re-computed 'final_predicted' labels
accuracy_score(
    y_true=y_train_pred_final['Converted'],
    y_pred=y_train_pred_final['final_predicted'],
)

In [None]:
# Unpack the 2x2 matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = conf2.ravel()

#### ~ Precision

In [None]:
# Precision = TP / (TP + FP)

TP / (TP + FP)

#### ~ Recall

In [None]:
# Recall = TP / (TP + FN)

TP / (TP + FN)

##### Note : With the current cut off as 0.43, Precision and Recall are around 74% and 76% respectively

### 8.2 Prediction on Test set using Precision-Recall View

In [None]:
# Make predictions on the test set and store it in the variable 'Y_test_pred'

# Predict on the test set (with the constant column added) and store the result in 'y_test_pred'
y_test_pred = res.predict(sm.add_constant(X_test))

In [None]:
# Peek at the first ten test-set predictions
y_test_pred[:10]

In [None]:
# Convert the test predictions (y_test_pred) to a dataframe

y_pred = pd.DataFrame(y_test_pred)
y_pred.head()

In [None]:
# Converting y_test to dataframe

# Wrap the actual test labels in a dataframe so they can be concatenated with the predictions
lead_y_test = pd.DataFrame(y_test)

In [None]:
# Removing index for both dataframes to append them side by side 

# Give both frames a fresh 0..n-1 index so they line up row by row when concatenated
y_pred = y_pred.reset_index(drop=True)
lead_y_test = lead_y_test.reset_index(drop=True)

In [None]:
# Append lead_y_test and y_pred

# Place actual labels and predicted probabilities side by side (column-wise concat)
y_pred_final = pd.concat([lead_y_test, y_pred],axis=1)

# Check the first rows of 'y_pred_final'

y_pred_final.head()

In [None]:
# Rename the unnamed prediction column (label 0) to 'Conversion_Prob'

y_pred_final= y_pred_final.rename(columns = {0 : 'Conversion_Prob'})

# Check the first rows of y_pred_final after the rename

y_pred_final.head()

#### ~ Create New column 'final_predicted' with 1 if Conversion_Prob > 0.43 else 0

In [None]:
# Making predictions on the test set using 0.43 as the cutoff

# Flag a test lead as converted (1) when its predicted probability exceeds 0.43
y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.43 else 0)
# Show a preview rather than dumping the entire frame
y_pred_final.head()

#### ~ Create Confusion Matrix

In [None]:
# Confusion matrix of actual vs. 'final_predicted' on the test set

confusion2 = confusion_matrix(
    y_true=y_pred_final['Converted'],
    y_pred=y_pred_final['final_predicted'],
)
confusion2

#### ~ Accuracy

In [None]:
# Overall test-set accuracy of the 'final_predicted' labels

accuracy_score(
    y_true=y_pred_final['Converted'],
    y_pred=y_pred_final['final_predicted'],
)

In [None]:
# Unpack the 2x2 matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion2.ravel()

#### ~ Precision

In [None]:
# Precision = TP / (TP + FP)

TP / (TP + FP)

#### ~ Recall

In [None]:
# Recall = TP / (TP + FN)

TP / (TP + FN)

##### Note : With the current cut off as 0.43, Precision is around 76% and Recall around 72% with 80% Accuracy

## Conclusion :

#### There are some significant variables that mattered the most in identifying the promising "Hot Leads" for company:

1. "TotalVisits" : The total number of visits made by the customer on the website.
2. "Total Time Spent on Website"
3. "Page Views Per Visit" : Average number of pages on the website viewed during the visits.
4. "Lead Source" : The source of the lead, in particular:
a. Google
b. Olark Chat
c. Welingak website
d. Reference
5. "Last Activity" : Majority of last activity performed by customer can be
a. Olark Chat Conversation
b. Converted to Lead
c. Email Bounced
d. Had a Phone Conversation
6. "Last Notable Activity" : The last notable activity performed by the customer can be SMS Sent
7. The Leads having Current Occupation as "Working Professional".

#### X Education can improve its conversion rate by keeping the above factors in mind, focusing on the leads most likely to convert and buy its courses.