In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Visualisation
from matplotlib.pyplot import xticks
%matplotlib inline

# Data display customization
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

### DATA PREPARATION

#### DATA LOADING

In [None]:
df = pd.read_csv('Leads.csv')
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Check that no 'Prospect ID' occurs more than once (True means no duplicates)
df.duplicated(subset = 'Prospect ID').sum() == 0

No duplicate

In [None]:
df.describe()

### DATA PROCESSING/CLEANING

#### CHANGE THE SELECT DATA IN THE FIELDS TO NaN

In [None]:
df = df.replace('Select',np.nan)

### CHECKING THE % OF NULL VALUES 

In [None]:
round(100*(df.isnull().sum())/len(df),2)

In [None]:
# Drop the columns in which more than 70% of the values are missing.
# The columns are named through `columns=` because pandas 2.0 no longer
# accepts the axis as a positional argument to DataFrame.drop.
null_pct = round(100*(df.isnull().sum()/len(df.index)), 2)
df = df.drop(columns=null_pct[null_pct > 70].index)

In [None]:
# Lead Quality: 
# Indicates the quality of the lead, based on the data and the intuition of the employee who has been assigned to the lead

In [None]:
df['Lead Quality'].describe()

In [None]:
# Count of leads per 'Lead Quality' value.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.countplot(x = df['Lead Quality'])
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
# Imputing 'Not Sure' in the place where left blank(NaN) in the 'Lead Quality'

In [None]:
df['Lead Quality'] = df['Lead Quality'].replace(np.nan,'Not Sure')

In [None]:
# Count of leads per 'Lead Quality' value after the 'Not Sure' imputation.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.countplot(x = df['Lead Quality'])
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
# 2x2 grid: count plots for the two Asymmetrique index columns (left) and
# box plots for the two Asymmetrique score columns (right).
# Each Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
fig, axs = plt.subplots(2,2, figsize = (10,7.5))
plt1 = sns.countplot(x = df['Asymmetrique Activity Index'], ax = axs[0,0])
plt2 = sns.boxplot(x = df['Asymmetrique Activity Score'], ax = axs[0,1])
plt3 = sns.countplot(x = df['Asymmetrique Profile Index'], ax = axs[1,0])
plt4 = sns.boxplot(x = df['Asymmetrique Profile Score'], ax = axs[1,1])
plt.tight_layout()
plt.show()

In [None]:
# There is too much variation in these parameters to impute any value reliably,
# and about 45% of their values are missing, so we drop these columns.
# `columns=` is used because pandas 2.0 no longer accepts a positional axis.
df = df.drop(columns=['Asymmetrique Activity Index','Asymmetrique Activity Score','Asymmetrique Profile Index','Asymmetrique Profile Score'])

In [None]:
round(100*(df.isnull().sum()/len(df.index)), 2)

In [None]:
#City
df.City.describe()

In [None]:
# Count of leads per City.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.countplot(x = df.City)
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
# Around 60% of the data is Mumbai so we can impute Mumbai in the missing values.
df['City'] = df['City'].replace(np.nan, 'Mumbai')

In [None]:
# Specialization

In [None]:
df.Specialization.describe()

In [None]:
# Count of leads per Specialization.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.countplot(x = df.Specialization)
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
# Making a category "Others" for the missing values.
df['Specialization'] = df['Specialization'].replace(np.nan, 'Others')

In [None]:
#Rechecking the null value
round(100*(df.isnull().sum()/len(df.index)), 2)

In [None]:
#TAGS

In [None]:
df.Tags.describe()

In [None]:
# Count of leads per Tags value on a wide figure.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
fig, axs = plt.subplots(figsize = (15,7.5))
sns.countplot(x = df.Tags)
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
# Imputing blanks with top value available in the data set
df['Tags'] = df['Tags'].replace(np.nan, 'Will revert after reading the email')

In [None]:
# What matters most to you in choosing a course

In [None]:
df['What matters most to you in choosing a course'].describe()

In [None]:
# Imputing the blanks with 'Better Career Prospects'
df['What matters most to you in choosing a course'] = df['What matters most to you in choosing a course'].replace(np.nan, 'Better Career Prospects')

In [None]:
#What is your current occupation
df['What is your current occupation'].describe()

In [None]:
# Percentage of 'Unemployed' among the non-missing occupation values.
# Computed from the column itself instead of typing in the freq/count
# figures read off the describe() output, so it stays correct if the data changes.
occupation = df['What is your current occupation']
Unemployed = (occupation == 'Unemployed').sum() / occupation.count() * 100
print(Unemployed)

In [None]:
#Nearly 86% is Unemployed so replacing the blanks with Unemployed
df['What is your current occupation'] = df['What is your current occupation'].replace(np.nan, 'Unemployed')

In [None]:
# Country
df.Country.describe()

In [None]:
# Country is India for most values so let's impute the same in missing values.
df['Country'] = df['Country'].replace(np.nan, 'India')

In [None]:
round(100*(df.isnull().sum()/len(df.index)), 2)

In [None]:
# Rest missing values are under 2% so we can drop these rows.
df.dropna(inplace = True)

In [None]:
round(100*(df.isnull().sum()/len(df.index)), 2)

In [None]:
df.to_csv('Leads_cleaned')

## Exploratory Data Analytics

### Univariate Analysis

#### Converted

In [None]:
# Converted is the target variable, Indicates whether a lead has been successfully converted (1) or not (0)
Converted = (sum(df['Converted'])/len(df['Converted'].index))*100
Converted

#### Lead Origin

In [None]:
sns.countplot(x = "Lead Origin", hue = "Converted", data = df)
plt.tick_params(axis='x',rotation=90)
plt.show()

### Inference
 - API and Landing Page Submission have a 30-35% conversion rate, but the number of leads originating from them is considerable.
 - Lead Add Form has a conversion rate of more than 90%, but its number of leads is not very high.
 - Lead Import accounts for very few leads.
#### To improve the overall lead conversion rate, we need to focus on improving the conversion of leads from the API and Landing Page Submission origins and on generating more leads from Lead Add Form.

### Lead Source

In [None]:
fig, axs = plt.subplots(figsize = (15,7.5))
sns.countplot(x = "Lead Source", hue = "Converted", data = df)
plt.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
df['Lead Source'] = df['Lead Source'].replace(['google'], 'Google')
df['Lead Source'] = df['Lead Source'].replace(['Click2call', 'Live Chat', 'NC_EDM', 'Pay per Click Ads', 'Press_Release',
  'Social Media', 'WeLearn', 'bing', 'blog', 'testone', 'welearnblog_Home', 'youtubechannel'], 'Others')

In [None]:
sns.countplot(x = "Lead Source", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Inference
- Google and Direct traffic generates maximum number of leads.
- Conversion Rate of reference leads and leads through welingak website is high.
#### To improve the overall lead conversion rate, the focus should be on improving the conversion of Olark chat, organic search, direct traffic, and Google leads, and on generating more leads from references and the Welingak website.

### Do Not Email & Do Not Call

In [None]:
fig, axs = plt.subplots(1,2,figsize = (15,7.5))
sns.countplot(x = "Do Not Email", hue = "Converted", data = df, ax = axs[0])
sns.countplot(x = "Do Not Call", hue = "Converted", data = df, ax = axs[1])
plt.show()

#### Data Inspection

#### Total Visits

In [None]:
df['TotalVisits'].describe(percentiles=[0.05,.25, .5, .75, .90, .95, .99])

In [None]:
# Box plot of TotalVisits before capping.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.boxplot(x = df['TotalVisits'])

In [None]:
# As we can see, there are a number of outliers in the data.
# Capping the values at the 5th and 95th percentiles for analysis.

In [None]:
# Cap TotalVisits at its 5th and 95th percentiles.
# Series.clip replaces the chained-indexing assignment
# (df[col][mask] = value), which raises SettingWithCopyWarning and has no
# effect at all when pandas copy-on-write is enabled.
percentiles = df['TotalVisits'].quantile([0.05,0.95]).values
df['TotalVisits'] = df['TotalVisits'].clip(lower=percentiles[0], upper=percentiles[1])

In [None]:
# Box plot of TotalVisits after capping.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.boxplot(x = df['TotalVisits'])
plt.show()

In [None]:
sns.boxplot(y = 'TotalVisits', x = 'Converted', data = df)
plt.show()

### Inference
- Median for converted and not converted leads are the same.
- Nothing conclusive can be said on the basis of Total Visits

In [None]:
### Total time spent on website

In [None]:
df['Total Time Spent on Website'].describe()

In [None]:
# Box plot of the total time spent on the website.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.boxplot(x = df['Total Time Spent on Website'])
plt.show()

In [None]:
sns.boxplot(y = 'Total Time Spent on Website', x = 'Converted', data = df)
plt.show()

### Inference
- Leads spending more time on the website are more likely to be converted.
- Website should be made more engaging to make leads spend more time.

In [None]:
#### Page views per visit

In [None]:
df['Page Views Per Visit'].describe()

In [None]:
# Box plot of Page Views Per Visit before capping.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.boxplot(x = df['Page Views Per Visit'])

In [None]:
# As we can see, there are a number of outliers in the data.
# We will cap the values at the 5th and 95th percentiles for analysis.

In [None]:
# Cap Page Views Per Visit at its 5th and 95th percentiles.
# Series.clip replaces the chained-indexing assignment
# (df[col][mask] = value), which raises SettingWithCopyWarning and has no
# effect at all when pandas copy-on-write is enabled.
percentiles = df['Page Views Per Visit'].quantile([0.05,0.95]).values
df['Page Views Per Visit'] = df['Page Views Per Visit'].clip(lower=percentiles[0], upper=percentiles[1])

In [None]:
# Box plot of Page Views Per Visit after capping.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional argument is `data`.
sns.boxplot(x = df['Page Views Per Visit'])
plt.show()

In [None]:
sns.boxplot(y = 'Page Views Per Visit', x = 'Converted', data = df)
plt.show()

#### Inference
- Median for converted and unconverted leads is the same.
- Nothing can be said specifically for lead conversion from Page Views Per Visit

#### LAST ACTIVITY

In [None]:

df['Last Activity'].describe()

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Last Activity", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

In [None]:
# Keeping the considerable last activities as such and club all others to "Other_Activity"
df['Last Activity'] = df['Last Activity'].replace(['Had a Phone Conversation', 'View in browser link Clicked', 
                                                       'Visited Booth in Tradeshow', 'Approached upfront',
                                                       'Resubscribed to emails','Email Received', 'Email Marked Spam'], 'Other_Activity')

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Last Activity", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Inference
 - Most of the lead have their Email opened as their last activity.
 - Conversion rate for leads with last activity as SMS Sent is almost 60%

In [None]:
### Country

In [None]:
df.Country.describe()

##### Almost all leads are from India, so no useful inference can be drawn from Country.

### Specialization

In [None]:
df.Specialization.describe()

In [None]:
df['Specialization'] = df['Specialization'].replace(['Others'], 'Other_Specialization')

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Specialization", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Inference
- Focus should be more on the Specialization with high conversion rate

In [None]:
# What is your current occupation
# Renaming the 'Other' occupation category to 'Other_Occupation'

In [None]:
df['What is your current occupation'] = df['What is your current occupation'].replace(['Other'], 'Other_Occupation')

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "What is your current occupation", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Inference
- Working Professionals going for the course have high chances of joining it.
- Unemployed leads are the most in numbers but has around 30-35% conversion rate

#### What matters most to you in choosing a course

In [None]:
df['What matters most to you in choosing a course'].describe()

#### Inference
- Most entries are 'Better Career Prospects'. No inference can be drawn from this parameter.

In [None]:
# Print describe() for each of the Yes/No marketing and preference columns,
# separated by rule lines. A loop replaces eleven copy-pasted stanzas and
# prints each column's real name (the first label was misspelled 'Seach').
binary_columns = [
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
    'A free copy of Mastering The Interview',
]

print('--------------------------------')
for column in binary_columns:
    print(column)
    print(df[column].describe())
    print('--------------------------------')

#### Inference
- Most entries are 'No'. No Inference can be drawn with this parameter.

#### Tags

In [None]:
fig, axs = plt.subplots(figsize = (15,5))
sns.countplot(x = "Tags", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

In [None]:
# Keep the frequently occurring tags as they are and club the listed,
# less common tags into a single "Other_Tags" category
df['Tags'] = df['Tags'].replace(['In confusion whether part time or DLP', 'in touch with EINS','Diploma holder (Not Eligible)',
                                     'Approached upfront','Graduation in progress','number not provided', 'opp hangup','Still Thinking',
                                    'Lost to Others','Shall take in the next coming month','Lateral student','Interested in Next batch',
                                    'Recognition issue (DEC approval)','Want to take admission but has financial problems',
                                    'University not recognized'], 'Other_Tags')

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "Tags", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### LEAD ORIGIN

In [None]:
df['Lead Origin'].describe()

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "Lead Origin", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Lead Quality

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "Lead Quality", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

#### City

In [None]:
df.City.describe()

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "City", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

### Inference
 - Most leads are from mumbai with around 30% conversion rate.

#### Last Notable Activity

In [None]:
df['Last Notable Activity'].describe()

In [None]:
fig, axs = plt.subplots(figsize = (10,5))
sns.countplot(x = "Last Notable Activity", hue = "Converted", data = df)
xticks(rotation = 90)
plt.show()

#### Results
- Based on the univariate analysis we found that many columns do not add any information to the model, hence we can drop them for further analysis.

In [None]:
# Build the modelling frame by dropping the columns the univariate analysis
# found uninformative. `columns=` is used because pandas 2.0 no longer
# accepts the axis as a positional argument to DataFrame.drop.
data = df.drop(columns=['Lead Number','What matters most to you in choosing a course','Search','Magazine','Newspaper Article','X Education Forums','Newspaper',
           'Digital Advertisement','Through Recommendations','Receive More Updates About Our Courses','Update me on Supply Chain Content',
           'Get updates on DM Content','I agree to pay the amount through cheque','A free copy of Mastering The Interview','Country'])

In [None]:
data.shape

In [None]:
data.head()

### DATA PREPARATION

In [None]:
#### Converting some binary variables (Yes/No) to 1/0

In [None]:
# List of variables to map
varlist =  ['Do Not Email', 'Do Not Call']
# Defining the map function
def binary_map(x):
    """Translate a Series of 'Yes'/'No' strings into 1/0.

    Any value other than 'Yes' or 'No' has no entry in the lookup table and
    therefore becomes NaN, as with any `Series.map` over a dict.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

data[varlist] = data[varlist].apply(binary_map)

In [None]:
# Create dummy variables for the categorical columns, dropping the first
# level of each to avoid the dummy-variable trap.
# dtype=int is requested explicitly: pandas >= 2.0 returns bool dummies by
# default, and statsmodels cannot fit on a frame that mixes bool and numeric
# columns (the design matrix is cast to object dtype).
dummy1 = pd.get_dummies(data[['Lead Origin', 'Lead Source', 'Last Activity', 'Specialization','What is your current occupation',
                              'Tags','Lead Quality','City','Last Notable Activity']], drop_first=True, dtype=int)
dummy1.head()

In [None]:
# Adding the results to the master dataframe
data = pd.concat([data, dummy1], axis=1)
data.head()

In [None]:
data = data.drop(['Lead Origin', 'Lead Source', 'Last Activity',
                  'Specialization','What is your current occupation','Tags',
                  'Lead Quality','City','Last Notable Activity'], axis = 1)

In [None]:
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Putting feature variable to X
X = data.drop(['Prospect ID','Converted'], axis=1)

In [None]:
X.head()

In [None]:
# Putting response variable to y
y = data['Converted']

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

#### Feature Scaling

In [None]:
#### Using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.fit_transform(X_train[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])
X_train.head()

In [None]:
# Checking the conversion rate: sum of the Converted flags over the number of rows, in percent
Converted = (sum(data['Converted'])/len(data['Converted'].index))*100
print(Converted)
print('We have almost 38% conversion')

In [None]:
# Correlation matrix of the model-ready frame.
# `data` (numeric columns plus dummy variables) is used rather than `df`:
# `df` still holds the raw categorical columns and none of the dummies, and
# DataFrame.corr raises on string columns in pandas >= 2.0.
# 'Prospect ID' is left out of the matrix.
plt.figure(figsize = (14,10))
sns.heatmap(data.drop(columns=['Prospect ID']).corr(),annot = True, cmap="YlGnBu")
plt.show()

In [None]:
# THERE IS NO highly correlated dummy variables

#### Model Building

In [None]:
import statsmodels.api as sm

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Feature selection using RFE (recursive feature elimination)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

from sklearn.feature_selection import RFE
# Keep 15 features. n_features_to_select is passed by keyword because
# scikit-learn >= 1.0 rejects it as a positional argument.
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

In [None]:
#Assessing the model with StatsModels

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Remove 'Tags_invalid number' from the selected feature names.
# Index.drop has no axis argument; the stray positional 1 was being bound to `errors`.
col1 = col.drop('Tags_invalid number')

In [None]:
X_train_sm = sm.add_constant(X_train[col1])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Remove 'Tags_wrong number given' from the remaining feature names.
# Index.drop has no axis argument; the stray positional 1 was being bound to `errors`.
col2 = col1.drop('Tags_wrong number given')

In [None]:
X_train_sm = sm.add_constant(X_train[col2])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

#### Creating a dataframe with the actual conversion flag and the predicted probabilities

In [None]:
# Pair each training row's actual Converted flag with its predicted probability.
y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Converted_prob':y_train_pred})
# NOTE(review): this stores y_train's row index under the name 'Prospect ID';
# it looks like the DataFrame row label rather than the 'Prospect ID' column
# of the source data - confirm that this is intended.
y_train_pred_final['Prospect ID'] = y_train.index
y_train_pred_final.head()

#### Creating new column 'predicted' with 1 if Converted_prob > 0.5 else 0

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Converted_prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Predicted         not_converted    converted
# Actual
# not_converted          3756           149
# converted               363          2083

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.predicted))

### Checking the VIF's

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for each feature of the final model (col2).
# The VIFs are computed on the same columns that supply the feature names;
# computing them on X_train[col] (which still contains the two dropped
# features) paired col2's names with VIF values of different columns.
# astype(float) gives statsmodels a purely numeric matrix.
vif_features = X_train[col2].astype(float)
vif = pd.DataFrame()
vif['Features'] = vif_features.columns
vif['VIF'] = [variance_inflation_factor(vif_features.values, i) for i in range(vif_features.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Variance inflation factor for each feature of the final model (col2).
# (This cell repeats the previous VIF computation.)
# The VIFs are computed on the same columns that supply the feature names;
# computing them on X_train[col] (which still contains the two dropped
# features) paired col2's names with VIF values of different columns.
# astype(float) gives statsmodels a purely numeric matrix.
vif_features = X_train[col2].astype(float)
vif = pd.DataFrame()
vif['Features'] = vif_features.columns
vif['VIF'] = [variance_inflation_factor(vif_features.values, i) for i in range(vif_features.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Substituting the value of true positive
TP = confusion[1,1]
# Substituting the value of true negatives
TN = confusion[0,0]
# Substituting the value of false positives
FP = confusion[0,1] 
# Substituting the value of false negatives
FN = confusion[1,0]

In [None]:
# Calculating the sensitivity
TP/(TP+FN)

In [None]:
# Calculating the specificity
TN/(TN+FP)

###  Plotting the ROC Curve

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve of `probs` against `actual` and show it.

    The area under the curve is shown in the legend, and the dashed diagonal
    from (0, 0) to (1, 1) is drawn for reference. Returns None.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate = False
    )
    area = metrics.roc_auc_score( actual, probs )

    plt.figure(figsize=(5, 5))
    plt.plot( false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area )
    plt.plot([0, 1], [0, 1], 'k--')   # reference diagonal

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, 
                                         y_train_pred_final.Converted_prob, drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Converted_prob)

###  Finding Optimal Cutoff Point
- Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# For each probability cutoff 0.0, 0.1, ..., 0.9 add a column (keyed by the
# cutoff value itself) holding 1 where Converted_prob exceeds the cutoff, else 0.
numbers = [tenth / 10 for tenth in range(10)]
for cutoff in numbers:
    y_train_pred_final[cutoff] = y_train_pred_final.Converted_prob.map(
        lambda prob, threshold=cutoff: 1 if prob > threshold else 0
    )
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity of the training predictions at each
# probability cutoff, one row per cutoff (the row label is the cutoff too).
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for cutoff in num:
    matrix = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[cutoff] )
    # Rows of the matrix are actual classes, columns are predicted classes.
    (true_neg, false_pos), (false_neg, true_pos) = matrix

    accuracy = (true_neg + true_pos) / matrix.sum()
    sensi = true_pos / (false_neg + true_pos)
    speci = true_neg / (true_neg + false_pos)
    cutoff_df.loc[cutoff] = [cutoff, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Plotting it
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Converted_prob.map( lambda x: 1 if x > 0.2 else 0)
y_train_pred_final.head()

### Assigning Lead Score

In [None]:
y_train_pred_final['Lead_Score'] = y_train_pred_final.Converted_prob.map( lambda x: round(x*100))

y_train_pred_final.head()

In [None]:
# Overall accuracy of the final training predictions.
# Printed explicitly: a notebook cell only displays its last expression, so
# the bare accuracy and confusion-matrix expressions were never shown.
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted))

# Confusion matrix for the final predictions (rows: actual, columns: predicted)
confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
print(confusion2)

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
#sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
#specificity of our logistic regression model
TN / float(TN+FP)

In [None]:
# Calculating the false positive rate - predicting conversion when the lead did not convert
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

### Precision and Recall

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Precision = TP / (TP + FP), taken from the `confusion` matrix built from the
# 'predicted' column: confusion[1,1] is TP and confusion[0,1] is FP.
# The former stray expression `TP / TP + FP` was removed: operator precedence
# made it evaluate to 1 + FP, and its TP/FP came from a different matrix.
confusion[1,1]/(confusion[0,1]+confusion[1,1])

In [None]:
# Recall = TP / (TP + FN), taken from the `confusion` matrix built from the
# 'predicted' column: confusion[1,1] is TP and confusion[1,0] is FN.
# The former stray expression `TP / TP + FN` was removed: operator precedence
# made it evaluate to 1 + FN, and its TP/FN came from a different matrix.
confusion[1,1]/(confusion[1,0]+confusion[1,1])

In [None]:
from sklearn.metrics import precision_score,recall_score

In [None]:
precision_score(y_train_pred_final.Converted , y_train_pred_final.predicted)

In [None]:
recall_score(y_train_pred_final.Converted, y_train_pred_final.predicted)

### Precision and recall tradeoff

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
y_train_pred_final.Converted, y_train_pred_final.predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Converted_prob)

In [None]:
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.show()

### Making predictions on the test set

In [None]:
# Scale the numeric test features with the scaler that was fitted on the
# training data. transform() is used instead of fit_transform(): refitting on
# the test set would scale it with different means and variances than the
# ones the model was trained with.
X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']] = scaler.transform(X_test[['TotalVisits','Total Time Spent on Website','Page Views Per Visit']])

# Inspect the scaled test frame (the cell previously displayed X_train)
X_test.head()

In [None]:
X_test = X_test[col2]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

### Making predictions on the test set

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
y_pred_1.head()

In [None]:
y_test_df = pd.DataFrame(y_test)

In [None]:
y_test_df['Prospect ID'] = y_test_df.index

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Converted_prob'})

In [None]:
y_pred_final = y_pred_final.reindex(['Prospect ID','Converted','Converted_prob'], axis=1)

In [None]:
y_pred_final.head()

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Converted_prob.map(lambda x: 1 if x > 0.2 else 0)

In [None]:
y_pred_final.head()

In [None]:
#overall accuracy.
metrics.accuracy_score(y_pred_final.Converted, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Converted, y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
#sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

### Conclusion
- It was found that the variables that mattered the most in the potential buyers are (In descending order) :
    - The total time spent on the website.
    - Total number of visits.
    - Last Activity_SMS Sent
    - When the lead source was:
        - Google
        - Direct traffic
        - Organic search
        - Welingak website

    - When the last activity was:
        - SMS
        - Olark chat conversation
    
- When the lead origin is Lead Add Form.
- When their current occupation is as a working professional.
- Keeping these in mind the X Education can flourish as they have a very high chance to get almost all the potential buyers to change their mind and buy their courses.
