# **Import Packages**

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# **Import Data File**

In [None]:
# Upload the CSV data file from the local machine.
# NOTE: `google.colab` is only importable inside Google Colab, so this cell
# will not run in a plain Jupyter environment.
from google.colab import files
uploaded = files.upload()

Saving lead_scoring.csv to lead_scoring.csv


In [None]:
# Parse the uploaded CSV bytes into a pandas DataFrame
import io
csv_buffer = io.BytesIO(uploaded['lead_scoring.csv'])
data = pd.read_csv(csv_buffer)

# **Data Preparation**

## Overview

In [None]:
# List every column name of the raw DataFrame
data.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'How did you hear about Madugital',
       'What is your current occupation',
       'What matters most to you in choosing a product', 'Search', 'Magazine',
       'Newspaper Article', 'Madugital Telegram', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Products', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity'],
      dtype='object')

In [None]:
# Show the dtype and non-null count of each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9204 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9103 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

In [None]:
# Peek at 10 random rows; the seed keeps the displayed rows stable across
# re-runs (same seed as the other seeded sample cell in this notebook)
data.sample(10, random_state=77)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
5053,d0a557fd-64b5-4c1a-8096-d846caf6b6ea,611308,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,,,,,No,No,Modified
3471,5a59cc5a-544f-44be-85be-6d49576cead0,627135,API,Organic Search,No,No,1,4.0,1132,4.0,...,No,Select,Select,,,,,No,No,SMS Sent
5953,ebc5caeb-d4af-4c60-b4ba-a3e8ca8d60c9,604185,API,Olark Chat,No,No,1,0.0,0,0.0,...,No,Select,Select,,,,,No,No,SMS Sent
380,701994c7-5df0-4b87-9a44-96a96f3acefb,656448,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,01.High,02.Medium,17.0,15.0,No,No,Modified
4906,76dd2278-7b51-4dd6-8cfb-e0c19312f0b9,612899,API,Google,No,No,0,2.0,102,2.0,...,No,,,,,,,No,No,SMS Sent
4837,e682bb80-a2dc-4040-8471-ae861beb5165,613633,API,Google,No,No,1,3.0,1510,3.0,...,No,Potential Lead,Other Metro Cities,,,,,No,No,Email Opened
6285,8a990d16-163a-4798-bf59-6bb4b9fc7f01,601640,API,Google,No,No,0,3.0,92,3.0,...,No,Select,Select,,,,,No,No,Modified
4303,68dd72db-5712-4fa3-af45-baa547a6bfc7,618564,API,Google,No,No,0,3.0,611,3.0,...,No,,,01.High,02.Medium,16.0,15.0,No,No,Olark Chat Conversation
2476,2129220e-e309-47a4-88c6-093c41701e78,635908,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,03.Low,02.Medium,11.0,15.0,No,No,Modified
8620,55537290-9985-4363-82af-08c75d838de4,583960,API,Olark Chat,No,No,1,2.0,1122,1.0,...,No,Potential Lead,Jakarta,02.Medium,01.High,13.0,18.0,No,Yes,Modified


## Quality Check

In [None]:
# Check record and column counts, returned as a (rows, columns) tuple
data.shape

(9240, 37)

In [None]:
# Check the distribution (count, mean, std, min, quartiles, max) of the data
data.describe() # numeric columns only (int, float)
# data.describe(include=['O']) # alternative: summarise the categorical (object) columns instead

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9103.0,9240.0,9103.0,5022.0,5022.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.854853,548.021466,2.161418,1.386694,1.811395
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,15.0
50%,615479.0,0.0,3.0,248.0,2.0,14.0,16.0
75%,637387.25,1.0,5.0,936.0,3.0,15.0,18.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [None]:
# Check the distinct values of one categorical (object) column
data['I agree to pay the amount through cheque'].unique() # swap the column name in ['...'] to inspect another column

array(['No'], dtype=object)

In [None]:
# Check how empty each column is. Stated rule of thumb: a column that is more
# than 60% empty is dropped, otherwise its gaps are filled.
# data.isnull().sum() # alternative: absolute null counts
data.isna().mean() * 100 # share of null values per column, in percent

Prospect ID                                        0.000000
Lead Number                                        0.000000
Lead Origin                                        0.000000
Lead Source                                        0.389610
Do Not Email                                       0.000000
Do Not Call                                        0.000000
Converted                                          0.000000
TotalVisits                                        1.482684
Total Time Spent on Website                        0.000000
Page Views Per Visit                               1.482684
Last Activity                                      1.114719
Country                                           26.634199
Specialization                                    15.562771
How did you hear about Madugital                  23.885281
What is your current occupation                   29.112554
What matters most to you in choosing a product    29.318182
Search                                  

In [None]:
# Show fully duplicated rows, if any (an empty frame means there are none)
data.loc[data.duplicated()]

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity


### Formatting Data

In [None]:
# Work on a deep copy so the raw `data` frame stays untouched by the cleaning steps
data_filled = data.copy(deep=True)

In [None]:
# Normalise the text formatting of categorical columns
data_filled['Lead Source'] = (
    data_filled['Lead Source']
    .str.title()  # title text format
    # 'Youtubechannel' is the spelling as it appears after title-casing;
    # regex=False makes this an explicit literal replacement
    .str.replace('Youtubechannel', 'YouTube', regex=False)
)
data_filled['Last Activity'] = data_filled['Last Activity'].str.capitalize()  # capitalize text
data_filled['Country'] = data_filled['Country'].str.title()  # title text format

# Treat the 'Select' value as missing (presumably an untouched dropdown
# placeholder - confirm with the data owner). The pattern is anchored so only
# cells that consist of 'Select' are nulled; an unanchored regex would also
# blank out any legitimate value that merely contains the word.
for column in ['Specialization', 'How did you hear about Madugital', 'Lead Profile', 'City']:
    data_filled[column] = data_filled[column].replace(r'^\s*Select\s*$', np.nan, regex=True)

### Handling Missing Values

In [None]:
# Fill null values per column using the mode, the mean, a forward fill,
# a user-defined label, or by dropping the column.
# NOTE(review): means and modes are computed on the full data set, i.e. before
# the train/test split further down, so test rows influence the imputed values.

# mode (most frequent value)
for column in ['Lead Source', 'Last Activity']:
    most_frequent = data_filled[column].value_counts().idxmax()
    data_filled[column] = data_filled[column].fillna(most_frequent)

# mean
for column in ['TotalVisits', 'Page Views Per Visit',
               'Asymmetrique Activity Score', 'Asymmetrique Profile Score']:
    data_filled[column] = data_filled[column].fillna(data_filled[column].mean())

# user-defined label
fill_labels = {
    'Country': 'Unknown',
    'Specialization': 'Unknown',
    'What is your current occupation': 'Unknown',
    'What matters most to you in choosing a product': 'Other',
    'Lead Quality': 'Not Sure',
    'Lead Profile': 'Other Leads',
    'City': 'Other Cities',
}
for column, label in fill_labels.items():
    data_filled[column] = data_filled[column].fillna(label)

# forward fill - each column is filled from ITS OWN previous values.
# (Previously 'Asymmetrique Profile Index' was overwritten with the values of
# 'Asymmetrique Activity Index', which made the two columns identical.)
# A forward fill cannot fill a gap in the very first row.
for column in ['Tags', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index']:
    data_filled[column] = data_filled[column].ffill()

# drop
data_filled = data_filled.drop(columns=['How did you hear about Madugital'])

In [None]:
# Re-check the null share after cleaning; every column is expected to show 0
data_filled.isna().mean() * 100 # share of null values per column, in percent

Prospect ID                                       0.0
Lead Number                                       0.0
Lead Origin                                       0.0
Lead Source                                       0.0
Do Not Email                                      0.0
Do Not Call                                       0.0
Converted                                         0.0
TotalVisits                                       0.0
Total Time Spent on Website                       0.0
Page Views Per Visit                              0.0
Last Activity                                     0.0
Country                                           0.0
Specialization                                    0.0
What is your current occupation                   0.0
What matters most to you in choosing a product    0.0
Search                                            0.0
Magazine                                          0.0
Newspaper Article                                 0.0
Madugital Telegram          

## **Manipulation**

In [None]:
# Check the distribution again now that missing values have been filled
data_filled.describe() # numeric columns only (int, float)
# data_filled.describe(include=['O']) # alternative: summarise the categorical (object) columns instead

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score
count,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883
std,23405.995698,0.486714,4.818723,548.021466,2.145333,1.022265,1.335352
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,16.0
50%,615479.0,0.0,3.0,248.0,2.0,14.306252,16.344883
75%,637387.25,1.0,5.0,936.0,3.0,14.306252,16.344883
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0


In [None]:
# Show the dtype and non-null count of each column after cleaning
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 36 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9240 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9240 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

### Binning

In [None]:
# Bin the widely spread time-on-site column into three browsing categories:
#   below 240 -> Casual, 240 up to 959 -> Moderate, 960 and above -> Intense
time_on_site = data_filled['Total Time Spent on Website']
data_filled['Time Spent Category'] = np.select(
    [time_on_site >= 960, time_on_site < 240],
    ['Intense Browsing', 'Casual Browsing'],
    default='Moderate Browsing',
)

In [None]:
# Show 5 sample rows; the fixed random_state keeps the selection reproducible
data_filled.sample(5, random_state=77)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity,Time Spent Category
2832,f10a1d92-7058-4c09-8d45-d2473371ba3c,632515,Landing Page Submission,Google,No,No,0,4.0,444,4.0,...,Other Leads,Jakarta,02.Medium,02.Medium,14.306252,16.344883,No,No,Email Opened,Moderate Browsing
4985,7022733e-82e0-4047-b6c3-571c3b842eeb,612062,Landing Page Submission,Google,No,No,0,5.0,401,5.0,...,Other Leads,Jakarta,02.Medium,02.Medium,14.306252,16.344883,No,No,SMS Sent,Moderate Browsing
4187,9343e959-299b-472a-9bbd-0d62ded4ef57,619649,Landing Page Submission,Google,No,No,1,4.0,765,4.0,...,Potential Lead,Cities in Banten & Jawa Barat,02.Medium,02.Medium,14.306252,16.344883,No,No,SMS Sent,Moderate Browsing
92,72064169-4b55-4279-a9b9-340872379f12,659600,Landing Page Submission,Google,No,No,0,4.0,101,4.0,...,Other Leads,Jakarta,02.Medium,02.Medium,15.0,18.0,No,No,Email Opened,Casual Browsing
6811,dd045082-a936-40ee-88e9-223e605cc40a,597516,API,Google,No,No,1,2.0,1416,1.0,...,Other Leads,Other Cities,02.Medium,02.Medium,14.306252,16.344883,No,No,SMS Sent,Intense Browsing


### Clipping

In [None]:
# Clip outliers using the box-plot (IQR) rule
def clip_to_whiskers(series, whisker_factor=1.5):
    """Clip a numeric Series to its box-plot whiskers.

    The whiskers are Q1 - whisker_factor * IQR and Q3 + whisker_factor * IQR,
    where Q1 / Q3 are the 25th / 75th percentiles of `series` and
    IQR = Q3 - Q1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to clip.
    whisker_factor : float, default 1.5
        Multiple of the IQR that the whiskers extend beyond the quartiles.

    Returns
    -------
    pandas.Series
        A copy of `series` with values outside the whiskers set to the
        nearest whisker.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(q1 - whisker_factor * iqr, q3 + whisker_factor * iqr)


# The original columns are kept; each clipped version is stored as 'Clipped <name>'
for column in ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']:
    data_filled['Clipped ' + column] = clip_to_whiskers(data_filled[column])

In [None]:
# Check the distribution again; the new 'Clipped ...' columns appear next to the originals
data_filled.describe()

Unnamed: 0,Lead Number,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score,Clipped TotalVisits,Clipped Total Time Spent on Website,Clipped Page Views Per Visit
count,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0,9240.0
mean,617188.435606,0.38539,3.445238,487.698268,2.36282,14.306252,16.344883,3.224026,487.698268,2.260484
std,23405.995698,0.486714,4.818723,548.021466,2.145333,1.022265,1.335352,2.860475,548.021466,1.77924
min,579533.0,0.0,0.0,0.0,0.0,7.0,11.0,0.0,0.0,0.0
25%,596484.5,0.0,1.0,12.0,1.0,14.0,16.0,1.0,12.0,1.0
50%,615479.0,0.0,3.0,248.0,2.0,14.306252,16.344883,3.0,248.0,2.0
75%,637387.25,1.0,5.0,936.0,3.0,14.306252,16.344883,5.0,936.0,3.0
max,660737.0,1.0,251.0,2272.0,55.0,18.0,20.0,11.0,2272.0,6.0


### Encoding

In [None]:
# One-hot encode the categorical (object) columns.
# drop_first=True omits the first level of every column, so a column with a
# single distinct value contributes no dummy column at all.
categorical_columns = [
    'Lead Origin',
    'Lead Source',
    'Do Not Email',
    'Do Not Call',
    'Last Activity',
    'Country',
    'Specialization',
    'What is your current occupation',
    'What matters most to you in choosing a product',
    'Search',
    'Magazine',
    'Newspaper Article',
    'Madugital Telegram',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Products',
    'Tags',
    'Lead Quality',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'Lead Profile',
    'City',
    'Asymmetrique Activity Index',
    'Asymmetrique Profile Index',
    'I agree to pay the amount through cheque',
    'A free copy of Mastering The Interview',
    'Last Notable Activity',
    'Time Spent Category',
]
dummies = pd.get_dummies(data_filled[categorical_columns], drop_first=True)

In [None]:
# Show 5 sample rows of the encoded columns; the seed keeps the displayed rows
# stable across re-runs (same seed as the other seeded sample cell)
dummies.sample(5, random_state=77)

Unnamed: 0,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Blog,Lead Source_Click2Call,Lead Source_Direct Traffic,Lead Source_Facebook,Lead Source_Google,Lead Source_Live Chat,...,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_Resubscribed to emails,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked,Time Spent Category_Intense Browsing,Time Spent Category_Moderate Browsing
7190,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
6840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3712,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4490,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Append the encoded dummy columns to the cleaned data frame (joined on the row index)
data_model = data_filled.join(dummies)

# Modeling

## Preparation

In [None]:
# Show the dtypes of the cleaned frame (before encoding) to see which columns are still of object type
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 40 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9240 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9240 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

In [None]:
# List the columns of the cleaned frame; used to decide which ones to remove from the model frame
data_filled.columns

Index(['Prospect ID', 'Lead Number', 'Lead Origin', 'Lead Source',
       'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits',
       'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity',
       'Country', 'Specialization', 'What is your current occupation',
       'What matters most to you in choosing a product', 'Search', 'Magazine',
       'Newspaper Article', 'Madugital Telegram', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Products', 'Tags', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Index',
       'Asymmetrique Profile Index', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity',
       'Time Spent Category', 'Clipped TotalVisits',
       'Clipped Total Ti

In [None]:
# Remove columns the model must not see:
#   - the identifiers ('Prospect ID', 'Lead Number'),
#   - the raw categorical columns (replaced by their dummy columns),
#   - the un-clipped numeric columns (replaced by their 'Clipped ...' versions).
# `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept.
# NOTE: this cell reassigns `data_model`; re-run the join cell before
# re-running this one, otherwise the columns are already gone (KeyError).
columns_to_drop = [
    'Prospect ID',
    'Lead Number',
    'Lead Origin',
    'Lead Source',
    'Do Not Email',
    'Do Not Call',
    'Last Activity',
    'Country',
    'Specialization',
    'What is your current occupation',
    'What matters most to you in choosing a product',
    'Search',
    'Magazine',
    'Newspaper Article',
    'Madugital Telegram',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Products',
    'Tags',
    'Lead Quality',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'Lead Profile',
    'City',
    'Asymmetrique Activity Index',
    'Asymmetrique Profile Index',
    'I agree to pay the amount through cheque',
    'A free copy of Mastering The Interview',
    'Last Notable Activity',
    'Time Spent Category',
    'TotalVisits',
    'Total Time Spent on Website',
    'Page Views Per Visit',
]
data_model = data_model.drop(columns=columns_to_drop)

  


In [None]:
# Show the dtypes of the final model frame; no object dtype should remain,
# since the estimator needs numeric input
data_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Columns: 176 entries, Converted to Time Spent Category_Moderate Browsing
dtypes: float64(4), int64(2), uint8(170)
memory usage: 1.9 MB


In [None]:
# Split the model frame into training (70%) and testing (30%) rows;
# the fixed seed makes the split reproducible
TEST_FRACTION = 0.3
SPLIT_SEED = 2021
train, test = train_test_split(
    data_model,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

## Execution

In [None]:
# Repeats the dtype overview of the cleaned (pre-encoding) frame shown in the Preparation section
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 40 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Prospect ID                                     9240 non-null   object 
 1   Lead Number                                     9240 non-null   int64  
 2   Lead Origin                                     9240 non-null   object 
 3   Lead Source                                     9240 non-null   object 
 4   Do Not Email                                    9240 non-null   object 
 5   Do Not Call                                     9240 non-null   object 
 6   Converted                                       9240 non-null   int64  
 7   TotalVisits                                     9240 non-null   float64
 8   Total Time Spent on Website                     9240 non-null   int64  
 9   Page Views Per Visit                     

In [None]:
# Separate predictors (every column except the target) from the target.
# `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept.
# define prediction & target variable for training
pred_train = train.drop(columns=['Converted'])
target_train = train['Converted']
# define prediction & target variable for testing
pred_test = test.drop(columns=['Converted'])
target_test = test['Converted']

  
  """


In [None]:
# Train a DecisionTreeClassifier on the training predictors and target.
# Tree depth is capped at 10, a node needs at least 5 samples to be split,
# and random_state is fixed so repeated runs build the same tree.
decision = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2021)
decision.fit(pred_train, target_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2021)

In [None]:
# Show the predictors of one test row (the third row by position) as a one-row frame
pred_test.iloc[2:3]

Unnamed: 0,Asymmetrique Activity Score,Asymmetrique Profile Score,Clipped TotalVisits,Clipped Total Time Spent on Website,Clipped Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Blog,...,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_Resubscribed to emails,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked,Time Spent Category_Intense Browsing,Time Spent Category_Moderate Browsing
9209,14.0,18.0,1.0,1226,1.0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
# Predict the class of that single test row (the third row by position)
decision.predict(pred_test.iloc[2:3])

array([0])

In [None]:
# Check the prediction: show the actual 'Converted' label of the very same
# test row (the third row by position) instead of listing every
# non-converted test row
test[['Converted']].iloc[2:3]

Unnamed: 0,Converted,Asymmetrique Activity Score,Asymmetrique Profile Score,Clipped TotalVisits,Clipped Total Time Spent on Website,Clipped Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,...,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_Resubscribed to emails,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked,Time Spent Category_Intense Browsing,Time Spent Category_Moderate Browsing
9209,0,14.000000,18.000000,1.0,1226,1.0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
5243,0,14.306252,16.344883,2.0,161,2.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7179,0,12.000000,17.000000,4.0,228,4.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1688,0,14.000000,15.000000,2.0,11,2.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
778,0,14.000000,17.000000,3.0,121,3.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5570,0,14.306252,16.344883,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,0,14.000000,15.000000,2.0,602,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5256,0,14.306252,16.344883,3.0,248,3.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
7406,0,13.000000,15.000000,3.0,1920,1.5,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


# Evaluation

In [None]:
# Predict the class of every row in the test set; despite its name,
# `predictor` holds the predicted labels that the metrics below compare against
predictor = decision.predict(pred_test)

In [None]:
# Accuracy: share of test rows whose predicted label equals the actual label
accuracy_score(target_test, predictor)

0.911976911976912

In [None]:
# Precision: of the rows predicted as converted (1), the share that actually converted
precision_score(target_test, predictor)

0.9254783484390735

In [None]:
# Recall: of the rows that actually converted (1), the share the model predicted as converted
recall_score(target_test, predictor)

0.8438934802571166