### Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import random
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Adjust settings

In [2]:
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', -1)
random.seed(10)

#### Problem statement <br><br> Customers visit different websites -> Customers sign up for a course and are called leads -> Leads contacted by marketing team -> Some leads get converted to paying customers. <br><br> Aim is to identify if a lead will be converted to a paying customer.

### Function: Verify class imbalance

In [3]:
def class_imbalance(df):
    
    print("Number of samples per class :\n")
    print(df["Converted"].value_counts())

    plt.title("Number of samples per class")
    df["Converted"].value_counts().plot.bar(color = "orange")
    plt.xlabel("Class")
    plt.ylabel("Number of samples")
    plt.show()

    class_0 = df["Converted"].value_counts()[0]
    class_1 = df["Converted"].value_counts()[1]
    sum = class_0 + class_1
    print("Class ratio (class_0/class_1):\n", round(class_0/class_1, 3))

SyntaxError: invalid syntax (<ipython-input-3-3f45dca2991c>, line 15)

### Function: Drop unwanted columns

In [None]:
def drop_cols(df, col_list):
    
    df.drop(col_list, axis=1, inplace = True)

### Function: Handle missing values

In [None]:
def handle_missing_values(df, col, key_word):
    
    if key_word == "mode":
        mode_value = df[col].mode()[0]
        df[col].fillna(value = mode_value, inplace = True)
    
    elif key_word == "median":
        median_value = df[col].median()
        df[col].fillna(value = median_value, inplace = True)
    
    elif key_word == "mean":
        mean_value = df[col].mean()
        df[col].fillna(value = mean_value, inplace = True)
    
    else:
        val = key_word        
        df[col].fillna(value = val, inplace = True)

### Function: Visualise impact of feature

In [None]:
def visualise_feature_impact(df, feature, f_type):
    
    if f_type == "categorical":
        
        val = df.groupby('Converted')[feature].value_counts()
        
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (15,5))
        col = ["crimson", "green"]
        
        for i in range(0,2):
            val[i].plot(kind='bar',ax=axes[i],  color=col[i])
            
            axes[i].set_title(feature + " - distribution for class "+ str(i), fontsize=12)
            axes[i].set_ylabel("Count / Percentage", fontsize=12)
            axes[i].set_xlabel(feature, fontsize=12)
    
            totals = []
            for k in axes[i].patches:
                totals.append(k.get_height())

            total = sum(totals)
            for j in axes[i].patches:
                axes[i].text(j.get_x(), j.get_height(), \
                             str((j.get_height())) + "  (" + str(round((j.get_height()/total)*100, 2))+'%)', \
                             fontsize=11, color='black')
        
    elif f_type == "numerical":
        
        temp_df = pd.DataFrame(df[[feature,"Converted"]].copy())
        temp_df = remove_outliers(temp_df, feature)
        print("\n")
        print("Original number of samples :", df[feature].shape)
        print("Number of samples after outliers are removed :", temp_df[feature].shape)
        
        plt.figure(figsize = (12,5))
        sns.histplot(data=temp_df, x=feature, kde=True, hue = "Converted")
        plt.title(feature + " - distribution at class lavel")
        plt.show()

### Function: Remove outliers

In [None]:
def remove_outliers(df, col):
    
    perc_25 = np.percentile(df[col], 25)
    perc_75 = np.percentile(df[col], 75)
    iqr = perc_75 - perc_25
    ll = perc_25 - 1.5*iqr
    ul = perc_75 + 1.5*iqr
    
    print("\n IQR {} \n Lower limit {}\n Upper limit {}".format(iqr, ll, ul))
    print("\n Minimum {} \n Maximum {} \n".format(min(df[col]), max(df[col])))
    
    if min(df[col]) < ll and max(df[col]) > ul:
        print("There are samples which violate both upper limit and lower limit")
        idx1 = np.where(df[col] < ll)
        idx2 = np.where(df[col] > ul)
        idx = np.concatenate((idx1[0], idx2[0]), axis = None)
        #print(idx)
        print("Number of outliers ", len(idx))
        df.drop(idx, axis = 0, inplace = True)
        print("Outliers have been removed")

    elif min(df[col]) < ll and max(df[col]) <= ul:
        print("There are samples which violate only lower limit")
        idx = np.where(df[col] < ll)
        #print(idx[0])
        print("Number of outliers ", len(idx[0]))
        df.drop(idx[0], axis = 0, inplace = True)
        print("Outliers have been removed")
    
    elif min(df[col]) >= ll and max(df[col]) > ul:
        print("There are samples which violate only upper limit")
        idx = np.where(df[col] > ul)
        #print(idx[0])
        print("Number of outliers ", len(idx[0]))
        df.drop(idx[0], axis = 0, inplace = True)
        print("Outliers have been removed")
        
    else:
        print("No outliers in this data")
        
    df = df.reset_index()
    df.drop(columns=["index"], axis=1, inplace = True)
    
    return df

### Function: Encode categorical variable

In [None]:
def categorical_encode(df, col, key_word):
    
    if key_word == 'l_encode':
        le = preprocessing.LabelEncoder() 
        l_encodes = pd.DataFrame(le.fit_transform(df[col]).astype(str))
        l_encodes.columns = [col + "_encoded"]
        df = pd.concat([df, l_encodes], axis = 1)
        df.drop([col], axis = 1, inplace = True)
    
    elif key_word == 'o_encode':
        o_encodes = pd.get_dummies(df[col], drop_first= True).astype(int)
        df = pd.concat([df, o_encodes], axis = 1)
        df.drop([col], axis = 1, inplace = True)
    
    else:
        print("Please specify correct key-word")
        
    return df 

### Define path and load data

In [None]:
path = "/Users/sohinimitra/Documents/Kaggle/datasets/kaggle-lead-scoring-datasets/"

In [None]:
data = pd.read_csv(path + "Leads.csv")
data.sample(3)

### Data description

In [None]:
data_descripton = pd.read_excel(path + "Leads Data Dictionary.xlsx")
data_descripton

#### Note: From the data description it can be observed that the target variable is the column "Converted". Different features have been captured to understand the behavior of a lead getting converted to a paying customer. 

### Data overview

In [None]:
print("Shape of dataset :", data.shape)
print("\n")
print(data.info())

#### Note: There are 9240 samples in this dataset. Most of the features are categorical in nature. There are several NULL values across different features in the dataset. 

### Check for class imbalance

In [None]:
class_imbalance(data)

#### Note: Overall there is sufficent number of samples in both classes to learn from. The class ratio is less than 2. Class imbalance does not exist in the overall data.

### Identify irrelevant features

In [None]:
data.nunique()

#### Note: <br> 1. Prospect ID and Lead Number are identifiers for each customer, hence can be avoided for training. <br> 2.  Activity/Profile Index/Score are redundant information, either Index or Score can be avoided for training. (Drop Score since Index privides infromation at a group level as Low-Medium-High) <br> 3. Last Activity and Last Notable Activity features are redundant, one of them can be avoided for training. (Drop Notable Activity)

In [None]:
col_list = ["Prospect ID","Lead Number","Asymmetrique Activity Score","Asymmetrique Profile Score","Last Activity"]
drop_cols(data, col_list)
print("Shape of dataset :",data.shape)
data.sample(3)

### Identify number of NaNs for each feature

In [None]:
data.isna().sum()

#### Note: <br> 1. Lead Quality has high number of NULL values. This feature will be dropped. <br> 2. Activity and Profile indexes will be retained despite high number of NULLs since they indicate behavior of a customer. <br> 3. Number of NULLs for Lead Source, TotalVisits and Page Views Per Visit is low, NULLs will be handled by using "Mode"/"Median"/"Mean".

In [None]:
col_list = ["Lead Quality"]
drop_cols(data, col_list)

In [None]:
handle_missing_values(data, "Lead Source", "mode")
handle_missing_values(data, "TotalVisits", "mean")
handle_missing_values(data, "Page Views Per Visit", "mean")
data.sample(3)

###  Identify impact of features on target variable

In [None]:
visualise_feature_impact(data, "Lead Origin", "categorical")
data["Lead Origin"].value_counts()

#### Note: <br> 1. If a lead arrives via "Lead Add Form", possibility of coversion is higher. This feature impacts the target variable. <br> 2. There is only one sample for "Quick Add Form". This sample may be ignored. 

In [None]:
visualise_feature_impact(data, "Lead Source", "categorical")
data["Lead Source"].value_counts()

#### Note: <br> 1. "Google"-"google" represent the same source, this can be combined. <br> 2. Several sources have very few sample, such samples can be combined into a single category. 

In [None]:
visualise_feature_impact(data, "Do Not Email", "categorical")
data["Do Not Email"].value_counts()

#### Note: Most of the values in this feature is "No". This may not have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "Do Not Call", "categorical")
data["Do Not Call"].value_counts()

#### Note: Most of the values in this feature is "No". This will not have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "TotalVisits", "numerical")

#### Note: As the number of total visits increase, the chance of conversion increases. 

In [None]:
visualise_feature_impact(data, "Total Time Spent on Website", "numerical")

#### Note: As the time spent on the website increases, the chnace of conversion to a paying customer increases. 

In [None]:
visualise_feature_impact(data, "Page Views Per Visit", "numerical")

#### Note: As the number of page views increases, the possibility of conversion is higher. 

In [None]:
visualise_feature_impact(data, "Country", "categorical")
data["Country"].value_counts()

#### Note: Most of the values in this feature is "Inda". This will not have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "Specialization", "categorical")
data["Specialization"].value_counts()

#### Note: <br> 1. Many featuers fall under "Select", i.e. value not specified by user. <br> 2. This may have an impact on the target variable.

In [None]:
visualise_feature_impact(data, "How did you hear about X Education", "categorical")
data["How did you hear about X Education"].value_counts()

#### Note: <br> 1. Many featuers fall under "Select", i.e. value not specified by user. <br> 2. This may have an impact on the target variable.

In [None]:
visualise_feature_impact(data, "What is your current occupation", "categorical")
data["What is your current occupation"].value_counts()

#### Note: A working professional is most likely to get converted to a paying customer. This feature will impact the output. 

In [None]:
visualise_feature_impact(data, "What matters most to you in choosing a course", "categorical")
data["What matters most to you in choosing a course"].value_counts()

#### Note: 99% of the values are "Better career prospects". This feature will not be useful for learning. 

In [None]:
visualise_feature_impact(data, "Search", "categorical")
data["Search"].value_counts()

#### Note: Variety of information in this column is very less. This will not impact the target variable. 

In [None]:
visualise_feature_impact(data, "Magazine", "categorical")
data["Magazine"].value_counts()

#### This feature can be ignored since there is only one type of value. 

In [None]:
visualise_feature_impact(data, "Newspaper Article", "categorical")
data["Newspaper Article"].value_counts()

#### Note: This feature will not impact the output variable since 99% of the values is same.

In [None]:
visualise_feature_impact(data, "X Education Forums", "categorical")
data["X Education Forums"].value_counts()

#### Note: This feature will not impact the output variable since more than 99% of the values is same.

In [None]:
visualise_feature_impact(data, "Newspaper", "categorical")
data["Newspaper"].value_counts()

#### Note: This feature will not impact the output variable since more than 99% of the values is same.

In [None]:
visualise_feature_impact(data, "Digital Advertisement", "categorical")
data["Digital Advertisement"].value_counts()

#### Note: This feature will not impact the output variable since 99% of the values is same.

In [None]:
visualise_feature_impact(data, "Through Recommendations", "categorical")
data["Through Recommendations"].value_counts()

#### Note: This feature will not impact the output variable since 99% of the values is same.

In [None]:
visualise_feature_impact(data, "Receive More Updates About Our Courses", "categorical")
data["Receive More Updates About Our Courses"].value_counts()

#### Note: This feature will not impact the output variable since all the values is same.

In [None]:
visualise_feature_impact(data, "Tags", "categorical")
data["Tags"].value_counts()

#### Note: <br> 1. This feature has imapct on target variable based on it's values. <br> 2. Number of samples for certain features are less, which can be combined into a separate category. 

In [None]:
visualise_feature_impact(data, "Update me on Supply Chain Content", "categorical")
data["Update me on Supply Chain Content"].value_counts()

#### Note: This feature will not impact the output variable since all the values is same.

In [None]:
visualise_feature_impact(data, "Get updates on DM Content", "categorical")
data["Get updates on DM Content"].value_counts()

#### Note: This feature will not impact the output variable since all the values is same.

In [None]:
visualise_feature_impact(data, "Lead Profile", "categorical")
data["Lead Profile"].value_counts()

#### Note: Several values are unkown and placed in "Select". This may have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "City", "categorical")
data["City"].value_counts()

#### Note: This may have an impact on the output.

In [None]:
visualise_feature_impact(data, "Asymmetrique Activity Index", "categorical")
data["Asymmetrique Activity Index"].value_counts()

#### Note: This may have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "Asymmetrique Profile Index", "categorical")
data["Asymmetrique Profile Index"].value_counts()

#### Note: A high profile index is likely to convert to a paying customer. 

In [None]:
visualise_feature_impact(data, "I agree to pay the amount through cheque", "categorical")
data["I agree to pay the amount through cheque"].value_counts()

#### Note: This feature will not impact the output variable since all the values is same.

In [None]:
visualise_feature_impact(data, "A free copy of Mastering The Interview", "categorical")
data["A free copy of Mastering The Interview"].value_counts()

#### Note: This may have an impact on the target variable. 

In [None]:
visualise_feature_impact(data, "Last Notable Activity", "categorical")
data["Last Notable Activity"].value_counts()

#### Note: This feature has an imapct on the target variable. Certain categories have fewer features which can be combined to a separate group like "Others". 

### Dropping irrelevant features

In [None]:
col_list = ["Do Not Call",  "Country", "What matters most to you in choosing a course", "Search", "Magazine", "Newspaper Article", "X Education Forums", "Newspaper", "Digital Advertisement", "Through Recommendations", "Receive More Updates About Our Courses", "Update me on Supply Chain Content", "Get updates on DM Content", "I agree to pay the amount through cheque"]
drop_cols(data, col_list)
print("Shape of dataset :",data.shape)
data.sample(3)

#### Note: <br> 1. Specialization, How did you hear about X Education, City NULLs can be replaced by "Select", depicting the same information that value is not known, or these columns may be dropped. <br> 2. What is your current occupation, Tags NULLs can be addressed by categorising as a separate group. <br> 3. Lead Profile feature has 4k+ "Select" values, i.e. unknown and further ~3k NULLs. This feature can be ignored. <br> 4. Activity/Profile index NULLs can be addressed by categorising them as "Unknown".

In [None]:
col_list = ["Lead Profile"]
drop_cols(data, col_list)
print("Shape of dataset :",data.shape)

In [None]:
handle_missing_values(data, "Specialization", "mode")
handle_missing_values(data, "How did you hear about X Education", "mode")
handle_missing_values(data, "City", "Select")
handle_missing_values(data, "What is your current occupation", "Unknown_Occupation")
handle_missing_values(data, "Tags", "Unknown_Tag")
handle_missing_values(data, "Asymmetrique Activity Index", "Unknown_Activity")
handle_missing_values(data, "Asymmetrique Profile Index", "Unknown_Profile")

In [None]:
data.sample(5)

### Cleaning features

In [None]:
data = data[data["Lead Origin"] != "Quick Add Form"]

data["Lead Origin"].value_counts()

In [None]:
data["Lead Source"] = data["Lead Source"].str.lower()
val_list = ["bing","click2call", "social media", "live chat", "press_release", "pay per click ads", "welearnblog_home", "blog", "youtubechannel", "welearn", "testone", "nc_edm"]
data["Lead Source"] = data["Lead Source"].apply(lambda x: 'other_source' if x in val_list else x) 

#data["Lead Source"].value_counts()

In [None]:
val_list = ["in touch with EINS", "Lost to Others", "Want to take admission but has financial problems", "Still Thinking", "Interested in Next batch", "In confusion whether part time or DLP", "Lateral student", "Shall take in the next coming month", "University not recognized", "Recognition issue (DEC approval)"]
data["Tags"] = data["Tags"].apply(lambda x: 'other_tags' if x in val_list else x) 

#data["Tags"].value_counts()

In [None]:
val_list = ["Had a Phone Conversation", "Email Marked Spam", "Resubscribed to emails", "View in browser link Clicked","Email Received", "Form Submitted on Website", "Approached upfront"]
data["Last Notable Activity"] = data["Last Notable Activity"].apply(lambda x: 'other_last_activity' if x in val_list else x) 

#data["Last Notable Activity"].value_counts()

In [None]:
data["Asymmetrique Activity Index"] = data["Asymmetrique Activity Index"].str.replace(".","_" )
data["Asymmetrique Profile Index"] = data["Asymmetrique Profile Index"].str.replace(".","_" )

In [None]:
data["Asymmetrique Activity Index"].value_counts()

### Handling categorical features

In [None]:
data_dummy = data.copy()
data_dummy = categorical_encode(data_dummy, "Lead Origin", "o_encode")
data_dummy = categorical_encode(data_dummy, "Lead Source", "o_encode")
data_dummy = categorical_encode(data_dummy, "Specialization", "o_encode")
data_dummy = categorical_encode(data_dummy, "How did you hear about X Education", "o_encode")
data_dummy = categorical_encode(data_dummy, "What is your current occupation", "o_encode")
data_dummy = categorical_encode(data_dummy, "Tags", "o_encode")
data_dummy = categorical_encode(data_dummy, "City", "o_encode")
data_dummy = categorical_encode(data_dummy, "Last Notable Activity", "o_encode")

#data = categorical_encode(data, "Do Not Email", "l_encode")
#data = categorical_encode(data, "A free copy of Mastering The Interview", "l_encode")
data_dummy["Do Not Email"] = data_dummy["Do Not Email"].replace(['No','Yes'],[0,1])
data_dummy["A free copy of Mastering The Interview"] = data_dummy["A free copy of Mastering The Interview"].replace(['No','Yes'],[0,1])
data_dummy["Asymmetrique Activity Index"] = data_dummy["Asymmetrique Activity Index"].replace(['Unknown_Activity','03_Low', '02_Medium', '01_High'],[0,1,2,3])
data_dummy["Asymmetrique Profile Index"] = data_dummy["Asymmetrique Profile Index"].replace(['Unknown_Profile','03_Low', '02_Medium', '01_High'],[0,1,2,3])

print("Shape of dataset :",data_dummy.shape)
data_dummy.sample(5)

In [None]:
data.sample(5)

### Feature selection using Mutual Information 

In [None]:
features = data_dummy.drop(["Converted"], axis = 1)
target = data_dummy["Converted"]

MI = mutual_info_classif(np.array(features), np.array(target), discrete_features=True)
MI_data = pd.concat([pd.DataFrame(features.columns),pd.DataFrame(MI)], axis = 1)
MI_data.columns = ["Features", "Mutual Information"]
MI_data.sort_values(by = "Mutual Information", ascending = False)

#### Note: From MI scores, the following features will be considered for modeling: <br> 1. Total Time Spent on Website <br> 2. Tags <br> 3. Last Notable Activity <br> 4. What is your current occupation <br> 5. Lead Origin <br> 6. Page Views per visit <br> 7. Total visits <br> 8. Lead Source <br> 9. Asymmetrique Activity Index <br> 10. Do Not Email <br> 11. Asymmetrique Profile Index 

In [None]:
data_f = data[["Lead Origin", "Lead Source", "What is your current occupation", "Tags", "Last Notable Activity", "Do Not Email", "Asymmetrique Activity Index", "Asymmetrique Profile Index", "Total Time Spent on Website", "Page Views Per Visit", "TotalVisits", "Converted"]]


In [None]:
data_f = categorical_encode(data_f, "Lead Origin", "o_encode")
data_f = categorical_encode(data_f, "Lead Source", "o_encode")
data_f = categorical_encode(data_f, "What is your current occupation", "o_encode")
data_f = categorical_encode(data_f, "Tags", "o_encode")
data_f = categorical_encode(data_f, "Last Notable Activity", "o_encode")

#data = categorical_encode(data, "Do Not Email", "l_encode")
#data = categorical_encode(data, "A free copy of Mastering The Interview", "l_encode")
data_f["Do Not Email"] = data_f["Do Not Email"].replace(['No','Yes'],[0,1])
data_f["Asymmetrique Activity Index"] = data_f["Asymmetrique Activity Index"].replace(['Unknown_Activity','03_Low', '02_Medium', '01_High'],[0,1,2,3])
data_f["Asymmetrique Profile Index"] = data_f["Asymmetrique Profile Index"].replace(['Unknown_Profile','03_Low', '02_Medium', '01_High'],[0,1,2,3])

print("Shape of dataset :",data_f.shape)
data_f.sample(5)

### Train-test datasets

In [None]:
X = data_f.drop(["Converted"], axis = 1)
Y = data_f["Converted"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=999)

train_mean = x_train.mean()
train_std = x_train.std()
x_train_scaled = (x_train - train_mean)/train_std
x_test_scaled = (x_test - train_mean)/train_std

### Modeling

In [None]:
lr = LogisticRegression()
print("Cross validation scores:\n",cross_val_score(lr, x_train_scaled, y_train, cv=10))

lr.fit(x_train_scaled, y_train)
pred_vals = lr.predict(x_test_scaled)

pred_vals_df = pd.DataFrame(pred_vals)

### Model performance evaluation

In [None]:
conf = confusion_matrix(y_train,pd.DataFrame(lr.predict(x_train_scaled)))
label = [0,1]
plt.figure(figsize=(7,5))
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label,cmap='Blues', fmt='g')
plt.title("Confusion matrix for train data")
plt.xlabel("Predicted labels")
plt.ylabel("Actual labels")
plt.show()

In [None]:
print("Classification report for train data: \n")
print(classification_report(y_train,pd.DataFrame(lr.predict(x_train_scaled)), labels=label))

In [None]:
conf = confusion_matrix(y_test,pred_vals_df)
label = [0,1]
plt.figure(figsize=(7,5))
sns.heatmap(conf, annot=True, xticklabels=label, yticklabels=label,cmap='Blues', fmt='g')
plt.title("Confusion matrix for test data")
plt.xlabel("Predicted labels")
plt.ylabel("Actual labels")
plt.show()

In [None]:
print("Classification report for test data: \n")
print(classification_report(y_test,pred_vals_df, labels=label))

In [None]:
print("Overall model accuracy for train data : ", round(metrics.accuracy_score(y_train,pd.DataFrame(lr.predict(x_train_scaled))),3))
print("Overall model accuracy for test data : ", round(metrics.accuracy_score(y_test,pred_vals_df),3))

In [None]:
fpr_tr, tpr_tr, _ = metrics.roc_curve(y_train,pd.DataFrame(lr.predict(x_train_scaled)))
auc_tr = round(metrics.roc_auc_score(y_train,pd.DataFrame(lr.predict(x_train_scaled))),3)

fpr_tt, tpr_tt, _ = metrics.roc_curve(y_test, pred_vals_df)
auc_tt = round(metrics.roc_auc_score(y_test, pred_vals_df),3)

print("AUC score train and test : ",auc_tr, auc_tt)

plt.figure(figsize=(7,5))
plt.title("ROC-AUC curve")
plt.plot(fpr_tr,tpr_tr, label="auc_tr="+str(auc_tr))
plt.plot(fpr_tt,tpr_tt, label="auc_tt="+str(auc_tt))
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.legend(loc=4)
plt.show()

### Note: Model performance can be improved through parameter tuning, using different algorithms etc. 