In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/Projects/Sales/Targeting_right_customer.csv')

df = data.copy()

# Drop specific columns
columns_to_drop = ['Prospect ID','Lead Number', 'Asymmetrique Activity Index', 'Asymmetrique Profile Score', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score']
df.drop(columns=columns_to_drop, inplace=True)

columns_with_nulls = df.columns[df.isnull().any()].tolist()

columns_to_fill = ['Lead Quality', 'Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Tags', 'Lead Profile', 'City']

for column_name in columns_to_fill:
    # Calculating percentage counts for each sub-category
    percentage_counts = df[column_name].value_counts(normalize=True)

    # Finding null indices in the current column
    null_indices = df[df[column_name].isnull()].index

    # Filling null values based on percentage counts
    df.loc[null_indices, column_name] = np.random.choice(
        percentage_counts.index,
        size=len(null_indices),
        replace=True,
        p=percentage_counts.values
    )

df.isnull().sum().sort_values(ascending = False)/ len(df) * 100

columns_to_drop = ['Lead Profile']
df.drop(columns=columns_to_drop, inplace=True)

columns_to_drop = ['How did you hear about X Education']
df.drop(columns=columns_to_drop, inplace=True)

df['Specialization'] = df['Specialization'].replace('Select', 'No Specialization')

df['City'] = df['City'].replace('Select', 'Mumbai')

columns_to_check = ['TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Lead Source']
df.dropna(subset=columns_to_check, inplace=True)

df = df.drop(['What matters most to you in choosing a course','Search','Magazine','Newspaper Article','X Education Forums','Newspaper',
           'Digital Advertisement','Through Recommendations','Receive More Updates About Our Courses','Update me on Supply Chain Content',
           'Get updates on DM Content','I agree to pay the amount through cheque','A free copy of Mastering The Interview','Country'],1)


In [2]:

categorical_features = ['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Specialization', 'What is your current occupation', 'Tags', 'Lead Quality', 'City', 'Last Notable Activity']

# Applying one-hot encoding to the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [3]:
df = df_encoded.copy()

In [4]:
df

Unnamed: 0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Direct Traffic,Lead Source_Facebook,...,Last Notable Activity_Form Submitted on Website,Last Notable Activity_Had a Phone Conversation,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_Resubscribed to emails,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked
0,0,0.0,0,0.00,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,5.0,674,2.50,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2.0,1532,2.00,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1.0,305,1.00,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,1,2.0,1428,1.00,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,1,8.0,1845,2.67,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9236,0,2.0,238,2.00,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
9237,0,2.0,199,2.00,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
9238,1,3.0,499,3.00,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [None]:
df.columns

In [6]:
y = df['Converted'].astype(int)  # Target variable
X = df.drop(columns=['Converted'])  # Features

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# standardization 

from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

X_standardized = standard_scaler.fit_transform(X)

In [9]:
#model training and prediction

models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("NB", GaussianNB()))
models.append(("RF", RandomForestClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")

LR Accuracy: 0.872834 (0.010773)
Training Accuracy for LR: 0.8812229437229437

Test Accuracy for LR: 0.8852813852813853
Confusion Matrix for LR:
[[1011   96]
 [ 116  625]]

Classification Report for LR:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1107
           1       0.87      0.84      0.85       741

    accuracy                           0.89      1848
   macro avg       0.88      0.88      0.88      1848
weighted avg       0.88      0.89      0.89      1848


Recall for LR: 0.8434547908232118

Precision for LR: 0.8668515950069348

ROC AUC for LR: 0.8783669618072698
----------------------------------------------------------------------------
LDA Accuracy: 0.874188 (0.006659)
Training Accuracy for LDA: 0.8794642857142857

Test Accuracy for LDA: 0.8858225108225108
Confusion Matrix for LDA:
[[1009   98]
 [ 113  628]]

Classification Report for LDA:
              precision    recall  f1-score   support

           0       0.