In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)

In [2]:
data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/P1/Sales Prediction/Data.csv')

In [3]:
df = data.copy()

In [4]:
df.head(7)

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified
5,2058ef08-2858-443e-a01f-a9237db2f5ce,660680,API,Olark Chat,No,No,0,0.0,0,0.0,...,No,,,01.High,02.Medium,17.0,15.0,No,No,Modified
6,9fae7df4-169d-489b-afe4-0f3d752542ed,660673,Landing Page Submission,Google,No,No,1,2.0,1640,2.0,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,No,Modified


In [5]:
# Drop specific columns
columns_to_drop = ['Prospect ID','Lead Number', 'Asymmetrique Activity Index', 'Asymmetrique Profile Score', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score']
df.drop(columns=columns_to_drop, inplace=True)

In [6]:
columns_with_nulls = df.columns[df.isnull().any()].tolist()

In [7]:
columns_to_fill = ['Lead Quality', 'Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Tags', 'Lead Profile', 'City']

In [8]:
for column_name in columns_to_fill:
    # Calculating percentage counts for each sub-category
    percentage_counts = df[column_name].value_counts(normalize=True)

    # Finding null indices in the current column
    null_indices = df[df[column_name].isnull()].index

    # Filling null values based on percentage counts
    df.loc[null_indices, column_name] = np.random.choice(
        percentage_counts.index,
        size=len(null_indices),
        replace=True,
        p=percentage_counts.values
    )
    
    

In [9]:
columns_to_drop = ['Lead Profile']
df.drop(columns=columns_to_drop, inplace=True)

In [10]:
columns_to_drop = ['How did you hear about X Education']
df.drop(columns=columns_to_drop, inplace=True)

In [11]:
df['City'] = df['City'].replace('Select', 'Mumbai')

In [12]:
columns_to_check = ['TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Lead Source']
df.dropna(subset=columns_to_check, inplace=True)

In [13]:
df = df.drop(['What matters most to you in choosing a course','Search','Magazine','Newspaper Article','X Education Forums','Newspaper',
           'Digital Advertisement','Through Recommendations','Receive More Updates About Our Courses','Update me on Supply Chain Content',
           'Get updates on DM Content','I agree to pay the amount through cheque','A free copy of Mastering The Interview','Country'],1)


In [14]:
# Define a list of all categorical feature names
categorical_features = ['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Specialization', 'What is your current occupation', 'Tags', 'Lead Quality', 'City', 'Last Notable Activity']

In [15]:
# Ensure that all specified columns are correctly labeled as categorical
for feature in categorical_features:
    df[feature] = df[feature].astype('category')

In [16]:
y = df['Converted'].astype(int)  # Target variable
X = df.drop(columns=['Converted'])  # Features

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

In [18]:
# Create a CatBoost Pool object with categorical features specified
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)

In [19]:
# Initialize the CatBoost model
catboost_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, verbose=0)

# Fit the CatBoost model
catboost_model.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x1ee17b34850>

In [20]:
# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

In [21]:
# Calculating training accuracy for CatBoost model
y_train_pred = catboost_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy for CatBoost: {train_accuracy}")
print()

Training Accuracy for CatBoost: 0.9005681818181818



In [22]:
# Calculating test accuracy for CatBoost model
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy for CatBoost: {test_accuracy}")

# Calculating additional metrics for CatBoost model
print(f"Recall for CatBoost: {recall_score(y_test, y_pred)}")
print()
print(f"Precision for CatBoost: {precision_score(y_test, y_pred)}")
print()
print(f"ROC AUC for CatBoost: {roc_auc_score(y_test, y_pred)}")
print("----------------------------------------------------------------------------")

Test Accuracy for CatBoost: 0.8896103896103896
Recall for CatBoost: 0.8104667609618105

Precision for CatBoost: 0.8911353032659409

ROC AUC for CatBoost: 0.8745585338551383
----------------------------------------------------------------------------
