In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/Projects/Sales/Targeting_right_customer.csv')

df = data.copy()

# Drop specific columns
columns_to_drop = ['Prospect ID','Lead Number', 'Asymmetrique Activity Index', 'Asymmetrique Profile Score', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score']
df.drop(columns=columns_to_drop, inplace=True)

columns_with_nulls = df.columns[df.isnull().any()].tolist()

columns_to_fill = ['Lead Quality', 'Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Tags', 'Lead Profile', 'City']

for column_name in columns_to_fill:
    # Calculating percentage counts for each sub-category
    percentage_counts = df[column_name].value_counts(normalize=True)

    # Finding null indices in the current column
    null_indices = df[df[column_name].isnull()].index

    # Filling null values based on percentage counts
    df.loc[null_indices, column_name] = np.random.choice(
        percentage_counts.index,
        size=len(null_indices),
        replace=True,
        p=percentage_counts.values
    )
    
    
columns_to_drop = ['Lead Profile']
df.drop(columns=columns_to_drop, inplace=True)

columns_to_drop = ['How did you hear about X Education']
df.drop(columns=columns_to_drop, inplace=True)

df['City'] = df['City'].replace('Select', 'Mumbai')

columns_to_check = ['TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Lead Source']
df.dropna(subset=columns_to_check, inplace=True)

df = df.drop(['What matters most to you in choosing a course','Search','Magazine','Newspaper Article','X Education Forums','Newspaper',
           'Digital Advertisement','Through Recommendations','Receive More Updates About Our Courses','Update me on Supply Chain Content',
           'Get updates on DM Content','I agree to pay the amount through cheque','A free copy of Mastering The Interview','Country'],1)

# Define a list of all categorical feature names
categorical_features = ['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Specialization', 'What is your current occupation', 'Tags', 'Lead Quality', 'City', 'Last Notable Activity']

# Ensure that all specified columns are correctly labeled as categorical
for feature in categorical_features:
    df[feature] = df[feature].astype('category')
    
    
y = df['Converted'].astype(int)  # Target variable
X = df.drop(columns=['Converted'])  # Features

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

# Create a CatBoost Pool object with categorical features specified
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)

# Initialize the CatBoost model
catboost_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, verbose=0)

# Fit the CatBoost model
catboost_model.fit(train_pool)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Calculating training accuracy for CatBoost model
y_train_pred = catboost_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy for CatBoost: {train_accuracy}")
print()

# Calculating test accuracy for CatBoost model
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy for CatBoost: {test_accuracy}")

# Calculating additional metrics for CatBoost model
print(f"Recall for CatBoost: {recall_score(y_test, y_pred)}")
print()
print(f"Precision for CatBoost: {precision_score(y_test, y_pred)}")
print()
print(f"ROC AUC for CatBoost: {roc_auc_score(y_test, y_pred)}")
print("----------------------------------------------------------------------------")

Training Accuracy for CatBoost: 0.8931277056277056

Test Accuracy for CatBoost: 0.895021645021645
Recall for CatBoost: 0.8367626886145405

Precision for CatBoost: 0.8905109489051095

ROC AUC for CatBoost: 0.8848692799641067
----------------------------------------------------------------------------


In [2]:
categorical_features

['Lead Origin',
 'Lead Source',
 'Do Not Email',
 'Do Not Call',
 'Last Activity',
 'Specialization',
 'What is your current occupation',
 'Tags',
 'Lead Quality',
 'City',
 'Last Notable Activity']

In [4]:
X.columns

Index(['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call',
       'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit',
       'Last Activity', 'Specialization', 'What is your current occupation',
       'Tags', 'Lead Quality', 'City', 'Last Notable Activity'],
      dtype='object')

In [6]:
X

Unnamed: 0,Lead Origin,Lead Source,Do Not Email,Do Not Call,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Specialization,What is your current occupation,Tags,Lead Quality,City,Last Notable Activity
0,API,Olark Chat,No,No,0.0,0,0.00,Page Visited on Website,Select,Unemployed,Interested in other courses,Low in Relevance,Mumbai,Modified
1,API,Organic Search,No,No,5.0,674,2.50,Email Opened,Select,Unemployed,Ringing,Might be,Mumbai,Email Opened
2,Landing Page Submission,Direct Traffic,No,No,2.0,1532,2.00,Email Opened,Business Administration,Student,Will revert after reading the email,Might be,Mumbai,Email Opened
3,Landing Page Submission,Direct Traffic,No,No,1.0,305,1.00,Unreachable,Media and Advertising,Unemployed,Ringing,Not Sure,Mumbai,Modified
4,Landing Page Submission,Google,No,No,2.0,1428,1.00,Converted to Lead,Select,Unemployed,Will revert after reading the email,Might be,Mumbai,Modified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,Landing Page Submission,Direct Traffic,Yes,No,8.0,1845,2.67,Email Marked Spam,IT Projects Management,Unemployed,Will revert after reading the email,High in Relevance,Mumbai,Email Marked Spam
9236,Landing Page Submission,Direct Traffic,No,No,2.0,238,2.00,SMS Sent,Media and Advertising,Unemployed,wrong number given,Might be,Mumbai,SMS Sent
9237,Landing Page Submission,Direct Traffic,Yes,No,2.0,199,2.00,SMS Sent,Business Administration,Unemployed,invalid number,Not Sure,Mumbai,SMS Sent
9238,Landing Page Submission,Google,No,No,3.0,499,3.00,SMS Sent,Human Resource Management,Unemployed,Ringing,Not Sure,Other Metro Cities,SMS Sent
