# Virtual Lead Qualifier â€“ Lead Scoring (Final Clean Notebook)

Notebook with experimentation, cross-validation, and final model selection.

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

#Basic Setting
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix

## 2. Load Dataset

In [2]:
df = pd.read_csv("Lead Scoring.csv")
df.head()

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Country,Specialization,How did you hear about X Education,What is your current occupation,What matters most to you in choosing a course,Search,Magazine,Newspaper Article,X Education Forums,Newspaper,Digital Advertisement,Through Recommendations,Receive More Updates About Our Courses,Tags,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.0,Page Visited on Website,,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Interested in other courses,Low in Relevance,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.5,Email Opened,India,Select,Select,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,,No,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.0,Email Opened,India,Business Administration,Select,Student,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.0,Unreachable,India,Media and Advertising,Word Of Mouth,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Ringing,Not Sure,No,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.0,Converted to Lead,India,Select,Other,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,Will revert after reading the email,Might be,No,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified


In [3]:
df.isnull().sum()

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

In [4]:
df['Lead Quality'].mode()

0    Might be
Name: Lead Quality, dtype: object

## 3. Data Cleaning & Leakage Control

In [5]:
df.drop(columns=["Prospect ID", "Lead Number"], inplace=True)

num_cols = [
    "TotalVisits", "Page Views Per Visit",
    "Asymmetrique Activity Score", "Asymmetrique Profile Score"
]
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

cat_cols = [
    "Last Activity", "Country", "Specialization",
    "How did you hear about X Education",
    "What is your current occupation",
    "What matters most to you in choosing a course",
    "Lead Profile", "City"
]
for col in cat_cols:
    df[col].fillna("Unknown", inplace=True)

df["Lead Quality"] = df["Lead Quality"].fillna(
    df["Lead Quality"].mode()[0]
)


## 4. Binary Encoding

In [6]:
binary_cols = [
    "Do Not Email", "Do Not Call", "Search", "Magazine",
    "Newspaper Article", "X Education Forums", "Newspaper",
    "Digital Advertisement", "Through Recommendations",
    "Receive More Updates About Our Courses",
    "Update me on Supply Chain Content", "Get updates on DM Content",
    "I agree to pay the amount through cheque",
    "A free copy of Mastering The Interview",
    "Lead Quality"
]

for col in binary_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({"yes": 1, "no": 0})
        .fillna(0)   # ðŸ‘ˆ future-proof line
    )

## 5. Lead Source Grouping

In [7]:
df["Lead Source"].fillna(df["Lead Source"].mode()[0], inplace=True)

top_sources = ["Google", "Direct Traffic", "Organic Search", "Facebook", "Olark Chat"]
df["Lead Source"] = df["Lead Source"].apply(lambda x: x if x in top_sources else "Other")

## 6. Intent Feature from Tags

In [8]:
def tag_group(tag):
    if pd.isna(tag):
        return "Unknown"
    elif "Interested" in tag:
        return "High Intent"
    elif tag in ["Busy", "Ringing"]:
        return "Medium Intent"
    else:
        return "Low Intent"

df["Tag_Group"] = df["Tags"].apply(tag_group)

df.drop(columns=["Tags", "Asymmetrique Activity Index", "Asymmetrique Profile Index"], inplace=True)

## 7. Trainâ€“Test Split

In [9]:
X = df.drop("Converted", axis=1)
y = df["Converted"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [10]:
X.columns

Index(['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call',
       'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit',
       'Last Activity', 'Country', 'Specialization',
       'How did you hear about X Education', 'What is your current occupation',
       'What matters most to you in choosing a course', 'Search', 'Magazine',
       'Newspaper Article', 'X Education Forums', 'Newspaper',
       'Digital Advertisement', 'Through Recommendations',
       'Receive More Updates About Our Courses', 'Lead Quality',
       'Update me on Supply Chain Content', 'Get updates on DM Content',
       'Lead Profile', 'City', 'Asymmetrique Activity Score',
       'Asymmetrique Profile Score',
       'I agree to pay the amount through cheque',
       'A free copy of Mastering The Interview', 'Last Notable Activity',
       'Tag_Group'],
      dtype='object')

## 8. Preprocessing Pipeline

In [11]:
categorical_cols = X.select_dtypes(include="object").columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
    ("num", StandardScaler(), numeric_cols)
])

## 9. Baseline Model â€“ Logistic Regression

In [12]:
baseline_clf = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

baseline_clf.fit(X_train, y_train)
baseline_prob = baseline_clf.predict_proba(X_test)[:, 1]

print("Baseline ROC-AUC:", roc_auc_score(y_test, baseline_prob))

Baseline ROC-AUC: 0.9380047058254646


## 10. Cross-Validation â€“ XGBoost Model Stability

In [13]:
#conda install -c conda-forge xgboost

In [14]:
#from sklearn.base import is_classifier

In [15]:
#is_classifier(xgb_clf)

## 11. Final Model Training â€“ XGBoost

In [16]:
import xgboost
from xgboost import XGBClassifier

In [17]:
from sklearn.model_selection import cross_val_predict

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scale_pos_weight = (y == 0).sum() / (y == 1).sum()

xgb_clf = Pipeline([
    ("preprocessing", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        random_state=42
    ))
])


xgb_clf.fit(X_train,y_train)

xgb_clf_prob=xgb_clf.predict_proba(X_test)[:,1]

print("XGB ROC AUC:",roc_auc_score(y_test,xgb_clf_prob))

# Get CV probabilities
y_cv_prob = cross_val_predict(
    xgb_clf,
    X,
    y,
    cv=cv,
    method="predict_proba"
)[:, 1]

# Computed ROC-AUC manually
roc_auc = roc_auc_score(y, y_cv_prob)
print("XGBoost CV ROC-AUC:", roc_auc)


XGB ROC AUC: 0.9615920790033585
XGBoost CV ROC-AUC: 0.9634919914380312


In [18]:
xgb_clf.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
#xgb_clf_prob=xgb_clf.predict_proba(X_test)[:,1]

In [20]:
#print("XGB ROC AUC:",roc_auc_score(y_test,xgb_clf_prob))

In [21]:
print("ROC AUC :",roc_auc_score(y_test, xgb_clf.predict(X_test)))

ROC AUC : 0.8993874294455874


In [22]:
## 12. Final Evaluation

In [23]:
# from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve
# import matplotlib.pyplot as plt

# # Predict probabilities
# y_prob = xgb_final_clf.predict_proba(X_test)[:, 1]

# # ROC-AUC
# print("Final XGBoost ROC-AUC:", roc_auc_score(y_test, y_prob))

# # Confusion Matrix
# print(confusion_matrix(y_test, (y_prob >= 0.5).astype(int)))

# # Classification Report
# print(classification_report(y_test, (y_prob >= 0.5).astype(int)))

# # ROC Curve
# fpr, tpr, _ = roc_curve(y_test, y_prob)

# plt.figure(figsize=(6,5))
# plt.plot(fpr, tpr, label="XGBoost")
# plt.plot([0,1], [0,1], '--', label="Random")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve â€“ Final XGBoost Model")
# plt.legend()
# plt.show()

## 13. Lead Scoring & Bucketing

In [24]:
lead_score = pd.Series((y_prob * 100).round(0), index=y_test.index)

def lead_bucket(score):
    if score >= 80:
        return "Hot Lead"
    elif score >= 60:
        return "Warm Lead"
    else:
        return "Cold Lead"

lead_output = pd.DataFrame({
    "Lead_Score": lead_score,
    "Lead_Category": lead_score.apply(lead_bucket),
    "Actual_Converted": y_test
})

lead_output.head()
#lead_output.sort_values(by="Lead_Score", ascending=False).head()

NameError: name 'y_prob' is not defined

## 14. Save Model

In [None]:
#joblib.dump(final_clf, "lead_scoring_model.joblib")