# Lead Score Flow Notebook (No DataIngestion)
Clean pipeline: load CSVs → preprocess → engineer features → train model → score leads.

## 📥 Load Raw CSV Files

In [1]:
import os, sys

# Project root. Set the CRM_PROJECT_ROOT environment variable to run the
# notebook on another machine; without it the original local path is used.
PROJECT_ROOT = os.environ.get(
    "CRM_PROJECT_ROOT", r"C:\Users\sanuv\OneDrive\Desktop\CRM2\CRM2.2"
)

# Relative paths ("data/...", "models/...") and the `src.` imports used by
# later cells resolve against the project root.
os.chdir(PROJECT_ROOT)

# Keep exactly one entry at the front of sys.path, so re-running this cell
# does not stack duplicates.
if PROJECT_ROOT in sys.path:
    sys.path.remove(PROJECT_ROOT)
sys.path.insert(0, PROJECT_ROOT)

In [4]:

import os, sys
import pandas as pd

# Show where the relative data paths below will be resolved from.
print("Current working directory:", os.getcwd())

# Read the historical leads (used for training later) and the new leads
# (scored later) straight from the raw CSV exports.
df_train = pd.read_csv("data/raw/historical_leads.csv")
df_new = pd.read_csv("data/raw/new_leads.csv")

# One (rows, columns) tuple per line, then a preview of the training frame.
print(df_train.shape, df_new.shape, sep="\n")
df_train.head()

Current working directory: C:\Users\sanuv\OneDrive\Desktop\CRM2\CRM2.2
(1500, 23)
(400, 22)


Unnamed: 0,lead_id,email,company_name,industry,job_title,seniority_level,company_size,country,email_opens,email_clicks,...,content_downloads,form_submissions,event_attendance,demo_requests,pricing_page_views,feature_page_views,days_since_first_interaction,total_interactions,last_interaction_days,converted
0,LEAD_1,bradyjohn@example.org,"Hardin, Blair and Allen",Healthcare,Print production planner,Executive,1-10,UK,17,4,...,3,2,0,0,4,2,117,49,25,0
1,LEAD_2,brendaleon@example.org,"Lee, Taylor and Barnes",Healthcare,English as a second language teacher,Senior,1-10,Germany,10,8,...,0,1,0,1,4,6,35,8,11,0
2,LEAD_3,juan59@example.net,Miller-Barnes,Healthcare,Medical laboratory scientific officer,Senior,1000+,India,1,1,...,3,2,1,0,1,6,87,40,10,0
3,LEAD_4,dawn44@example.org,Ellis Inc,Education,"Teacher, special educational needs",Mid,51-200,India,15,5,...,4,0,1,0,3,9,94,30,3,0
4,LEAD_5,valvarez@example.org,Carter Group,Finance,Psychiatrist,Entry,51-200,UK,2,0,...,4,0,0,1,2,7,31,56,14,0


## 🛠 Ensure Required Columns

In [5]:

def ensure_required_columns(df):
    """Add the derived and fallback columns that later cells rely on.

    The frame is modified in place and also returned.

    * ``interaction_frequency`` is always (re)computed as
      ``total_interactions / (days_since_first_interaction + 1)``.
    * Each engagement column listed below is created and filled with 0 when
      the frame does not already have it; existing columns are left untouched.

    Raises ``KeyError`` if ``total_interactions`` or
    ``days_since_first_interaction`` is missing.
    """
    interactions = df['total_interactions']
    # +1 so a lead whose first interaction was 0 days ago does not divide by zero.
    days_active = df['days_since_first_interaction'] + 1
    df['interaction_frequency'] = interactions / days_active

    zero_default_columns = (
        'time_on_site',
        'pages_viewed',
        'content_downloads',
        'form_submissions',
        'event_attendance',
        'demo_requests',
        'pricing_page_views',
        'feature_page_views',
    )
    absent = [name for name in zero_default_columns if name not in df.columns]
    for name in absent:
        df[name] = 0
    return df

# Work on copies so the raw frames loaded earlier keep their original columns
# (ensure_required_columns writes into the frame it receives).
df_train_pre = df_train.copy().pipe(ensure_required_columns)
df_new_pre = df_new.copy().pipe(ensure_required_columns)

df_train_pre.head()


Unnamed: 0,lead_id,email,company_name,industry,job_title,seniority_level,company_size,country,email_opens,email_clicks,...,form_submissions,event_attendance,demo_requests,pricing_page_views,feature_page_views,days_since_first_interaction,total_interactions,last_interaction_days,converted,interaction_frequency
0,LEAD_1,bradyjohn@example.org,"Hardin, Blair and Allen",Healthcare,Print production planner,Executive,1-10,UK,17,4,...,2,0,0,4,2,117,49,25,0,0.415254
1,LEAD_2,brendaleon@example.org,"Lee, Taylor and Barnes",Healthcare,English as a second language teacher,Senior,1-10,Germany,10,8,...,1,0,1,4,6,35,8,11,0,0.222222
2,LEAD_3,juan59@example.net,Miller-Barnes,Healthcare,Medical laboratory scientific officer,Senior,1000+,India,1,1,...,2,1,0,1,6,87,40,10,0,0.454545
3,LEAD_4,dawn44@example.org,Ellis Inc,Education,"Teacher, special educational needs",Mid,51-200,India,15,5,...,0,1,0,3,9,94,30,3,0,0.315789
4,LEAD_5,valvarez@example.org,Carter Group,Finance,Psychiatrist,Entry,51-200,UK,2,0,...,0,0,1,2,7,31,56,14,0,1.75


## 🧬 Feature Engineering

In [6]:

from src.models.feature_engineering import FeatureEngineer

fe = FeatureEngineer()

# Training frame is prepared with fit=True, the new leads with fit=False.
# Presumably the second call reuses whatever the first one fitted
# (encoders/scaler) - confirm in FeatureEngineer.prepare_data.
df_train_fe, feature_cols = fe.prepare_data(df_train_pre, fit=True, scale=True)
# The feature list from the second call is discarded; the training one is used downstream.
df_new_fe, _ = fe.prepare_data(df_new_pre, fit=False, scale=True)

print(f"Number of engineered features: {len(feature_cols)}")
df_train_fe.head()


Number of engineered features: 31


Unnamed: 0,lead_id,email,company_name,industry,job_title,seniority_level,company_size,country,email_opens,email_clicks,...,engagement_depth,engagement_consistency,seniority_numeric,company_size_numeric,is_decision_maker,industry_encoded,is_major_market,content_engagement_ratio,has_demo_request,pricing_interest
0,LEAD_1,bradyjohn@example.org,"Hardin, Blair and Allen",Healthcare,Print production planner,Executive,1-10,UK,1.313434,-0.163842,...,0.792546,-0.193274,1.301758,-1.423345,0,0.041016,1,-0.233141,0,1
1,LEAD_2,brendaleon@example.org,"Lee, Taylor and Barnes",Healthcare,English as a second language teacher,Senior,1-10,Germany,0.090714,1.222693,...,-0.651541,-0.286324,0.391862,-1.423345,0,0.041016,1,-0.450862,1,1
2,LEAD_3,juan59@example.net,Miller-Barnes,Healthcare,Medical laboratory scientific officer,Senior,1000+,India,-1.481354,-1.203743,...,-0.990929,-0.449162,0.391862,1.38959,0,0.041016,0,0.057153,0,1
3,LEAD_4,dawn44@example.org,Ellis Inc,Education,"Teacher, special educational needs",Mid,51-200,India,0.964085,0.182792,...,0.403584,-0.253494,-0.518034,-0.016878,0,-1.357247,0,-0.188661,0,1
4,LEAD_5,valvarez@example.org,Carter Group,Finance,Psychiatrist,Entry,51-200,UK,-1.30668,-1.550377,...,0.312357,-0.443346,-1.42793,-0.016878,0,-0.658116,1,-0.063803,1,1


## 🤖 Train Model

In [7]:

from src.models.model_trainer import ModelTrainer

trainer = ModelTrainer()

# Model inputs: the engineered feature columns; target: the `converted` flag.
X_train = df_train_fe[feature_cols]
y_train = df_train_fe['converted']

# NOTE(review): the recorded run reports train_accuracy 1.0 against a held-out
# ROC AUC of ~0.54 and recall of ~0.06, which looks like heavy overfitting.
# The CV F1 (~0.85) also disagrees with the held-out F1 (~0.11); possibly the
# cross-validation runs on SMOTE-resampled data - confirm in ModelTrainer.train.
metrics = trainer.train(X_train, y_train, use_smote=True)
metrics


Applied SMOTE: Training samples: 2022
Training xgboost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



MODEL EVALUATION RESULTS

Model Version: 20251203_121149
Algorithm: xgboost

Accuracy:  0.8367
Precision: 0.3750
Recall:    0.0638
F1 Score:  0.1091
ROC AUC:   0.5432

CV F1 Score: 0.8467 (+/- 0.1672)

--------------------------------------------------
Classification Report:
--------------------------------------------------
               precision    recall  f1-score   support

Not Converted       0.85      0.98      0.91       253
    Converted       0.38      0.06      0.11        47

     accuracy                           0.84       300
    macro avg       0.61      0.52      0.51       300
 weighted avg       0.78      0.84      0.78       300


--------------------------------------------------
Confusion Matrix:
--------------------------------------------------
True Negatives:  248
False Positives: 5
False Negatives: 44
True Positives:  3

--------------------------------------------------
Top 10 Important Features:
--------------------------------------------------
         

{'model_version': '20251203_121149',
 'algorithm': 'xgboost',
 'train_accuracy': 1.0,
 'test_accuracy': 0.8366666666666667,
 'accuracy': 0.8366666666666667,
 'precision': 0.375,
 'recall': 0.06382978723404255,
 'f1_score': 0.10909090909090909,
 'roc_auc': 0.5431839206122278,
 'features_used': 'email_opens,email_clicks,website_visits,time_on_site,pages_viewed,content_downloads,form_submissions,event_attendance,demo_requests,pricing_page_views,feature_page_views,days_since_first_interaction,total_interactions,last_interaction_days,interaction_frequency,engagement_score,intent_score,recency_score,avg_interactions_per_day,engagement_depth,engagement_consistency,seniority_numeric,company_size_numeric,industry_encoded,content_engagement_ratio,is_recent,high_frequency,is_decision_maker,is_major_market,has_demo_request,pricing_interest',
 'cv_f1_mean': np.float64(0.8467226186426293),
 'cv_f1_std': np.float64(0.16717803761223682)}

In [8]:

# Single source for the artifact path, so the save call and the confirmation
# message cannot drift apart.
MODEL_PATH = "models/lead_scorer.pkl"

# Create the target directory up front, in case save_model does not.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
trainer.save_model(MODEL_PATH)

# Cell display value confirming the destination.
f"Model saved → {MODEL_PATH}"


âœ“ Model saved to models/lead_scorer.pkl


'Model saved â†’ models/lead_scorer.pkl'

## 🎯 Score New Leads

In [None]:

from src.models.scorer import LeadScorer

# Build the scorer from the pickle path used by the save-model cell
# (presumably LeadScorer loads the model from it - confirm in src/models/scorer.py).
scorer = LeadScorer("models/lead_scorer.pkl")

# Select the same engineered feature columns that were used for training.
X_new = df_new_fe[feature_cols]
# NOTE(review): the recorded run of this cell failed with
# "TypeError: unsupported operand type(s) for &: 'float' and 'float'".
# The failing `&` is not in this cell, so it is raised inside score_leads or
# code it calls - possibly comparisons combined with `&` without parentheses,
# or scalar floats combined with `&`; confirm in src/models/scorer.py.
# Also note df_new_fe holds scaled values; verify the "hybrid" method does not
# expect the raw (unscaled) columns - TODO confirm.
scored_df = scorer.score_leads(df_new_fe, X_new, method="hybrid")
scored_df.head()


TypeError: unsupported operand type(s) for &: 'float' and 'float'

In [None]:

import os
# Single source for the output path, so the directory creation, the write and
# the confirmation message always agree.
SCORED_CSV_PATH = "data/scored/scored_leads_notebook.csv"

# Requires `scored_df` from the scoring cell; create the output directory first
# so to_csv does not fail when it is missing.
os.makedirs(os.path.dirname(SCORED_CSV_PATH), exist_ok=True)
scored_df.to_csv(SCORED_CSV_PATH, index=False)

# Cell display value confirming the destination.
f"Saved → {SCORED_CSV_PATH}"
