In [4]:
import pandas as pd
import numpy as np
from google.colab import files

In [5]:
uploaded = files.upload()
raw_path = list(uploaded.keys())[0]
df_raw = pd.read_csv(raw_path)

Saving customer_conversion_testing_dataset.csv to customer_conversion_testing_dataset.csv


In [41]:
print(df_raw.shape)
df_raw.head()

(26145, 19)


Unnamed: 0,LeadID,Age,Gender,Location,LeadSource,TimeSpent (minutes),PagesViewed,LeadStatus,EmailSent,DeviceType,ReferralSource,FormSubmissions,Downloads,CTR_ProductPage,ResponseTime (hours),FollowUpEmails,SocialMediaEngagement,PaymentHistory,Conversion (Target)
0,1,60,Female,Lahore,Organic,46,6,Hot,10,Mobile,Facebook,2,3,0.8,11,3,54,Good,0
1,2,27,Male,Lahore,Email,42,8,Warm,6,Tablet,Direct,1,1,0.4,23,2,35,No Payment,0
2,3,21,Male,Sialkot,Email,56,11,Cold,1,Tablet,Direct,5,3,0.5,8,0,34,Good,0
3,4,37,Female,Quetta,Organic,24,15,Cold,6,Tablet,Direct,1,1,0.7,10,3,59,Good,0
4,5,35,Female,Quetta,Email,39,14,Hot,10,Desktop,Direct,1,3,0.5,13,5,35,Good,0


In [42]:
rename_map = {
    "LeadID": "user_id",
    "Age": "age",
    "Gender": "gender",
    "Location": "location",
    "LeadSource": "lead_source",
    "TimeSpent (minutes)": "time_spent_min",
    "PagesViewed": "pages_viewed",
    "LeadStatus": "lead_status",
    "EmailSent": "emails_sent",
    "DeviceType": "device_type",
    "ReferralSource": "referral_source",
    "FormSubmissions": "form_submissions",
    "Downloads": "downloads",
    "CTR_ProductPage": "ctr_product_page",
    "ResponseTime (hours)": "response_time_hours",
    "FollowUpEmails": "follow_up_emails",
    "SocialMediaEngagement": "social_media_engagement",
    "PaymentHistory": "payment_history",
    "Conversion (Target)": "converted_to_paid",
}
df = df_raw.rename(columns=rename_map)


text_cols = [
    "gender",
    "location",
    "lead_source",
    "device_type",
    "referral_source",
    "lead_status",
    "payment_history",
]

for col in text_cols:
    df[col] = df[col].astype(str).str.strip()

In [43]:
lead_status_map = {"Cold": 0, "Warm": 1, "Hot": 2}
df["lead_status_score"] = df["lead_status"].map(lead_status_map)

payment_map = {"Good": 1, "No Payment": 0}
df["payment_history_good"] = df["payment_history"].map(payment_map)

df["email_intensity"] = df["emails_sent"] + df["follow_up_emails"]

df["engagement_score"] = (
    df["time_spent_min"].fillna(0)
    + df["pages_viewed"].fillna(0) * 2
    + df["form_submissions"].fillna(0) * 3
    + df["downloads"].fillna(0) * 2
    + df["social_media_engagement"].fillna(0)
)

df["is_mobile"] = (df["device_type"] == "Mobile").astype(int)
df["is_tablet"] = (df["device_type"] == "Tablet").astype(int)
df["is_desktop"] = (df["device_type"] == "Desktop").astype(int)

df["is_organic_lead"] = (df["lead_source"] == "Organic").astype(int)
df["is_email_lead"] = (df["lead_source"] == "Email").astype(int)
df["is_referral_lead"] = (df["lead_source"] == "Referral").astype(int)
df["is_social_lead"] = (df["lead_source"] == "Social Media").astype(int)

df["converted_to_paid"] = df["converted_to_paid"].astype(int)

In [44]:
feature_cols = [
    "age",
    "lead_status_score",
    "payment_history_good",
    "time_spent_min",
    "pages_viewed",
    "email_intensity",
    "form_submissions",
    "downloads",
    "ctr_product_page",
    "response_time_hours",
    "social_media_engagement",
    "engagement_score",
    "is_mobile",
    "is_tablet",
    "is_desktop",
    "is_organic_lead",
    "is_email_lead",
    "is_referral_lead",
    "is_social_lead",
]

model_df = df[["user_id"] + feature_cols + ["converted_to_paid"]].copy()

processed_path_full = "customer_conversion_processed_full.csv" #For referring to raw data
processed_path_model = "customer_conversion_model_dataset.csv" #For training and tessting

df.to_csv(processed_path_full, index=False)
model_df.to_csv(processed_path_model, index=False)

print("\nSaved:")
print(" - Full processed dataset ->", df.shape)
print(" - Modeling dataset       ->", model_df.shape)


Saved:
 - Full processed dataset -> (26145, 30)
 - Modeling dataset       -> (26145, 21)


In [45]:
# Overall conversion rate
conv_rate = model_df["converted_to_paid"].mean()
print(f"Overall conversion rate: {conv_rate:.3f} ({conv_rate*100:.1f}%)")

# Basic summary of numeric columns
model_df.describe()

Overall conversion rate: 0.016 (1.6%)


Unnamed: 0,user_id,age,lead_status_score,payment_history_good,time_spent_min,pages_viewed,email_intensity,form_submissions,downloads,ctr_product_page,...,social_media_engagement,engagement_score,is_mobile,is_tablet,is_desktop,is_organic_lead,is_email_lead,is_referral_lead,is_social_lead,converted_to_paid
count,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,...,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0,26145.0
mean,13073.0,39.945917,1.004972,0.498068,32.424326,8.487856,9.976324,2.508625,1.496539,0.448774,...,109.632893,169.551884,0.334519,0.335819,0.329662,0.250143,0.249684,0.248652,0.25152,0.015797
std,7547.55573,11.844566,0.815982,0.500006,16.197994,4.026251,4.470529,1.710835,1.120084,0.205753,...,52.362223,55.649038,0.471831,0.472285,0.470099,0.433104,0.432839,0.43224,0.433895,0.12469
min,1.0,20.0,0.0,0.0,5.0,2.0,0.0,0.0,0.0,0.1,...,20.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6537.0,30.0,0.0,0.0,18.0,5.0,7.0,1.0,0.0,0.3,...,64.0,124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13073.0,40.0,1.0,0.0,32.0,8.0,10.0,3.0,1.0,0.4,...,109.0,169.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,19609.0,50.0,2.0,1.0,47.0,12.0,13.0,4.0,2.0,0.6,...,155.0,215.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
max,26145.0,60.0,2.0,1.0,60.0,15.0,20.0,5.0,3.0,0.8,...,200.0,303.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
from sklearn.model_selection import train_test_split

# Identify feature columns automatically
feature_cols = [c for c in model_df.columns if c not in ["user_id", "converted_to_paid"]]

X = model_df[feature_cols].copy()
y = model_df["converted_to_paid"].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape

((20916, 19), (5229, 19))

In [47]:
#Logistic Regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_proba_lr = log_reg.predict_proba(X_test)[:, 1]
y_pred_lr = (y_proba_lr >= 0.5).astype(int)

print("Logistic Regression AUC:", roc_auc_score(y_test, y_proba_lr))
print("\nClassification report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr))

Logistic Regression AUC: 0.9119634386750265

Classification report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5146
           1       0.00      0.00      0.00        83

    accuracy                           0.98      5229
   macro avg       0.49      0.50      0.50      5229
weighted avg       0.97      0.98      0.98      5229



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
#Positive coef → higher conversion probability, negative → lower
coef_df = pd.DataFrame({
    "feature": feature_cols,
    "coef": log_reg.coef_[0]
}).sort_values("coef", ascending=False)

coef_df.head(10), coef_df.tail(10)

(                    feature      coef
 1         lead_status_score  0.461626
 4              pages_viewed  0.407079
 8          ctr_product_page  0.167437
 5           email_intensity  0.145880
 11         engagement_score  0.064290
 9       response_time_hours -0.005795
 0                       age -0.026277
 3            time_spent_min -0.046108
 10  social_media_engagement -0.065193
 6          form_submissions -0.126200,
                  feature      coef
 6       form_submissions -0.126200
 7              downloads -0.129983
 2   payment_history_good -0.218481
 15       is_organic_lead -1.339925
 16         is_email_lead -1.419673
 17      is_referral_lead -1.492405
 18        is_social_lead -1.507107
 12             is_mobile -1.895736
 14            is_desktop -1.925043
 13             is_tablet -1.938331)

In [49]:
#Random Forest model + feature importance
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = (y_proba_rf >= 0.5).astype(int)

print("Random Forest AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nClassification report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

Random Forest AUC: 0.9732017849868186

Classification report (Random Forest):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5146
           1       0.00      0.00      0.00        83

    accuracy                           0.98      5229
   macro avg       0.49      0.50      0.50      5229
weighted avg       0.97      0.98      0.98      5229



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
#Feature Importance
importances = rf.feature_importances_
fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": importances
}).sort_values("importance", ascending=False)

fi_df.head(15)

Unnamed: 0,feature,importance
4,pages_viewed,0.134618
0,age,0.12633
5,email_intensity,0.105827
3,time_spent_min,0.105432
11,engagement_score,0.101525
10,social_media_engagement,0.096493
9,response_time_hours,0.073611
8,ctr_product_page,0.048041
6,form_submissions,0.042615
1,lead_status_score,0.036597


In [51]:
fi_df.to_csv("feature_importance_random_forest.csv", index=False)

In [52]:
import joblib

joblib.dump(rf, "rf_conversion_model_final.pkl")
joblib.dump(feature_cols, "rf_feature_cols_final.pkl")

['rf_feature_cols_final.pkl']