# Predictive Lead Conversion Using Metadata

This notebook builds a predictive model for lead conversion from your cleaned dataset (`Leads.csv`). Any missing values that remain are imputed before modelling. Follow the steps below for data analysis, model building, and evaluation.

**Workflow:**
1. Load the cleaned data
2. Exploratory Data Analysis (EDA)
3. Feature preparation
4. Train/test split and scaling
5. Model training and evaluation
6. Save the trained model


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc
import joblib

In [None]:
# Read the lead records; the relative path is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='Leads.csv')
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

In [None]:
# Structural overview: dimensions first, then dtypes / non-null counts.
print('Shape:', df.shape)
df.info()
# Summary statistics are left as the cell's final expression so Jupyter renders
# them with the DataFrame's rich (HTML) repr instead of print()'s plain text.
df.describe()

In [None]:
# Report how many null entries each column holds (header line, then the counts).
print('Missing values per column:', df.isna().sum(), sep='\n')

In [None]:
# Numeric columns: compute every column median up front, then fill each
# column's gaps with its own median.
numeric_medians = df.select_dtypes(include=[np.number]).median()
for col, median_value in numeric_medians.items():
    df[col] = df[col].fillna(median_value)

# Object (string) columns: replace gaps with the literal placeholder 'Unknown'.
for col in df.select_dtypes(include=['object']):
    df[col] = df[col].fillna('Unknown')

In [None]:
# Bar chart of how many rows fall into each value of the target column.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=df, x='Converted', ax=ax)
ax.set_title('Target Variable Distribution (Converted)')
plt.show()

In [None]:
# Engineered feature 1: sum of the two activity columns, created only when
# both source columns are present.
if {'TotalVisits', 'Page Views Per Visit'}.issubset(df.columns):
    df['Total_Interactions'] = df['TotalVisits'].add(df['Page Views Per Visit'])

# Engineered feature 2: number of comma-separated entries in 'Tags';
# the sentinel 'Not Provided' counts as zero tags.
# NOTE(review): missing tags were imputed as 'Unknown' earlier in this notebook,
# so those rows are counted as one tag here -- confirm whether 'Unknown' should
# also map to 0 (and keep predict_new_lead in sync if this changes).
if 'Tags' in df.columns:
    df['Num_Tags'] = df['Tags'].map(
        lambda tag: 0 if tag == 'Not Provided' else len(str(tag).split(','))
    )

In [None]:
# Separate the label from the predictors.
y = df['Converted']
X = df.drop(columns=['Converted'])

# One-hot encode the categorical predictors, dropping the first level of each.
# NOTE(review): any identifier-like text column (one distinct value per row)
# would produce one dummy per row here -- confirm such columns were removed
# during cleaning.
X_encoded = pd.get_dummies(data=X, drop_first=True)
print('Encoded feature shape:', X_encoded.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Train shape:', X_train_scaled.shape)
print('Test shape:', X_test_scaled.shape)

In [None]:
# Search space: 2 x 3 x 2 = 12 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

# Exhaustive search with 3-fold cross-validation on the training split;
# the fixed random_state makes every forest reproducible.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
)
grid.fit(X_train_scaled, y_train)
print("Best Parameters:", grid.best_params_)

In [None]:
# The grid search above uses GridSearchCV's default refit=True, so it has
# already retrained the winning configuration (same hyperparameters, same
# random_state=42) on the full training split. Reuse that fitted estimator
# instead of paying for an identical second fit.
best_rf = grid.best_estimator_

# Evaluate on the held-out test split.
y_pred = best_rf.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("✅ Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# NOTE(review): this cell repeats the test-set evaluation performed in the
# previous cell and produces the same output; it can likely be removed.
y_pred = best_rf.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("✅ Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print("✅ Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Pair every encoded feature with the forest's importance score, rank from
# most to least important, and persist the full ranking to CSV.
importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_encoded.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
feature_importance_df.to_csv("Feature_Importance_Prioritized.csv", index=False)

# Horizontal bar chart of the ranking (one bar per encoded feature).
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(
    data=feature_importance_df,
    x='Importance',
    y='Feature',
    palette='viridis',
    ax=ax,
)
ax.set_title("Feature Importance (High to Low)", fontsize=18)
ax.set_xlabel("Importance", fontsize=14)
ax.set_ylabel("Features", fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Visualise the test-set confusion matrix as a heat map.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')
disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
# Probability of the second class listed in best_rf.classes_
# (presumably Converted == 1 -- verify against the label encoding).
y_pred_prob = best_rf.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# ROC curve with the chance-level diagonal drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Random Forest")
ax.legend()
plt.show()

In [None]:
# Persist everything inference needs: the fitted forest, the fitted scaler,
# and the exact encoded column order the model was trained on.
joblib.dump(value=best_rf, filename="best_random_forest_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=X_encoded.columns, filename="model_columns.pkl")

In [None]:
def predict_new_lead(data_dict):
    """Predict the conversion class for a single new lead.

    Loads the persisted model, scaler and training column list from the
    current working directory, rebuilds the feature layout used for training
    (categorical imputation, engineered features, one-hot encoding, column
    alignment, scaling) and returns the model's predicted label.

    Args:
        data_dict: Mapping of raw column name -> value for one lead, using the
            same column names as the training data. Columns the model was not
            trained on are ignored; trained columns that are absent are set
            to 0 before scaling.

    Returns:
        The first (only) element of ``model.predict`` for this lead.
    """
    # Local imports keep the function usable when copied into a standalone script.
    import pandas as pd
    import joblib

    # NOTE: joblib.load unpickles arbitrary objects -- only load trusted artifacts.
    model = joblib.load("best_random_forest_model.pkl")
    scaler = joblib.load("scaler.pkl")
    columns = joblib.load("model_columns.pkl")

    df_new = pd.DataFrame([data_dict])

    # Mirror training-time imputation: missing categorical values become the
    # 'Unknown' level instead of silently encoding as "no level at all".
    for col in df_new.select_dtypes(include=['object']).columns:
        df_new[col] = df_new[col].fillna('Unknown')

    # Rebuild the engineered features exactly as they were built for training.
    # Without this, reindex() below would set them to 0 for every lead.
    # A value already supplied by the caller is left untouched.
    needs_total = (
        'Total_Interactions' in columns
        and 'Total_Interactions' not in df_new.columns
        and {'TotalVisits', 'Page Views Per Visit'}.issubset(df_new.columns)
    )
    if needs_total:
        total = (
            pd.to_numeric(df_new['TotalVisits'], errors='coerce')
            + pd.to_numeric(df_new['Page Views Per Visit'], errors='coerce')
        )
        # If either input is missing/non-numeric, keep the previous behaviour
        # (column defaulted to 0 by reindex) rather than feeding NaN to the model.
        if total.notna().all():
            df_new['Total_Interactions'] = total

    needs_num_tags = (
        'Num_Tags' in columns
        and 'Num_Tags' not in df_new.columns
        and 'Tags' in df_new.columns
    )
    if needs_num_tags:
        df_new['Num_Tags'] = df_new['Tags'].apply(
            lambda x: len(str(x).split(',')) if x != 'Not Provided' else 0
        )

    # One-hot encode, then align to the training layout: unseen dummy columns
    # are dropped and missing ones are added as 0.
    df_new_encoded = pd.get_dummies(df_new)
    df_new_encoded = df_new_encoded.reindex(columns=columns, fill_value=0)

    df_new_scaled = scaler.transform(df_new_encoded)
    prediction = model.predict(df_new_scaled)
    return prediction[0]