In [3]:
import json

import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Load data (the CSV is expected in the notebook's working directory)
df = pd.read_csv('synthetic_leads_dataset_large.csv')

# Encode categorical feature as integer codes
le = LabelEncoder()
df['industry_encoded'] = le.fit_transform(df['industry'])

# Define features and target
features = ["contacted", "demo_given", "employee_count", "annual_revenue", "industry_encoded"]
X = df[features]
y = df['converted']

# Train-test split on the RAW features, before any scaling.
# Fitting the scaler on all rows would let test-set statistics (mean / variance)
# leak into preprocessing. test_size and random_state are unchanged, so the
# rows assigned to each side of the split are the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scale features: fit on training rows only, then apply the same transform
# to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset expressed in the same scaled space; consumed by KMeans below.
X_scaled = scaler.transform(X)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Train KMeans on entire scaled dataset.
# NOTE(review): the clustering still sees the test rows (labels are not used).
# Fit on X_train instead if the clusters are ever evaluated on X_test.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)

# Analyze clusters on training data


def describe_feature_shifts(cluster_means, overall_means, rel_threshold=0.1):
    """List the features whose cluster mean deviates from the overall mean.

    Parameters
    ----------
    cluster_means : pandas.Series
        Mean of each feature within one cluster, indexed by feature name.
    overall_means : pandas.Series
        Mean of each feature over all analysed rows, same index.
    rel_threshold : float, default 0.1
        A feature is reported when ``|cluster mean - overall mean|`` exceeds
        ``rel_threshold * |overall mean|``.

    Returns
    -------
    list of str
        Entries of the form ``"higher <feature>"`` or ``"lower <feature>"``,
        in the order of ``overall_means.index``.

    NOTE(review): the values analysed here come out of StandardScaler, so the
    overall means are close to zero and this relative threshold is close to
    zero as well; practically every feature is reported. A threshold expressed
    in standard deviations may be what was intended -- confirm before relying
    on these labels.
    """
    limits = rel_threshold * overall_means.abs()
    shifts = []
    for feat in overall_means.index:
        diff = cluster_means[feat] - overall_means[feat]
        if abs(diff) > limits[feat]:
            shifts.append(f"{'higher' if diff > 0 else 'lower'} {feat}")
    return shifts


# Convert X_train back to DataFrame for analysis
X_train_df = pd.DataFrame(X_train, columns=features)

# Predict clusters on training data
train_clusters = kmeans.predict(X_train)

# Create DataFrame for analysis
train_analysis = X_train_df.copy()
train_analysis['cluster'] = train_clusters
train_analysis['converted'] = y_train.values

# Calculate conversion rate per cluster
cluster_conversion_rates = train_analysis.groupby('cluster')['converted'].mean()
print("Conversion rate by cluster on training data:")
print(cluster_conversion_rates)

# Identify cluster with highest conversion rate
best_cluster = cluster_conversion_rates.idxmax()
best_conversion_rate = cluster_conversion_rates.max()
print(f"\nCluster with highest conversion rate: {best_cluster} ({best_conversion_rate:.2%})")

# Feature means: overall, and one row per cluster that received training rows
overall_mean = X_train_df.mean()
cluster_feature_means = train_analysis.groupby('cluster')[features].mean()

# Feature shifts of the best cluster relative to the overall mean
best_cluster_mean = cluster_feature_means.loc[best_cluster]
feature_diffs = describe_feature_shifts(best_cluster_mean, overall_mean)

# Build one human-readable description per cluster id
cluster_descriptions = {}
for cluster_num in range(kmeans.n_clusters):
    if cluster_num not in cluster_conversion_rates.index:
        # No training row was assigned to this cluster, so there is no rate
        # or mean to report (the rate would otherwise format as "nan%").
        cluster_descriptions[cluster_num] = "No training leads were assigned to this cluster."
        continue

    conv_rate = cluster_conversion_rates.loc[cluster_num]
    diffs = describe_feature_shifts(cluster_feature_means.loc[cluster_num], overall_mean)

    # The fallback text is chosen BEFORE concatenation. The previous
    # `prefix + joined if diffs else fallback` applied the conditional to the
    # whole expression and dropped the conversion rate when `diffs` was empty.
    traits = ", ".join(diffs) if diffs else "typical feature values."
    cluster_descriptions[cluster_num] = f"Conversion rate: {conv_rate:.2%}. Leads tend to have " + traits

# Integer keys are written as strings ("0", "1", ...) by json.dump
with open("cluster_descriptions.json", "w") as f:
    json.dump(cluster_descriptions, f)

print("Cluster descriptions saved to cluster_descriptions.json")
print(f"\nKey characteristics of cluster {best_cluster} with highest conversion rate:")
print(", ".join(feature_diffs) if feature_diffs else "No significant feature differences found.")

# Save models and test set for later use
joblib.dump(rf, 'model.pkl')
joblib.dump(kmeans, 'kmeans.pkl')

# Also persist the fitted preprocessors. Both models were trained on scaled,
# encoded features, so scoring a raw lead later requires the same encoder and
# scaler that were fitted in this notebook.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(le, 'label_encoder.pkl')

# X_test holds the scaled feature values, so the CSV can be fed to the models as-is
pd.DataFrame(X_test, columns=features).to_csv('X_test.csv', index=False)


Conversion rate by cluster on training data:
cluster
0    0.500864
1    0.000000
2    0.000000
Name: converted, dtype: float64

Cluster with highest conversion rate: 0 (50.09%)
Cluster descriptions saved to cluster_descriptions.json

Key characteristics of cluster 0 with highest conversion rate:
higher contacted, higher demo_given, lower employee_count, higher annual_revenue, higher industry_encoded


In [4]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (

In [6]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import google.generativeai as genai

st.set_page_config(page_title="Lead Scoring Dashboard", layout="wide")

# Load models and data
model = joblib.load("model.pkl")
kmeans = joblib.load("kmeans.pkl")
X = pd.read_csv("X_test.csv")

# Predict clusters
cluster_labels = kmeans.predict(X)
df_with_clusters = X.copy()
df_with_clusters['cluster'] = cluster_labels

# Title
st.title("🔍 AI-Powered Lead Scoring Dashboard")

# Sidebar
lead_idx = st.sidebar.selectbox("Select Lead Index", range(len(X)))
lead = X.iloc[lead_idx:lead_idx+1]
lead_cluster = cluster_labels[lead_idx]
prediction = model.predict_proba(lead)[0][1]

st.sidebar.markdown(f"""
**Prediction:** `{prediction:.2f}`
**Cluster:** `Cluster {lead_cluster}`
""")

# Main area
st.subheader(f"📊 Lead #{lead_idx} Prediction Summary")
st.markdown(f"""
- 🎯 **Predicted Conversion Probability:** `{prediction:.2f}`
- 🧬 **Assigned Cluster:** `Cluster {lead_cluster}`
""")

# Lead feature values
st.subheader("🔎 Lead Feature Values")
st.dataframe(lead.T, use_container_width=True)

# Cluster summary
st.subheader("🧠 Cluster Analysis")
col1, col2 = st.columns(2)

with col1:
    st.markdown("**🔹 Mean Feature Values by Cluster**")
    st.dataframe(df_with_clusters.groupby("cluster").mean().round(2))

with col2:
    st.markdown("**🔸 Cluster Distribution**")
    fig, ax = plt.subplots()
    sns.countplot(x="cluster", data=df_with_clusters, ax=ax)
    ax.set_title("Number of Leads per Cluster")
    st.pyplot(fig)

# Google GenAI (Gemini) for personalized message
st.subheader("💡 Google GenAI ")
use_genai = st.checkbox("Generate a personalized message for this lead using GenAI?")
import google.generativeai as genai

with open("cluster_descriptions.json", "r") as f:
    cluster_descriptions = json.load(f)

if use_genai:
    # Replace with your actual key
    genai.configure(api_key="AIzaSyBiwkASZfqswK5Mon86kOpt9MuSUft9las")
    model_genai = genai.GenerativeModel("gemini-2.5-pro")

    cluster_desc = cluster_descriptions.get(str(lead_cluster), "No description available.")

# Create your prompt incorporating the cluster description
    prompt = f"""
    You are a marketing expert. This lead belongs to cluster {lead_cluster} with the following characteristics:
    {cluster_desc}
    The predicted conversion probability for this lead is {prediction:.2f}.
    Generate a short, compelling personalized message to engage this lead.
    """

    response = model_genai.generate_content(f"generate a short, compelling personalized message to engage this lead:\n{prompt}")

    st.markdown("**📢 Suggested Message:**")
    st.success(response.text if hasattr(response, 'text') else "No response.")


Writing app.py


In [8]:
!rm /root/.config/ngrok/ngrok.yml
from pyngrok import ngrok
!ngrok authtoken '30RJjlYWNfx0mkfwxuJ8wXNlXvU_5XxfD5fQ7V969PK6tY8FD'

# Kill existing tunnels (if any)
ngrok.kill()

# Run Streamlit app in background

get_ipython().system_raw('streamlit run app.py &')

# Open tunnel
public_url = ngrok.connect(8501, "http")
print(f"Streamlit app URL: {public_url}")


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app URL: NgrokTunnel: "https://fef36c808b28.ngrok-free.app" -> "http://localhost:8501"
