In [1]:
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 1.9/1.9 MB 13.3 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.1.0


In [106]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker for generating random data
faker = Faker()

# Define parameters for synthetic data
n_samples = 5000  # Number of rows in the dataset
np.random.seed(42)  # For reproducibility

# Generate anonymized behavioral features
data = {
    "session_id": [faker.uuid4() for _ in range(n_samples)],  # Unique session IDs
    "click_count": np.random.poisson(10, n_samples),  # Number of clicks in a session
    "scroll_depth": np.random.uniform(0.1, 1.0, n_samples),  # Scroll depth (0 to 1)
    "dwell_time": np.random.exponential(300, n_samples),  # Time spent on site (seconds)
    "device_type": np.random.choice(["mobile", "desktop", "tablet"], n_samples),  # Device type
    "browser": np.random.choice(["Chrome", "Firefox", "Safari", "Edge"], n_samples),  # Browser type
    'session_duration': np.random.exponential(scale=120, size=n_samples),
}

# Generate demographic labels (age bracket, gender, affluence)
data["age_group"] = np.random.choice(["18-24", "25-34", "35-44", "45-54", "55+"], n_samples)
data["gender"] = np.random.choice(["male", "female"], n_samples)
data["affluence"] = np.random.choice(["low", "medium", "high"], n_samples)

# Generate behavioral personas (e.g., browsing style, purchase intent)
data["browsing_style"] = np.random.choice(
    ["scanner", "deep_reader", "explorer"], n_samples,
    p=[0.5, 0.3, 0.2]  # Probabilities for each persona type
)

# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)

# Save to CSV (optional)
df.to_csv("synthetic_behavioral_data.csv", index=False)

# Display the first few rows of the dataset
print(df.head())


                             session_id  click_count  scroll_depth  \
0  652ad803-e8cb-4c92-841c-079a9c976a77           12      0.583884   
1  618a094c-f2f5-45af-b8bc-548d4ba8629c            6      0.633795   
2  416395fd-5392-44c0-97e2-fff56da22ac2           11      0.589141   
3  e2da8f4b-90fc-449d-abc0-0a61d6a9dc87           14      0.189151   
4  80f57a9e-3fc6-4ade-ab36-9da9c143dab0            7      0.386919   

   dwell_time device_type  browser  session_duration age_group  gender  \
0  333.379533     desktop  Firefox        115.950899       55+    male   
1  394.739556      mobile     Edge        466.373325     25-34    male   
2  501.092183     desktop     Edge         44.936362     35-44    male   
3  458.683433      mobile     Edge        375.734567       55+  female   
4  343.884017      tablet   Safari         19.323722     45-54  female   

  affluence browsing_style  
0      high        scanner  
1       low       explorer  
2       low        scanner  
3       low       

In [107]:
df['Engagement_score']=df['dwell_time']*df['scroll_depth']
df['Engagement_score'].head()

0    194.654848
1    250.184099
2    295.213705
3     86.760334
4    133.055097
Name: Engagement_score, dtype: float64

In [108]:
df['click_rate'] = df['click_count'] / (df['session_duration'] + 1)
df['click_rate'].head()

0    0.102607
1    0.012838
2    0.239462
3    0.037161
4    0.344425
Name: click_rate, dtype: float64

In [109]:
df = df[df['dwell_time'] < 300]
df.head()

Unnamed: 0,session_id,click_count,scroll_depth,dwell_time,device_type,browser,session_duration,age_group,gender,affluence,browsing_style,Engagement_score,click_rate
5,67d8ba89-725f-42d5-a313-84f1d33f0b23,8,0.146143,173.764281,desktop,Firefox,173.271557,55+,male,medium,explorer,25.394485,0.045905
7,fc718835-9d5c-4027-9044-dd8bcf2790af,11,0.269739,252.545109,desktop,Firefox,203.724142,55+,female,high,scanner,68.121275,0.053731
8,5ab3f4c8-5c22-4469-9eae-8350f57ec46a,8,0.605469,67.98332,tablet,Firefox,3.492606,55+,male,medium,explorer,41.161812,1.780704
9,3e9336e4-e515-4b3b-8011-cec63f22047f,10,0.723528,103.937984,tablet,Safari,9.762263,18-24,female,medium,deep_reader,75.201995,0.929173
12,450c091c-b7eb-493b-bcd1-ceb17eb53b6a,11,0.4486,59.067401,tablet,Safari,110.838957,25-34,male,medium,explorer,26.497641,0.098356


In [110]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_cols = [ 'Engagement_score']
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head(5)

Unnamed: 0,session_id,click_count,scroll_depth,dwell_time,device_type,browser,session_duration,age_group,gender,affluence,browsing_style,Engagement_score,click_rate
5,67d8ba89-725f-42d5-a313-84f1d33f0b23,8,0.146143,173.764281,desktop,Firefox,173.271557,55+,male,medium,explorer,0.085192,0.045905
7,fc718835-9d5c-4027-9044-dd8bcf2790af,11,0.269739,252.545109,desktop,Firefox,203.724142,55+,female,high,scanner,0.228548,0.053731
8,5ab3f4c8-5c22-4469-9eae-8350f57ec46a,8,0.605469,67.98332,tablet,Firefox,3.492606,55+,male,medium,explorer,0.138094,1.780704
9,3e9336e4-e515-4b3b-8011-cec63f22047f,10,0.723528,103.937984,tablet,Safari,9.762263,18-24,female,medium,deep_reader,0.252305,0.929173
12,450c091c-b7eb-493b-bcd1-ceb17eb53b6a,11,0.4486,59.067401,tablet,Safari,110.838957,25-34,male,medium,explorer,0.088894,0.098356


In [111]:
def label_persona(row):
    if row['scroll_depth'] > 0.8 and row['dwell_time'] > 90:
        return "High Intent"
    elif row['click_rate'] > 1.0 and row['scroll_depth'] < 30:
        return "Window Shopper"
    else:
        return "Casual Visitor"

df['persona'] = df.apply(label_persona, axis=1)

In [112]:
df['persona'].value_counts()['Casual Visitor']

2479

In [113]:
df['persona'].value_counts()['Window Shopper']

168

In [114]:
df['persona'].value_counts()['High Intent']


416

In [115]:
df.head()

Unnamed: 0,session_id,click_count,scroll_depth,dwell_time,device_type,browser,session_duration,age_group,gender,affluence,browsing_style,Engagement_score,click_rate,persona
5,67d8ba89-725f-42d5-a313-84f1d33f0b23,8,0.146143,173.764281,desktop,Firefox,173.271557,55+,male,medium,explorer,0.085192,0.045905,Casual Visitor
7,fc718835-9d5c-4027-9044-dd8bcf2790af,11,0.269739,252.545109,desktop,Firefox,203.724142,55+,female,high,scanner,0.228548,0.053731,Casual Visitor
8,5ab3f4c8-5c22-4469-9eae-8350f57ec46a,8,0.605469,67.98332,tablet,Firefox,3.492606,55+,male,medium,explorer,0.138094,1.780704,Window Shopper
9,3e9336e4-e515-4b3b-8011-cec63f22047f,10,0.723528,103.937984,tablet,Safari,9.762263,18-24,female,medium,deep_reader,0.252305,0.929173,Casual Visitor
12,450c091c-b7eb-493b-bcd1-ceb17eb53b6a,11,0.4486,59.067401,tablet,Safari,110.838957,25-34,male,medium,explorer,0.088894,0.098356,Casual Visitor


In [116]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


# === 2. Define features and targets ===
features = ['device_type', 'browser', 'scroll_depth', 'dwell_time', 'click_rate', 'session_duration']

X = df[features]
y_age = df['age_group']
y_affluence = df['affluence']

# === 3. Preprocessing ===
num_features = ['scroll_depth', 'dwell_time', 'click_rate', 'session_duration']
cat_features = ['device_type', 'browser']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# === 4. Random Forest for Age Group Prediction ===
age_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y_age, test_size=0.2, stratify=y_age, random_state=42)

age_pipeline.fit(X_train, y_train)

print("Age Group RF Accuracy:", age_pipeline.score(X_test, y_test))

joblib.dump(age_pipeline, 'models/rf_age_group.pkl')

# === 5. Random Forest for Affluence Prediction ===
affluence_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y_affluence, test_size=0.2, stratify=y_affluence, random_state=42)

affluence_pipeline.fit(X_train, y_train)

print("Affluence RF Accuracy:", affluence_pipeline.score(X_test, y_test))

joblib.dump(affluence_pipeline, 'models/rf_affluence.pkl')

# === 6. KMeans Clustering for Behavior Segments ===
X_processed = preprocessor.fit_transform(X)  # same preprocessing

kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_processed)

df['behavior_cluster'] = clusters
joblib.dump(kmeans, 'models/kmeans_behavior.pkl')
joblib.dump(preprocessor, 'models/preprocessor.pkl')  # reuse later


Age Group RF Accuracy: 0.2137030995106036
Affluence RF Accuracy: 0.35073409461663946


['models/preprocessor.pkl']

In [117]:
# View cluster centers (in scaled + encoded space)
import numpy as np

for cluster_id in range(3):
    print(f"Cluster {cluster_id} stats:")
    print(df[df['behavior_cluster'] == cluster_id][num_features].mean())
    print("-" * 30)

# Assign manually based on browsing behavior patterns
cluster_to_persona = {
    0: "Window Shopper",
    1: "Deal Seeker",
    2: "High-Intent Buyer"
}

df['persona'] = df['behavior_cluster'].map(cluster_to_persona)

# Save persona map
joblib.dump(cluster_to_persona, 'models/persona_map.pkl')


Cluster 0 stats:
scroll_depth          0.574227
dwell_time          213.010481
click_rate            0.248807
session_duration    122.167536
dtype: float64
------------------------------
Cluster 1 stats:
scroll_depth          0.535493
dwell_time           64.312701
click_rate            0.225340
session_duration    120.334287
dtype: float64
------------------------------
Cluster 2 stats:
scroll_depth          0.583824
dwell_time          129.222942
click_rate            4.472161
session_duration      1.933485
dtype: float64
------------------------------


['models/persona_map.pkl']

In [120]:

import joblib
import pandas as pd

# Load models
age_model = joblib.load('models/rf_age_group.pkl')
affluence_model = joblib.load('models/rf_affluence.pkl')
kmeans_model = joblib.load('models/kmeans_behavior.pkl')
preprocessor = joblib.load('models/preprocessor.pkl')
persona_map = joblib.load('models/persona_map.pkl')

def predict_user_segment(user_data: dict):
    """
    Input user_data:
    {
        "device_type": "mobile",
        "browser": "chrome",
        "scroll_depth": 65.2,
        "dwell_time": 14.5,
        "click_rate": 0.3,
        "session_duration": 120,
        "pages_viewed": 5
    }
    """
    df = pd.DataFrame([user_data])

    # Transform input
    X_processed = preprocessor.transform(df)

    # Predict
    age_group = age_model.predict(df)[0]
    affluence = affluence_model.predict(df)[0]
    behavior_cluster = kmeans_model.predict(X_processed)[0]
    persona = persona_map.get(behavior_cluster, "Unknown")

    return {
        "age_group": age_group,       
        "affluence": affluence,
        "behavior_cluster": int(behavior_cluster),
        "persona": persona
    }


In [121]:
user = {
    "device_type": "desktop",
    "browser": "firefox",
    "scroll_depth": 72.3,
    "dwell_time": 18.5,
    "click_rate": 0.45,
    "session_duration": 155,
}

print(predict_user_segment(user))


{'age_group': '18-24', 'affluence': 'medium', 'behavior_cluster': 0, 'persona': 'Window Shopper'}
