In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

### 📊 Generating real-time simulated dataset

In [4]:
# For reproducibility
random.seed(42)
np.random.seed(42)

# Base config
products = {
    "Laptop": (800, 1500),
    "Smartphone": (300, 1000),
    "Monitor": (150, 500),
    "Headphones": (20, 200),
    "Keyboard": (30, 150)
}
regions = ["North", "South", "East", "West"]
device_types = ["Mobile", "Desktop", "Tablet"]

In [5]:
def simulate_entry():
    product = random.choice(list(products.keys()))
    price = round(np.random.normal(*products[product]), 2)
    clicks = np.random.poisson(8)
    region = random.choice(regions)
    device = random.choices(device_types, weights=[0.6, 0.3, 0.1])[0]
    user_age = int(np.clip(np.random.normal(35, 12), 18, 70))
    session_time = max(1, np.random.exponential(scale=10))  # in minutes
    is_returning_user = random.choices([True, False], weights=[0.4, 0.6])[0]

    # Add noise and generate probabilistic target variable
    noise = np.random.normal(0, 0.2)
    value_score = (price / 1000) + (clicks / 20) + (session_time / 30) + (is_returning_user * 0.2) + noise
    value = "high" if value_score > 1.6 else "low"

    return {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "product": product,
        "price": max(10, price),
        "clicks": max(0, clicks),
        "region": region,
        "device_type": device,
        "user_age": user_age,
        "session_time": round(session_time, 2),
        "is_returning_user": is_returning_user,
        "value": value
    }


In [6]:
def generate_dataset(n=1000):
    return pd.DataFrame([simulate_entry() for _ in range(n)])

# Generate and preview
df = generate_dataset(1000)
df.tail()

Unnamed: 0,timestamp,product,price,clicks,region,device_type,user_age,session_time,is_returning_user,value
995,2025-04-14 15:35:18,Keyboard,10.0,7,East,Tablet,38,15.47,True,low
996,2025-04-14 15:35:18,Monitor,10.0,10,North,Mobile,31,3.83,False,low
997,2025-04-14 15:35:18,Laptop,1687.36,8,North,Mobile,48,2.39,False,high
998,2025-04-14 15:35:18,Keyboard,10.0,6,North,Mobile,30,6.25,False,low
999,2025-04-14 15:35:18,Monitor,10.0,7,South,Mobile,50,14.62,False,low


In [7]:
df.shape

(1000, 10)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          1000 non-null   object 
 1   product            1000 non-null   object 
 2   price              1000 non-null   float64
 3   clicks             1000 non-null   int64  
 4   region             1000 non-null   object 
 5   device_type        1000 non-null   object 
 6   user_age           1000 non-null   int64  
 7   session_time       1000 non-null   float64
 8   is_returning_user  1000 non-null   bool   
 9   value              1000 non-null   object 
dtypes: bool(1), float64(2), int64(2), object(5)
memory usage: 71.4+ KB


In [9]:
df.duplicated().sum()

0

In [10]:
df.isnull().sum()

timestamp            0
product              0
price                0
clicks               0
region               0
device_type          0
user_age             0
session_time         0
is_returning_user    0
value                0
dtype: int64

In [11]:
df.value.value_counts()

value
low     788
high    212
Name: count, dtype: int64

In [None]:
#Saving the dataset
df.to_csv("realtime_sales.csv")

#### 🧹 Preprocessing 

In [None]:

X = df.drop(columns=["timestamp", "value"])  # Features
y = df["value"]  # Target



In [14]:
# Apply One-Hot Encoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('region_device', OneHotEncoder(), ['region', 'device_type']),
        ('num_features', 'passthrough', ['price', 'clicks', 'user_age', 'session_time', 'is_returning_user'])
    ])



### 🧠 Model building and training

In [15]:
# Create a pipeline (preprocessing + classification)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])



In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model using Joblib
joblib.dump(model, 'customer_value_model.joblib')
print("Model saved to 'customer_value_model.joblib'")

Accuracy: 0.9333333333333333
Classification Report:
              precision    recall  f1-score   support

        high       0.86      0.80      0.83        61
         low       0.95      0.97      0.96       239

    accuracy                           0.93       300
   macro avg       0.91      0.88      0.89       300
weighted avg       0.93      0.93      0.93       300

Model saved to 'customer_value_model.joblib'


#### 🔁 Resampling the data

In [None]:
# Apply RandomOverSampler to balance the classes in the training data
from imblearn.over_sampling import RandomOverSampler 
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Train the model on the resampled dataset
model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model using Joblib
joblib.dump(model, 'resampled_customer_value_model.joblib')
print("Model saved to 'resampled_customer_value_model.joblib'")

Accuracy: 0.9366666666666666
Classification Report:
              precision    recall  f1-score   support

        high       0.83      0.87      0.85        61
         low       0.97      0.95      0.96       239

    accuracy                           0.94       300
   macro avg       0.90      0.91      0.90       300
weighted avg       0.94      0.94      0.94       300

Model saved to 'resampled_customer_value_model.joblib'
