In [None]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Load data: read the four source tables, in this order, into DataFrames.
customers, transactions, clickstream, reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/synthetic_customers.csv",
        "/content/synthetic_transactions.csv",
        "/content/synthetic_clickstream.csv",
        "/content/synthetic_reviews.csv",
    )
)

def prepare_features(
    customers: pd.DataFrame,
    transactions: pd.DataFrame,
    clickstream: pd.DataFrame,
    reviews: pd.DataFrame,
) -> pd.DataFrame:
    """Attach per-customer activity aggregates to the customer table.

    Transactions, clickstream sessions and reviews are each aggregated by
    ``CustomerID`` and left-joined onto ``customers``, so every row of
    ``customers`` is kept even when it has no activity in a given table.

    None of the input DataFrames is modified.

    Args:
        customers: Customer table; must contain ``CustomerID``.
        transactions: Must contain ``CustomerID``, ``TransactionID``,
            ``Quantity`` and ``Price``.
        clickstream: Must contain ``CustomerID``, ``SessionID`` and
            ``Duration``.
        reviews: Must contain ``CustomerID``, ``ReviewID`` and ``Rating``.

    Returns:
        A new DataFrame with the columns of ``customers`` plus
        ``total_transactions``, ``total_revenue``, ``avg_order_value``,
        ``total_sessions``, ``avg_session_duration``, ``avg_rating`` and
        ``review_count``. Every missing value in the result (in any column,
        including those coming from ``customers``) is replaced with 0, so a
        customer without reviews gets ``avg_rating == 0``.
    """
    # Transactions: compute revenue on a copy (``assign`` returns a new
    # frame) so the caller's ``transactions`` does not gain a column.
    txn_with_revenue = transactions.assign(
        Revenue=transactions["Quantity"] * transactions["Price"]
    )
    txn = txn_with_revenue.groupby("CustomerID").agg(
        total_transactions=("TransactionID", "count"),
        total_revenue=("Revenue", "sum"),
        avg_order_value=("Revenue", "mean"),
    )

    # Clickstream
    click = clickstream.groupby("CustomerID").agg(
        total_sessions=("SessionID", "count"),
        avg_session_duration=("Duration", "mean"),
    )

    # Reviews
    rev = reviews.groupby("CustomerID").agg(
        avg_rating=("Rating", "mean"),
        review_count=("ReviewID", "count"),
    )

    # Merge: left joins keep customers that have no rows in an activity table;
    # their aggregate columns come out as NaN and are zero-filled below.
    model_data = (
        customers
        .merge(txn, on="CustomerID", how="left")
        .merge(click, on="CustomerID", how="left")
        .merge(rev, on="CustomerID", how="left")
    )

    return model_data.fillna(0)


# Feature engineering: build the per-customer feature table
data = prepare_features(customers, transactions, clickstream, reviews)

# ---------------------
# Create CHURN label
# ---------------------
# A customer is labelled churned (1) when their session count OR their
# transaction count is strictly below the respective median over all rows.
# NOTE(review): this label is a deterministic function of total_sessions and
# total_transactions, and both columns are also in `features` below, so the
# evaluation report measures how well the model recovers this rule rather
# than real churn behaviour. Revisit the label source or the feature set.
sessions_median = data["total_sessions"].median()
transactions_median = data["total_transactions"].median()
below_median_sessions = data["total_sessions"] < sessions_median
below_median_transactions = data["total_transactions"] < transactions_median
data["churn"] = (below_median_sessions | below_median_transactions).astype(int)


# Features: model input columns (review_count is not included)
features = [
    "total_transactions",
    "total_revenue",
    "avg_order_value",
    "total_sessions",
    "avg_session_duration",
    "avg_rating",
]

X = data[features]
y = data["churn"]

# Train-test split: 25% held out, stratified on the label, fixed seed
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Train model (fit() returns the fitted estimator itself)
churn_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = churn_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(churn_model, "churn_model.pkl")

print("✅ churn_model.pkl saved")


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       456
           1       1.00      1.00      1.00       794

    accuracy                           1.00      1250
   macro avg       1.00      1.00      1.00      1250
weighted avg       1.00      1.00      1.00      1250

✅ churn_model.pkl saved
