
# Phase 5 — Advanced Analytics & ML
This notebook runs K-Means RFM clustering, Linear Regression (sales prediction), Logistic Regression (churn prediction), and Decision Tree (category classification). It also computes evaluation metrics and feature importances. 
Generated automatically.


In [None]:
import os
from pathlib import Path

import pandas as pd

# Load cleaned data
# The CSV location can be overridden through the CLEAN_DATA_PATH environment
# variable; when it is unset the original hard-coded location is used.
path = Path(os.environ.get('CLEAN_DATA_PATH', '/mnt/data/clean_data_phase2.csv'))
df = pd.read_csv(path)
# errors='coerce' turns unparseable dates into NaT instead of raising, so
# downstream code must be prepared for missing Purchase_Date values.
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'], errors='coerce')
df.head()

In [None]:
# RFM and KMeans
# Builds one Recency / Frequency / Monetary row per User_ID, standardises the
# three features (Monetary on a log scale) and assigns each user to one of
# three K-Means clusters.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

# Purchase_Date is parsed with errors='coerce', so it may contain NaT. A user
# whose dates are all NaT would get a NaN Recency, which KMeans rejects, so
# those rows are excluded before aggregating.
rfm_source = df.dropna(subset=['Purchase_Date'])

# Recency is measured against the most recent purchase in the data set.
now_date = rfm_source['Purchase_Date'].max()
rfm = (
    rfm_source.groupby('User_ID')
    .agg(
        Recency=('Purchase_Date', lambda x: (now_date - x.max()).days),
        # Purchase_Date is non-null here, so its count is the row count per user.
        Frequency=('Purchase_Date', 'count'),
        Monetary=('Final_Price(Rs.)', 'sum'),
    )
    .reset_index()
)

# log(0) is undefined: zero spend is nudged to a small positive value first.
rfm['Monetary'] = rfm['Monetary'].replace(0,0.001)
rfm['Monetary_log'] = np.log(rfm['Monetary'])

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency','Frequency','Monetary_log']])
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(rfm_scaled)
rfm['Cluster'] = kmeans.labels_
rfm.head()

In [None]:
# Linear Regression for Final Price
# Fits an ordinary least-squares model of Final_Price(Rs.) on the listed
# features and reports RMSE and R2 on a 20% hold-out split.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

features = ['Price (Rs.)','Discount (%)','Payment_Method_Encoded','Category_Encoded']
target_col = 'Final_Price(Rs.)'

# Drop rows missing ANY model input or the target; LinearRegression does not
# accept NaN, and every column in `features` is passed to it.
model_df = df.dropna(subset=features + [target_col])
X = model_df[features]
y = model_df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
lr = LinearRegression().fit(X_train,y_train)
y_pred = lr.predict(X_test)

# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and rejected by newer scikit-learn releases.
print('RMSE:', mean_squared_error(y_test,y_pred) ** 0.5)
print('R2:', r2_score(y_test,y_pred))

In [None]:
# Churn prediction (proxy): no purchase in last 90 days -> churn
# Labels each user as churned when their last purchase is more than
# CHURN_WINDOW_DAYS before the latest purchase in the data, then fits a
# logistic regression on per-user RFM features and reports hold-out metrics.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

CHURN_WINDOW_DAYS = 90

last_purchase = df.groupby('User_ID')['Purchase_Date'].max().reset_index().rename(columns={'Purchase_Date':'LastPurchase'})
now_date = df['Purchase_Date'].max()
last_purchase['DaysSince'] = (now_date - last_purchase['LastPurchase']).dt.days
last_purchase['Churn'] = (last_purchase['DaysSince'] > CHURN_WINDOW_DAYS).astype(int)
churn_df = last_purchase.merge(rfm[['User_ID','Recency','Frequency','Monetary']], on='User_ID', how='left').fillna(0)

# Recency is deliberately NOT a model input: it is the same
# days-since-last-purchase quantity that the Churn label is thresholded on, so
# including it lets the model read the label straight from the feature and
# makes the reported metrics meaningless.
churn_features = ['Frequency','Monetary']
Xc = churn_df[churn_features]
yc = churn_df['Churn']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc,yc,test_size=0.2,random_state=42)
log = LogisticRegression(max_iter=1000).fit(Xc_train,yc_train)
yc_pred = log.predict(Xc_test)
print('Accuracy:', accuracy_score(yc_test,yc_pred))
print('Precision:', precision_score(yc_test,yc_pred, zero_division=0))
print('Recall:', recall_score(yc_test,yc_pred, zero_division=0))

In [None]:
# Decision Tree for Category classification
# Fits a depth-limited decision tree that predicts Category_Encoded from price,
# discount and payment-method columns, and reports hold-out metrics.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

dt_features = ['Price (Rs.)','Discount (%)','Payment_Method_Encoded','Final_Price(Rs.)']
dt_target = 'Category_Encoded'

# Drop rows missing ANY input or the target, so every column handed to the
# tree gets the same missing-value treatment.
clf_df = df.dropna(subset=dt_features + [dt_target])
Xc2 = clf_df[dt_features]
yc2 = clf_df[dt_target]
Xc2_train, Xc2_test, yc2_train, yc2_test = train_test_split(Xc2,yc2,test_size=0.2,random_state=42)
dt = DecisionTreeClassifier(max_depth=6, random_state=42).fit(Xc2_train,yc2_train)
yc2_pred = dt.predict(Xc2_test)
print('Accuracy:', accuracy_score(yc2_test,yc2_pred))
# Weighted averaging is used because Category_Encoded is multi-class.
print('Precision weighted:', precision_score(yc2_test,yc2_pred, average='weighted', zero_division=0))
print('Recall weighted:', recall_score(yc2_test,yc2_pred, average='weighted', zero_division=0))

In [None]:
# Feature importance
# Pair each decision-tree input column with its `feature_importances_` score.
dt_importances = {
    column: score
    for column, score in zip(Xc2.columns, dt.feature_importances_)
}
print('Decision Tree importances:', dt_importances)

# Pair each regression feature with its fitted coefficient. The regression is
# fitted on unscaled inputs, so each coefficient is in the units of its feature.
lr_coefficients = {
    name: weight
    for name, weight in zip(features, lr.coef_)
}
print('Linear regression coefficients:', lr_coefficients)

In [None]:
import json

# Hard-coded snapshot of results. These literals are NOT recomputed from the
# models fitted in the cells above, so they can drift from what those cells
# print on a fresh run.
# NOTE(review): every churn_model score is exactly 1.0 — confirm the label is
# not leaking into the features before relying on these numbers.
summary = {
  "kmeans_centers": [
    {
      "Recency": 89.84037558685446,
      "Frequency": 1.0,
      "Monetary_log": 5.45869636993016,
      "Monetary": 234.79114397503307
    },
    {
      "Recency": 149.48809523809524,
      "Frequency": 1.0,
      "Monetary_log": 3.6091209436779006,
      "Monetary": 36.93357185078258
    },
    {
      "Recency": 260.45544554455444,
      "Frequency": 1.0,
      "Monetary_log": 5.348639778073162,
      "Monetary": 210.3220185877579
    }
  ],
  "linear_regression": {
    "rmse": 20.923202152297407,
    "r2": 0.9715276365317821
  },
  "churn_model": {
    "accuracy": 1.0,
    "precision": 1.0,
    "recall": 1.0
  },
  "decision_tree": {
    "accuracy": 0.07,
    "precision_weighted": 0.10694638694638695,
    "recall_weighted": 0.07
  },
  "feature_importances_dt": {
    "Price (Rs.)": 0.24617532476847254,
    "Discount (%)": 0.07176980030405054,
    "Payment_Method_Encoded": 0.1295797214122502,
    "Final_Price(Rs.)": 0.5524751535152267
  },
  "feature_importances_lr": {
    "Price (Rs.)": 0.8000491689279354,
    "Discount (%)": -2.5044746229958634,
    "Payment_Method_Encoded": -0.9851240075372435,
    "Category_Encoded": -0.6651292481308503
  }
}
# Emit valid, indented JSON rather than the Python dict repr, so the output can
# be copied into other tools; this is also what the json import is for.
print(json.dumps(summary, indent=2))