In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pickle


In [13]:

# --- Load & prep ---
df = pd.read_excel(r"C:\Users\SUSHIL KUMAR\Desktop\retailsales_analysis\data\Online Retail.xlsx")
df = df.dropna(subset=['CustomerID'])
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype(int)
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']



KeyboardInterrupt: 

In [3]:
# Snapshot and RFM features
snapshot = df['InvoiceDate'].max() + pd.Timedelta(days=1)
cust = df.groupby('CustomerID').agg(
    LastPurchase=('InvoiceDate', 'max'),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
    QuantitySum=('Quantity', 'sum'),
    AvgBasket=('TotalPrice', 'mean'),
    Countries=('Country', lambda x: x.mode().iat[0] if not x.mode().empty else 'Unknown')
).reset_index()

cust['Recency'] = (snapshot - cust['LastPurchase']).dt.days



In [4]:
# Label churn: inactive > 180 days
cust['Churn'] = (cust['Recency'] > 180).astype(int)



In [5]:
# One-hot for top countries (keep top 5 to avoid sparse dummies)
top_countries = cust['Countries'].value_counts().head(5).index
cust['Countries'] = np.where(cust['Countries'].isin(top_countries), cust['Countries'], 'Other')
X = cust[['Recency','Frequency','Monetary','QuantitySum','AvgBasket','Countries']]
X = pd.get_dummies(X, columns=['Countries'], drop_first=True)
y = cust['Churn']



In [6]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)



In [8]:
# Scale for LR
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


In [10]:

# Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)
print("Logistic Regression")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))



Logistic Regression
[[870   0]
 [  0 215]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       870
           1       1.00      1.00      1.00       215

    accuracy                           1.00      1085
   macro avg       1.00      1.00      1.00      1085
weighted avg       1.00      1.00      1.00      1085



In [11]:
# Random Forest
rf = RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))



Random Forest
[[870   0]
 [  1 214]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       870
           1       1.00      1.00      1.00       215

    accuracy                           1.00      1085
   macro avg       1.00      1.00      1.00      1085
weighted avg       1.00      1.00      1.00      1085



In [12]:
# Save models
with open(r"C:\Users\SUSHIL KUMAR\Desktop\retailsales_analysis\models\churn_logistic_model.pkl", "wb") as f:
    pickle.dump({'model': lr, 'scaler': scaler, 'features': X.columns.tolist()}, f)

with open(r"C:\Users\SUSHIL KUMAR\Desktop\retailsales_analysis\models\churn_random_forest.pkl", "wb") as f:
    pickle.dump({'model': rf, 'features': X.columns.tolist()}, f)

print("Saved: ../models/churn_logistic_model.pkl, ../models/churn_random_forest.pkl")


Saved: ../models/churn_logistic_model.pkl, ../models/churn_random_forest.pkl
