In [15]:
# Importing the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [16]:
# --- 1. Load Data ---
sales_df = pd.read_csv(r"C:\Users\Dell\OneDrive\Documents\OneDrive\Desktop\sales_20k_final_FINAL.csv")
customers_df = pd.read_csv(r"C:\Users\Dell\Documents\customers_simple_ids.csv")

In [17]:
customers_df.head(10)

Unnamed: 0,customer_id,join_date
0,C1,03-08-2022 23:07
1,C2,10-03-2023 22:16
2,C3,28-05-2023 20:58
3,C4,14-03-2021 23:07
4,C5,14-09-2022 19:06
5,C6,11-03-2021 17:13
6,C7,15-02-2023 23:07
7,C8,05-01-2022 00:52
8,C9,16-06-2022 01:14
9,C10,03-05-2022 19:07


In [18]:
sales_df.head()

Unnamed: 0,sale_id,store_id,product_id,customer_id,quantity,price,discount,sale_date,revenue,customer_class
0,1,10,63,C1804,3,38,14.04,25-06-2021 08:22,97.9944,Bronze
1,2,5,70,C1352,1,157,17.37,28-04-2021 04:48,129.7291,Bronze
2,3,14,3,C1602,5,159,0.29,04-01-2022 23:36,792.6945,Bronze
3,4,11,34,C980,2,250,16.95,19-06-2022 12:18,415.25,Bronze
4,5,12,87,C1842,3,477,10.02,23-02-2021 11:02,1287.6138,Silver


In [21]:
# --- 2. Feature Engineering & Data Preparation ---

# Convert sale_date to datetime to extract time features
sales_df['sale_date'] = pd.to_datetime(sales_df['sale_date'])
print("2.1 - Datetime conversion successful")

# Aggregate sales data per customer to create 'spending behavior' features
customer_features = sales_df.groupby('customer_id').agg(
    total_sales=('sale_id', 'count'),
    total_revenue=('revenue', 'sum'),
    average_discount=('discount', 'mean'),
    max_quantity=('quantity', 'max'),
    unique_products=('product_id', 'nunique')
).reset_index()

# Merge customer features with loyalty tier data
merged_df = pd.merge(customer_features, customers_df[['customer_id']], on='customer_id', how='left')

merged_df = pd.merge(
    merged_df,
    sales_df[['customer_id', 'customer_class']],
    on='customer_id',
    how='left'
)
print("2.3 - Data merging successful.")

# Convert average discount to a categorized class (e.g., Low, Medium, High)
bins = [0, 5, 12, merged_df['average_discount'].max() + 1]
labels = ['Low_Discount', 'Medium_Discount', 'High_Discount']
merged_df['discount_class'] = pd.cut(merged_df['average_discount'], bins=bins, labels=labels, right=False)
print("2.4 - Discount classification successful")

# Drop the original average_discount column as we've created a class target
merged_df.dropna(subset=['discount_class'], inplace=True) 
print("2.5 - Dropped rows with NaN discount_class")

2.1 - Datetime conversion successful
2.3 - Data merging successful.
2.4 - Discount classification successful
2.5 - Dropped rows with NaN discount_class


In [24]:
merged_df.head()

Unnamed: 0,customer_id,total_sales,total_revenue,average_discount,max_quantity,unique_products,customer_class,discount_class
0,C1,13,12666.0327,8.495385,5,13,Silver,Medium_Discount
1,C1,13,12666.0327,8.495385,5,13,Silver,Medium_Discount
2,C1,13,12666.0327,8.495385,5,13,Bronze,Medium_Discount
3,C1,13,12666.0327,8.495385,5,13,Silver,Medium_Discount
4,C1,13,12666.0327,8.495385,5,13,Bronze,Medium_Discount


In [25]:
# --- 3. Model Preparation (Feature and Target Selection) ---

# Features (X) - Make a copy to avoid SettingWithCopyWarning
X = merged_df[['total_sales', 'total_revenue', 'max_quantity', 'unique_products', 'customer_class']].copy()
# Target (y)
y = merged_df['discount_class']

# Encode 'loyalty_tier' and the Target variable
le_tier = LabelEncoder()
X['customer_class_encoded'] = le_tier.fit_transform(X['customer_class'])
X = X.drop('customer_class', axis=1)

# Scale numerical features
scaler = StandardScaler()
X_numerical = X.select_dtypes(include=np.number).drop('customer_class_encoded', axis=1)
X[X_numerical.columns] = scaler.fit_transform(X_numerical)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("3.1 - Data split into training and testing sets and the data is ready for modeling")

3.1 - Data split into training and testing sets and the data is ready for modeling


In [26]:
# --- 4. Model Training (Random Forest Classifier) ---
print("4.1 - Training Random Forest Classifier...")
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)
print("4.2 - Training Complete.")

4.1 - Training Random Forest Classifier...
4.2 - Training Complete.


In [27]:
# --- 5. Model Evaluation ---
y_pred = model.predict(X_test)

print("\n--- Model Evaluation ---")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Model Evaluation ---
Accuracy Score: 0.9868

Classification Report:
                 precision    recall  f1-score   support

  High_Discount       0.96      0.94      0.95       541
   Low_Discount       1.00      1.00      1.00        13
Medium_Discount       0.99      0.99      0.99      3446

       accuracy                           0.99      4000
      macro avg       0.98      0.98      0.98      4000
   weighted avg       0.99      0.99      0.99      4000



In [28]:
# --- 6. Feature Importance (Insight) ---
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
print("\nTop 5 Most Important Features:")
print(feature_importances.nlargest(5))


Top 5 Most Important Features:
total_revenue             0.647091
total_sales               0.172723
unique_products           0.135720
max_quantity              0.031134
customer_class_encoded    0.013332
dtype: float64
