In [None]:
import pandas as pd

# Load the dataset
df_sales = pd.read_csv('sales_20k_final.csv')

# Convert 'sale_date' to datetime
df_sales['sale_date'] = pd.to_datetime(df_sales['sale_date'])

# Calculate spend per transaction
# Formula: quantity * price * (1 - discount / 100)
df_sales['spend'] = df_sales['quantity'] * df_sales['price'] * (1 - df_sales['discount'] / 100)

# Extract Year and Quarter
df_sales['Year'] = df_sales['sale_date'].dt.year
df_sales['Quarter'] = df_sales['sale_date'].dt.to_period('Q')

# 1. Yearly Spend per Customer
yearly_spend = df_sales.groupby(['customer_id', 'Year'])['spend'].sum().reset_index()
yearly_spend.rename(columns={'spend': 'total_yearly_spend'}, inplace=True)

# 2. Quarterly Spend per Customer
quarterly_spend = df_sales.groupby(['customer_id', 'Quarter'])['spend'].sum().reset_index()
quarterly_spend.rename(columns={'spend': 'total_quarterly_spend'}, inplace=True)

# Displaying the first few rows of the results
print("Yearly Spend per Customer:")
print(yearly_spend.head())

print("\nQuarterly Spend per Customer:")
print(quarterly_spend.head())

Yearly Spend per Customer:
  customer_id  Year  total_yearly_spend
0          C1  2021             7293.60
1          C1  2022             3966.95
2          C1  2023              737.60
3         C10  2021             2338.20
4         C10  2022             4745.80

Quarterly Spend per Customer:
  customer_id Quarter  total_quarterly_spend
0          C1  2021Q1                 1144.0
1          C1  2021Q2                  568.1
2          C1  2021Q3                 2528.1
3          C1  2021Q4                 3053.4
4          C1  2022Q1                  281.2


In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 1. Load & prepare data
# -----------------------------
sales = pd.read_csv('sales_20k_final.csv', parse_dates=['sale_date'])
sales['sale_date'] = pd.to_datetime(sales['sale_date'])
sales['month'] = sales['sale_date'].dt.to_period('M').dt.to_timestamp()
sales['spend'] = sales['price'] * sales['quantity'] * (1 - sales.get('discount', 0)/100)

# monthly spend per customer
monthly = sales.groupby(['customer_id','month']).spend.sum().reset_index()

# pivot to wide format
all_months = pd.date_range(monthly['month'].min(), monthly['month'].max(), freq='MS')
pivot = monthly.pivot_table(index='customer_id', columns='month', values='spend', fill_value=0)

arr = pivot.values
n_customers, n_months = arr.shape

# -----------------------------
# 2. Create Targets
# -----------------------------
future_q = np.zeros_like(arr)
future_y = np.zeros_like(arr)

for t in range(n_months):
    if t+3 < n_months:
        future_q[:, t] = arr[:, t+1:t+4].sum(axis=1)
    if t+12 < n_months:
        future_y[:, t] = arr[:, t+1:t+13].sum(axis=1)

# -----------------------------
# 3. Lag features
# -----------------------------
def lag(arr, k):
    out = np.zeros_like(arr)
    out[:, k:] = arr[:, :-k]
    return out

lag1  = lag(arr, 1)
lag3  = lag(arr, 3)
lag6  = lag(arr, 6)
lag12 = lag(arr, 12)

# stack features
X = np.stack([arr, lag1, lag3, lag6, lag12], axis=-1).reshape(-1, 5)

y_q = future_q.reshape(-1)
y_y = future_y.reshape(-1)

# keep valid rows
valid_q = y_q > 0
valid_y = y_y > 0

X_q = X[valid_q]
y_q = y_q[valid_q]

X_y = X[valid_y]
y_y = y_y[valid_y]

# -----------------------------
# 4. Train Random Forest
# -----------------------------
rf_q = RandomForestRegressor(n_estimators=120, random_state=42)
rf_y = RandomForestRegressor(n_estimators=120, random_state=42)

rf_q.fit(X_q, y_q)
rf_y.fit(X_y, y_y)

# predictions
pred_q = rf_q.predict(X_q)
pred_y = rf_y.predict(X_y)

# -----------------------------
# 5. Print model performance
# -----------------------------
print("\n--- Next Quarter Spend Model ---")
print("MAE:", mean_absolute_error(y_q, pred_q))
print("RMSE:", mean_squared_error(y_q, pred_q)**0.5)
print("R2:", r2_score(y_q, pred_q))

print("\n--- Next Year Spend Model ---")
print("MAE:", mean_absolute_error(y_y, pred_y))
print("RMSE:", (mean_squared_error(y_y, pred_y))**0.5)
print("R2:", r2_score(y_y, pred_y))




--- Next Quarter Spend Model ---
MAE: 439.8682230653471
RMSE: 603.8508335172996
R2: 0.4454101406181491

--- Next Year Spend Model ---
MAE: 888.0108126642
RMSE: 1189.2497332278087
R2: 0.4017732132961931
