In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# --- 1. Generate Synthetic Data ---
data_size = 100
ad_spend = np.random.uniform(1000, 10000, data_size)
discounts = np.random.uniform(0, 0.2, data_size) # Discount percentage
customer_footfall = np.random.uniform(500, 5000, data_size)

# Define the relationship for monthly sales (target variable)
# sales = base + 15*ad_spend - 50000*discounts + 25*footfall + noise
y = (10000 + # Base sales
     15 * ad_spend +
     -50000 * discounts + # Discounts negatively impact direct revenue per sale
     25 * customer_footfall +
     np.random.normal(0, 20000, data_size)) # Adding noise

X = pd.DataFrame({
    'ad_spend': ad_spend,
    'discounts': discounts,
    'customer_footfall': customer_footfall
})

# --- 2. Setup Model and K-Fold Cross-Validation ---
# All features are numerical, so we just need to scale them.
pipeline = make_pipeline(
    StandardScaler(),
    LinearRegression()
)

# Define the number of folds
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# --- 3. Run K-Fold Cross-Validation ---
# Use cross_val_score to get the scores for each fold
mse_scores_neg = cross_val_score(pipeline, X, y, cv=kf, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive
mse_scores = -mse_scores_neg

print(f"--- Running {n_splits}-Fold Cross-Validation ---")
for i, mse in enumerate(mse_scores):
    print(f"Fold {i+1}: MSE = {mse:,.2f} (RMSE = {np.sqrt(mse):,.2f})")

# --- 4. Display Final Results ---
average_mse = np.mean(mse_scores)
std_dev_mse = np.std(mse_scores)

print("\n--- Cross-Validation Results ---")
print(f"Average MSE: {average_mse:,.2f}")
print(f"Average RMSE (prediction error): {np.sqrt(average_mse):,.2f}")
print(f"Standard Deviation of MSE: {std_dev_mse:,.2f}")

# --- 5. Optional: Train Final Model and Inspect Coefficients ---
final_model = pipeline.fit(X, y)

# Get coefficients from the LinearRegression step in the pipeline
coefficients = final_model.named_steps['linearregression'].coef_
intercept = final_model.named_steps['linearregression'].intercept_

print("\n--- Final Model (Trained on all data) ---")
print(f"Intercept (Base Sales): {intercept:,.2f}")
print("Coefficients (for scaled features):")
for feature, coef in zip(X.columns, coefficients):
    print(f"  {feature}: {coef:,.2f}")



--- Running 5-Fold Cross-Validation ---
Fold 1: MSE = 840,631,824.48 (RMSE = 28,993.65)
Fold 2: MSE = 298,257,488.74 (RMSE = 17,270.13)
Fold 3: MSE = 270,963,033.60 (RMSE = 16,460.95)
Fold 4: MSE = 162,767,899.37 (RMSE = 12,758.05)
Fold 5: MSE = 559,697,080.05 (RMSE = 23,657.92)

--- Cross-Validation Results ---
Average MSE: 426,463,465.25
Average RMSE (prediction error): 20,650.99
Standard Deviation of MSE: 244,719,894.58

--- Final Model (Trained on all data) ---
Intercept (Base Sales): 155,622.79
Coefficients (for scaled features):
  ad_spend: 42,422.05
  discounts: -1,699.98
  customer_footfall: 34,768.09
