In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df= pd.read_csv("weekly_sales_dataset.csv")

In [3]:
df

Unnamed: 0,Advertising_Spend,Price,Competitor_Price,Weekly_Sales
0,559.61,43.37,57.30,1191.85
1,483.41,45.52,45.54,750.17
2,577.72,55.98,58.09,938.41
3,682.76,54.88,61.49,1360.39
4,471.90,49.83,54.89,839.55
...,...,...,...,...
295,416.85,45.92,56.54,857.81
296,607.95,47.84,50.83,1050.11
297,536.88,42.17,53.03,1132.23
298,597.54,46.45,60.45,1233.26


## model risk and counterfactual analysis

In [4]:
# Define features and target
X = df[["Advertising_Spend", "Price", "Competitor_Price"]]
y = df["Weekly_Sales"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5   # Manual RMSE
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)



MAE: 93.46242431692251
RMSE: 119.2346725385397
R2: 0.8005903658927164


In [5]:
# -------------------------------
# Counterfactual: +20% Advertising Spend
# -------------------------------

# Create a copy of test features
X_test_cf = X_test.copy()

# Increase Advertising_Spend by 20%
X_test_cf["Advertising_Spend"] = X_test_cf["Advertising_Spend"] * 1.20

# Predict original and counterfactual sales
y_pred_original = model.predict(X_test)
y_pred_cf = model.predict(X_test_cf)

# Calculate average change
avg_original_sales = y_pred_original.mean()
avg_cf_sales = y_pred_cf.mean()
avg_change = avg_cf_sales - avg_original_sales
percent_change = (avg_change / avg_original_sales) * 100

print("Average predicted weekly sales (original):", avg_original_sales)
print("Average predicted weekly sales (20% higher advertising):", avg_cf_sales)
print("Average change in sales:", avg_change)
print("Percentage change in sales:", percent_change)


Average predicted weekly sales (original): 912.4378379934236
Average predicted weekly sales (20% higher advertising): 1107.6061566091303
Average change in sales: 195.16831861570665
Percentage change in sales: 21.38976601900998


In [6]:
# ----- Correct Model -----
from sklearn.linear_model import LinearRegression

model_full = LinearRegression()
model_full.fit(X_train, y_train)

full_coefficients = pd.Series(
    model_full.coef_,
    index=["Advertising_Spend","Price","Competitor_Price"]
)

print("Correct Model Coefficients:")
print(full_coefficients)


Correct Model Coefficients:
Advertising_Spend     1.901454
Price               -20.089307
Competitor_Price     12.857481
dtype: float64

Misspecified Model Coefficients:
Price              -21.640021
Competitor_Price    12.043795
dtype: float64

Average Prediction (Correct Model): 912.4378379934236
Average Prediction (Misspecified Model): 884.8207033944892
Difference: -27.6171345989344


In [7]:
# ----- Misspecified Model (Advertising removed) -----

X_train_miss = X_train[["Price","Competitor_Price"]]
X_test_miss = X_test[["Price","Competitor_Price"]]

model_miss = LinearRegression()
model_miss.fit(X_train_miss, y_train)

miss_coefficients = pd.Series(
    model_miss.coef_,
    index=["Price","Competitor_Price"]
)

print("\nMisspecified Model Coefficients:")
print(miss_coefficients)



Misspecified Model Coefficients:
Price              -21.640021
Competitor_Price    12.043795
dtype: float64


In [8]:
# Predictions
pred_full = model_full.predict(X_test)
pred_miss = model_miss.predict(X_test_miss)

# Average predictions
avg_full = pred_full.mean()
avg_miss = pred_miss.mean()
difference = avg_miss - avg_full

print("\nAverage Prediction (Correct Model):", avg_full)
print("Average Prediction (Misspecified Model):", avg_miss)
print("Difference:", difference)


Average Prediction (Correct Model): 912.4378379934236
Average Prediction (Misspecified Model): 884.8207033944892
Difference: -27.6171345989344


## algorithmic accountability

In [10]:
##Random Forest
X = df[["Advertising_Spend", "Price", "Competitor_Price"]]
y = df["Weekly_Sales"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor(random_state=42)

param_grid = {
    "n_estimators": [50, 100, 200, 300]
}

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring="r2"
)

grid.fit(X_train, y_train)

print("Best number of trees:", grid.best_params_["n_estimators"])


Best number of trees: 300


In [11]:
##Train Best Model
best_rf = grid.best_estimator_

In [12]:
##Evaluate Tuned Model
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = (mean_squared_error(y_test, y_pred))**0.5
r2 = r2_score(y_test, y_pred)

print("Tuned Random Forest Performance")
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)


Tuned Random Forest Performance
MAE: 105.54421722222209
RMSE: 134.19704003088611
R2: 0.74740375890242


## robustness and stability test

In [13]:
X = df[["Advertising_Spend","Price","Competitor_Price"]]
y = df["Weekly_Sales"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Original model
rf_original = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)
rf_original.fit(X_train, y_train)

pred_original = rf_original.predict(X_test)


# Remove 5% of rows randomly
df_perturbed = df.sample(frac=0.95, random_state=1)

X_p = df_perturbed[["Advertising_Spend","Price","Competitor_Price"]]
y_p = df_perturbed["Weekly_Sales"]

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_p, y_p, test_size=0.2, random_state=42
)

rf_perturbed = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

rf_perturbed.fit(X_train_p, y_train_p)

pred_perturbed = rf_perturbed.predict(X_test)


# Compare prediction difference
diff = pred_perturbed - pred_original

avg_change = diff.mean()
rmse_change = (mean_squared_error(pred_original, pred_perturbed))**0.5

print("Average change in predictions:", avg_change)
print("RMSE between predictions:", rmse_change)


Average change in predictions: 7.500047777777727
RMSE between predictions: 76.05869982425753
