In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Location of the input CSV. The default is the path this notebook was
# originally run with; set the WEEKLY_SALES_CSV environment variable to point
# at the file on another machine instead of editing the notebook.
import os

DATA_PATH = os.environ.get(
    'WEEKLY_SALES_CSV',
    'C:\\Users\\DELL\\exam\\weekly_sales_dataset.csv',
)

# Load the weekly sales data into a DataFrame used by every later cell.
weekly_sales = pd.read_csv(DATA_PATH)

In [12]:
# Preview the first five rows to check that the columns were parsed as expected.
weekly_sales.head(n=5)

Unnamed: 0,Advertising_Spend,Price,Competitor_Price,Weekly_Sales
0,559.61,43.37,57.3,1191.85
1,483.41,45.52,45.54,750.17
2,577.72,55.98,58.09,938.41
3,682.76,54.88,61.49,1360.39
4,471.9,49.83,54.89,839.55


In [29]:
# Display the loaded DataFrame as the cell's last expression.
# NOTE(review): the saved output under this cell is a single scalar, which does
# not match this expression — re-run the cell to refresh the output.
weekly_sales

0.8005903658927165

In [31]:

# Target is the weekly sales figure; every remaining column is a predictor.
y = weekly_sales['Weekly_Sales']
X = weekly_sales.drop(columns=['Weekly_Sales'])

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [30]:

# Baseline parametric model: a linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [16]:

# Score the baseline model on the held-out rows.
y_pred = model.predict(X_test)

# MAE: mean absolute error; RMSE: square root of the mean squared error;
# R²: coefficient of determination.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# One print call, one report line per argument.
print(
    "Baseline Linear Regression Performance",
    "-------------------------------------",
    f"MAE  : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.3f}",
    sep="\n",
)


Baseline Linear Regression Performance
-------------------------------------
MAE  : 93.46
RMSE : 119.23
R²   : 0.801



"""
Interpretation:
- MAE represents the average error in predicting weekly sales.
- RMSE highlights the risk of large forecasting errors, which is
  important for budget allocation.
- R² indicates how much variation in weekly sales is explained
  by the model.

These metrics confirm whether the model is reliable enough
for business decision-making.
"""


In [18]:

# The counterfactual is applied to the first predictor column of X.
feature_name = X.columns[0]

# What-if scenario: scale that one feature up by 10% on a copy of the test
# rows, leaving every other column untouched.
X_counterfactual = X_test.assign(
    **{feature_name: X_test[feature_name] * 1.10}
)

# Predictions under the counterfactual inputs.
y_cf_pred = model.predict(X_counterfactual)

# Side-by-side view of the first five predictions, before and after.
counterfactual_results = pd.DataFrame({
    'Original_Prediction': y_pred[:5],
    'Counterfactual_Prediction': y_cf_pred[:5],
})

counterfactual_results


Unnamed: 0,Original_Prediction,Counterfactual_Prediction
0,953.22914,1072.34765
1,1570.477904,1702.798211
2,760.3607,839.917551
3,1135.797541,1243.250629
4,821.777334,901.982581



Counterfactual Interpretation:
This analysis evaluates a "what-if" scenario where a key input
variable increases by 10%.

The difference between original and counterfactual predictions
shows the directional impact on weekly sales.
These insights support scenario planning but should not be
treated as exact causal effects.



In [20]:
# Print the column labels of the predictor frame X.
print(X.columns)


Index(['Advertising_Spend', 'Price', 'Competitor_Price'], dtype='object')


In [24]:
# Counterfactual: 20% increase in Advertising_Spend, all other columns as-is.
X_cf = X_test.assign(Advertising_Spend=X_test['Advertising_Spend'] * 1.20)

# Model predictions under the counterfactual inputs.
y_cf_pred = model.predict(X_cf)

# Mean prediction before and after the change.
original_avg = np.mean(y_pred)
counterfactual_avg = np.mean(y_cf_pred)

# Absolute and relative shift of the mean prediction.
change = counterfactual_avg - original_avg
percent_change = (change / original_avg) * 100

print(
    f"Original average predicted sales     : {original_avg:.2f}",
    f"Counterfactual average predicted sales: {counterfactual_avg:.2f}",
    f"Change in predicted sales             : {change:.2f}",
    f"Percentage change                     : {percent_change:.2f}%",
    sep="\n",
)


Original average predicted sales     : 912.44
Counterfactual average predicted sales: 1107.61
Change in predicted sales             : 195.17
Percentage change                     : 21.39%


Counterfactual Analysis Interpretation:

A counterfactual experiment was conducted by increasing Advertising_Spend by 20% while keeping all other features constant. The model predicts an increase in weekly sales under this scenario. This change is economically plausible, as higher advertising expenditure generally improves product visibility and demand. The magnitude of the predicted increase appears reasonable and proportional, suggesting that the model’s response aligns with basic economic intuition. However, the result should be interpreted as a directional insight, as the linear model does not capture diminishing returns to advertising.

In [25]:
# Baseline model coefficients
baseline_coeffs = pd.Series(
    model.coef_,
    index=X.columns,
    name="Baseline_Coefficient"
)

baseline_coeffs


Advertising_Spend     1.901454
Price               -20.089307
Competitor_Price     12.857481
Name: Baseline_Coefficient, dtype: float64

In [26]:
# Create misspecified feature set
X_miss = weekly_sales.drop(columns=['Weekly_Sales', 'Advertising_Spend'])
y_miss = weekly_sales['Weekly_Sales']

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_miss, y_miss, test_size=0.2, random_state=42
)

# Train misspecified model
miss_model = LinearRegression()
miss_model.fit(X_train_m, y_train_m)


In [27]:
# Coefficients of the misspecified model, labelled by predictor name.
miss_coeffs = pd.Series(
    data=miss_model.coef_,
    index=X_miss.columns,
    name="Misspecified_Coefficient",
)

# Place both coefficient sets side by side, aligned on predictor name; a
# predictor absent from one model shows up as a missing value in its column.
coef_comparison = pd.concat(
    objs=[baseline_coeffs, miss_coeffs],
    axis='columns',
)
coef_comparison


Unnamed: 0,Baseline_Coefficient,Misspecified_Coefficient
Advertising_Spend,1.901454,
Price,-20.089307,-21.640021
Competitor_Price,12.857481,12.043795


In [28]:
# Predict with each model on its own version of the held-out rows.
y_pred_baseline = model.predict(X_test)
y_pred_miss = miss_model.predict(X_test_m)

# Mean prediction of each model, and how far the misspecified one is from
# the baseline (negative means the misspecified model predicts lower).
baseline_avg = np.mean(y_pred_baseline)
miss_avg = np.mean(y_pred_miss)
prediction_diff = miss_avg - baseline_avg

print(
    f"Baseline average prediction     : {baseline_avg:.2f}",
    f"Misspecified average prediction : {miss_avg:.2f}",
    f"Difference in predictions       : {prediction_diff:.2f}",
    sep="\n",
)


Baseline average prediction     : 912.44
Misspecified average prediction : 884.82
Difference in predictions       : -27.62


This misspecification creates significant business risk. By ignoring the impact of advertising, the model may underestimate or misinterpret the true drivers of sales, leading to poor budget allocation decisions. Marketing investments could be reduced unjustifiably, or other departments may be overfunded based on distorted signals. In a budget planning context, such errors can result in inefficient resource allocation and lost revenue opportunities.

### Scenario to Refuse Deployment
I would refuse to deploy this parametric (linear regression) model if there is evidence of a structural break or major business change, such as a new pricing strategy, product launch, or market disruption, that invalidates historical relationships. Even if performance metrics look good on past data, the model’s core assumption of stable linear relationships would no longer hold. Deploying the model in such a situation could lead to misleading sales forecasts and poor budget allocation decisions, creating significant business risk.


#### Question 2

In [33]:



# Features and target: predict Weekly_Sales from every other column.
y = weekly_sales['Weekly_Sales']
X = weekly_sales.drop(columns=['Weekly_Sales'])

# 80/20 split with a fixed seed so the held-out rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Non-parametric model: a 100-tree random forest with a fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X=X_train, y=y_train)

# Score the forest on the held-out rows.
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(
    "Random Forest Performance Metrics",
    "---------------------------------",
    f"MAE  : {mae:.2f}",
    f"RMSE : {rmse:.2f}",
    f"R²   : {r2:.3f}",
    sep="\n",
)

# Feature importances reported by the fitted forest, largest first.
importances = pd.Series(data=rf_model.feature_importances_, index=X.columns)
importances.sort_values(ascending=False)


Random Forest Performance Metrics
---------------------------------
MAE  : 108.13
RMSE : 137.15
R²   : 0.736


Advertising_Spend    0.597122
Price                0.286675
Competitor_Price     0.116203
dtype: float64

In [34]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: forest size and maximum tree depth
# (None leaves the depth unrestricted).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
}

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean squared error.
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X_train, y_train)

# Keep the best estimator found by the search for later cells.
best_rf = grid.best_estimator_
print("Best parameters:", grid.best_params_)


Best parameters: {'max_depth': 10, 'n_estimators': 200}


In [35]:
# Reference point for the stability test: test-set predictions and RMSE of
# the tuned forest.
y_pred_original = best_rf.predict(X_test)

baseline_rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_pred_original)
)
print("Baseline RMSE:", baseline_rmse)


Baseline RMSE: 133.7937277972218


In [36]:
# Remove 5% of training data
perturbed_data = weekly_sales.sample(frac=0.95, random_state=42)

X_p = perturbed_data.drop(columns=['Weekly_Sales'])
y_p = perturbed_data['Weekly_Sales']

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_p, y_p, test_size=0.2, random_state=42
)


In [37]:
# Re-fit a forest with the tuned hyperparameters on the perturbed training rows.
perturbed_rf = RandomForestRegressor(
    n_estimators=grid.best_params_['n_estimators'],
    max_depth=grid.best_params_['max_depth'],
    random_state=42,
).fit(X_train_p, y_train_p)

# Score against the original held-out split so the result is comparable with
# the baseline RMSE.
y_pred_perturbed = perturbed_rf.predict(X_test)

perturbed_rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_pred_perturbed)
)
print("Perturbed RMSE:", perturbed_rmse)


Perturbed RMSE: 75.03126661920118


In [38]:
# Mean absolute difference between the two models' predictions on the same
# held-out rows.
prediction_change = np.abs(y_pred_original - y_pred_perturbed).mean()

print("Average absolute prediction change:", prediction_change)


Average absolute prediction change: 64.59075866056618


### Stability Test Interpretation

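After removing 5% of the data, the Random Forest model was retrained.
In the outputs shown above, RMSE moved from about 133.8 (baseline) to about 75.0 (perturbed), and predictions changed by about 64.6 on average.
These changes are large relative to the baseline error, so this run does not, on its own, indicate that the model is stable and robust to minor data perturbations.

A lower RMSE after discarding data is also a warning sign that rows from the original test set may have been present in the perturbed training data. The conclusion that the non-parametric model does not overly depend on a few observations only holds if the 5% is removed from the training rows alone and the RMSE and predictions then stay close to the baseline.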


#### Model Comparison: Random Forest vs Linear Regression

**Predictive Stability**
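- Random Forest predictions changed by about 64.6 on average (mean absolute difference) after the data perturbation; the same test was not run for Linear Regression, so the two models cannot be compared directly on stability here.
- Linear Regression is expected to be more sensitive to influential observations due to its strict parametric assumptions, but this was not measured in this notebook.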

**Interpretability**
- Linear Regression is highly interpretable using coefficients.
- Random Forest provides feature importance but lacks clear causal interpretation.


###  Deployment Recommendation for a Regulated Business

For a regulated business environment, I would recommend deploying the **Linear Regression model**.

Although the Random Forest model can capture non-linear effects, it did not outperform Linear Regression on the held-out test set in this notebook (R² 0.736 vs 0.801),
and its lack of transparency makes it difficult to explain decisions to regulators.

Linear Regression provides:
- Clear coefficient-based explanations
- Easier auditability
- Better compliance with model governance requirements

The Random Forest model can be used internally as a benchmarking or stress-testing tool,
but not as the primary decision-making system in a regulated setting.
