In [47]:
# parametric Algorithms
# A)  Model Design Under Constraints

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [8]:
df = pd.read_csv(r"E:/Big data exam/weekly_sales_dataset.csv")



In [9]:
df.head()


Unnamed: 0,Advertising_Spend,Price,Competitor_Price,Weekly_Sales
0,559.61,43.37,57.3,1191.85
1,483.41,45.52,45.54,750.17
2,577.72,55.98,58.09,938.41
3,682.76,54.88,61.49,1360.39
4,471.9,49.83,54.89,839.55


In [10]:
X = df[['Advertising_Spend', 'Price', 'Competitor_Price']]
y = df['Weekly_Sales']


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [12]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


In [15]:
y_pred = model.predict(X_test)


In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2  :", r2_score(y_test, y_pred))


MAE : 93.46242431692248
RMSE: 119.23467253853967
R2  : 0.8005903658927165


In [None]:
# A baseline linear regression model was trained to predict weekly sales.
# MAE and RMSE show the average prediction error, while R² indicates how well the model explains sales variation.
# This baseline model serves as a reference for further model risk and counterfactual analysis.

In [17]:
# 2. Perform a counterfactual exp

In [None]:
#A counterfactual scenario is created by increasing advertising spend by 20% while holding other variables constant.

In [18]:
X_counterfactual = X_test.copy()
X_counterfactual['Advertising_Spend'] = X_counterfactual['Advertising_Spend'] * 1.20


In [19]:
y_pred_cf = model.predict(X_counterfactual)


In [20]:
avg_baseline_sales = y_pred.mean()
avg_counterfactual_sales = y_pred_cf.mean()

print("Average Baseline Predicted Sales :", avg_baseline_sales)
print("Average Counterfactual Predicted Sales :", avg_counterfactual_sales)
print("Change in Sales :", avg_counterfactual_sales - avg_baseline_sales)


Average Baseline Predicted Sales : 912.4378379934234
Average Counterfactual Predicted Sales : 1107.60615660913
Change in Sales : 195.16831861570665


In [None]:
#The increase in predicted sales after a 20% rise in advertising spend is economically plausible, as advertising is expected to positively influence consumer awareness and demand.
#Since other factors are held constant, the observed change in sales can be directly attributed to advertising spend.
# The magnitude of the increase is reasonable and does not indicate unrealistic or unstable model behavior, making the counterfactual outcome suitable for managerial decision-making.

In [26]:
#3rd introduce one intentional misspecification (Eg remove a key variable or add noice ) and:
#1. Quantify the impact on coefficients and predictions 
#2. Explain the business risk created by this misspecification

In [None]:
# Intentional Model Misspecification (5 Marks)
#Chosen Misspecification
#Remove a key variable: Advertising_Spend

In [27]:
X_miss = df[['Price', 'Competitor_Price']]
y = df['Weekly_Sales']


In [28]:
from sklearn.model_selection import train_test_split

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_miss, y, test_size=0.2, random_state=42
)


In [29]:
from sklearn.linear_model import LinearRegression

model_miss = LinearRegression()
model_miss.fit(X_train_m, y_train_m)


In [30]:
print("Baseline coefficients:")
print(model.coef_)

print("\nMisspecified model coefficients:")
print(model_miss.coef_)


Baseline coefficients:
[  1.90145437 -20.08930699  12.85748125]

Misspecified model coefficients:
[-21.64002089  12.04379481]


In [None]:
#Removing advertising spend changes the magnitude and interpretation of remaining coefficients, as the model reallocates its explanatory power to price variables.

In [31]:
y_pred_baseline = model.predict(X_test)
y_pred_miss = model_miss.predict(X_test_m)

print("Average baseline prediction:", y_pred_baseline.mean())
print("Average misspecified prediction:", y_pred_miss.mean())
print("Prediction difference:", y_pred_miss.mean() - y_pred_baseline.mean())


Average baseline prediction: 912.4378379934234
Average misspecified prediction: 884.8207033944891
Prediction difference: -27.617134598934285


In [None]:
#By omitting advertising spend, the model becomes misspecified and produces biased coefficients and predictions.
# This creates business risk by underestimating the impact of marketing investments, potentially leading to reduced advertising budgets and lower future sales.
# Decisions based on this model could misallocate budgets and negatively affect revenue planning.

In [32]:
# 4th .
#I would refuse to deploy this parametric linear regression model if there is a major structural change in the market, such as a sudden shift in consumer behavior, pricing policy, or competitive landscape.
# Even if performance metrics appear strong on historical data, the model coefficients would no longer represent current relationships.
# Deploying such a model could lead to misleading sales forecasts and incorrect budget allocation decisions, creating significant business risk.

In [None]:
#Robustness & Stability Testing
#1. Train a Non-Parametric Model and Tune One Hyperparameter

In [None]:
#Select the Non-Parametric Model
#I  use Decision Tree Regression because it can model non-linear relationships without assuming a fixed functional form.

In [33]:
X = df[['Advertising_Spend', 'Price', 'Competitor_Price']]
y = df['Weekly_Sales']


In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [35]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


In [36]:
#Tune One Hyperparameter  max_depth
from sklearn.metrics import mean_squared_error
import numpy as np

depths = [2, 3, 4, 5, 6]
rmse_scores = []

for d in depths:
    model = DecisionTreeRegressor(max_depth=d, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    rmse_scores.append(rmse)

for d, r in zip(depths, rmse_scores):
    print(f"Max Depth: {d}, RMSE: {r:.2f}")


Max Depth: 2, RMSE: 228.15
Max Depth: 3, RMSE: 196.46
Max Depth: 4, RMSE: 181.66
Max Depth: 5, RMSE: 189.30
Max Depth: 6, RMSE: 194.58


In [37]:
best_depth = depths[np.argmin(rmse_scores)]
best_model = DecisionTreeRegressor(max_depth=best_depth, random_state=42)
best_model.fit(X_train, y_train)

print("Selected max_depth:", best_depth)


Selected max_depth: 4


In [None]:
#Explanation

#The Decision Tree Regression model was trained using different values of the hyperparameter max_depth to control model complexity. For each depth value, model performance was evaluated using RMSE on the test dataset. 
#This approach allows assessment of how increasing model flexibility affects prediction error. The depth that produced the lowest RMSE was selected as the optimal configuration.

#Interpretation
# The results show that model performance improves as tree depth increases from 2 to 4, indicating that shallow trees underfit the data. At max_depth = 4, the RMSE reaches its minimum value (≈181.66), 
# suggesting the best balance between capturing meaningful patterns and avoiding overfitting. Beyond this point, increasing depth leads to higher RMSE, indicating that the model starts fitting noise rather than true signal. Therefore, a tree depth of 4 provides the most robust and stable model for business use, as it delivers reliable predictions without excessive sensitivity to data fluctuations.

In [39]:
#Stability Test of Non-Parametric Model

In [40]:
# Original predictions
y_pred_original = best_model.predict(X_test)


In [41]:
# Remove 5% of rows
df_perturbed = df.sample(frac=0.95, random_state=42)

# Add small noise
noise = np.random.normal(0, 1, df_perturbed[['Advertising_Spend','Price','Competitor_Price']].shape)

df_perturbed[['Advertising_Spend','Price','Competitor_Price']] += noise


In [42]:
X_p = df_perturbed[['Advertising_Spend', 'Price', 'Competitor_Price']]
y_p = df_perturbed['Weekly_Sales']

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_p, y_p, test_size=0.2, random_state=42
)


In [43]:
perturbed_model = DecisionTreeRegressor(max_depth=4, random_state=42)
perturbed_model.fit(X_train_p, y_train_p)

y_pred_perturbed = perturbed_model.predict(X_test_p)


In [44]:
print("Average original prediction:", y_pred_original.mean())
print("Average perturbed prediction:", y_pred_perturbed.mean())
print("Difference:", y_pred_perturbed.mean() - y_pred_original.mean())


Average original prediction: 915.0257622742668
Average perturbed prediction: 806.8286254249748
Difference: -108.19713684929195


In [None]:
# Explanation
# A stability test was conducted by slightly perturbing the dataset through removal of 5% of observations and adding small random noise to the feature values. The decision tree model with the same tuned hyperparameter (max_depth = 4) was then retrained on the perturbed data. Predictions from the original model and the retrained model were compared using average predicted weekly sales.

# Interpretation
#The average predicted weekly sales decreased from approximately 915 in the original model to 807 after data perturbation, resulting in a difference of about −108 units.
#This indicates that the model is moderately sensitive to small changes in the training data. While the direction of predictions remains consistent, the magnitude of change suggests that the decision tree still relies on specific data patterns. From a business perspective, this level of sensitivity implies that forecasts may shift when data quality or composition changes, highlighting the need for cautious deployment and regular monitoring.

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)


In [46]:
# Linear regression retrain on perturbed data
lr_model_p = LinearRegression()
lr_model_p.fit(X_train_p, y_train_p)
lr_pred_p = lr_model_p.predict(X_test_p)

print("Linear Regression prediction change:",
      lr_pred_p.mean() - lr_pred.mean())

print("Decision Tree prediction change:",
      y_pred_perturbed.mean() - y_pred_original.mean())


Linear Regression prediction change: -101.60060093406821
Decision Tree prediction change: -108.19713684929195


In [None]:
#Predictive Stability:
#Linear regression shows a smaller prediction change (−101.6) compared to the decision tree (−108.2), indicating higher stability under data perturbation.

#Interpretability:
# Linear regression is more interpretable because coefficients directly show feature impact, while decision trees are harder to explain due to complex split rules.

In [None]:
#4. Deployment Recommendation for a Regulated Business
# For a regulated business, linear regression should be deployed instead of the non-parametric decision tree model. Although the decision tree may achieve slightly higher flexibility, linear regression offers superior interpretability, stability, and transparency, which are critical requirements in regulated environments. The ability to clearly explain model behavior and justify predictions to auditors and regulators outweighs marginal gains in predictive accuracy.