In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [4]:
data = pd.read_csv("weekly_sales_dataset.csv")

In [5]:
data.head()

Unnamed: 0,Advertising_Spend,Price,Competitor_Price,Weekly_Sales
0,559.61,43.37,57.3,1191.85
1,483.41,45.52,45.54,750.17
2,577.72,55.98,58.09,938.41
3,682.76,54.88,61.49,1360.39
4,471.9,49.83,54.89,839.55


In [7]:
data.shape

(300, 4)

#### Features and target

In [8]:
X = data[["Advertising_Spend", "Price", "Competitor_Price"]]
y = data["Weekly_Sales"]

#### Train-test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Train baseline Linear Regression model

In [10]:
model = LinearRegression()
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

#### Performance metrics

In [12]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print("Baseline Model Performance:")
print("RMSE:", rmse)
print("R2 Score:", r2)


Baseline Model Performance:
RMSE: 119.23467253853967
R2 Score: 0.8005903658927165




#### The RMSE shows that the prediction error is reasonable,So the model’s sales predictions are close to actual values.
#### The R² value indicates that the model explains a good portion of sales variation using advertising spend, price and competitor price.

#### coefficients

In [13]:
coeff_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_
})

print("\nModel Coefficients:")
print(coeff_df)
print("Intercept:", model.intercept_)


Model Coefficients:
             Feature  Coefficient
0  Advertising_Spend     1.901454
1              Price   -20.089307
2   Competitor_Price    12.857481
Intercept: 229.98048171239043


#### The coefficient values show how much weekly sales change when each variable changes by 1 unit, while keeping other variables constant.
#### A positive coefficient means sales increase with that variable (advertising), and a negative coefficient means sales decrease (price).

#### 2. Counterfactual experiment: Increase advertising_spend by 20%

In [18]:
X_counterfactual = X_test.copy()

In [19]:
#Increase Advertising Spend by 20%
X_counterfactual["Advertising_Spend"] = X_counterfactual["Advertising_Spend"] * 1.20

In [20]:
# Predict original and counterfactual sales
original_predictions = model.predict(X_test)
counterfactual_predictions = model.predict(X_counterfactual)

In [21]:
# Compare average effect
avg_original_sales = original_predictions.mean()
avg_counterfactual_sales = counterfactual_predictions.mean()

In [22]:
print("\nCounterfactual Experiment Results:")
print("Average predicted sales (original):", avg_original_sales)
print("Average predicted sales (after 20% ad increase):", avg_counterfactual_sales)
print("Average change in sales:", avg_counterfactual_sales - avg_original_sales)



Counterfactual Experiment Results:
Average predicted sales (original): 912.4378379934234
Average predicted sales (after 20% ad increase): 1107.60615660913
Average change in sales: 195.16831861570665


#### After increasing advertising spend by 20% (keeping other factors same), the model predicts an increase in average weekly sales. This change is economically reasonable, as higher advertising is expected to improve sales, so the result is possible for business decisions.

### 3. Misspecified model

#### Correct model (with all variables)

In [23]:
X_full = data[["Advertising_Spend", "Price", "Competitor_Price"]]
y = data["Weekly_Sales"]

X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.2, random_state=42)

model_full = LinearRegression()
model_full.fit(X_train, y_train)

pred_full = model_full.predict(X_test)
rmse_full = mean_squared_error(y_test, pred_full, squared=False)

print("Correct Model Coefficients:")
print(pd.Series(model_full.coef_, index=X_full.columns))
print("RMSE (correct model):", rmse_full)

Correct Model Coefficients:
Advertising_Spend     1.901454
Price               -20.089307
Competitor_Price     12.857481
dtype: float64
RMSE (correct model): 119.23467253853967




#### Misspecified model (remove Advertising_Spend)

In [24]:
X_miss = data[["Price", "Competitor_Price"]]

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_miss, y, test_size=0.2, random_state=42)

model_miss = LinearRegression()
model_miss.fit(X_train_m, y_train_m)

pred_miss = model_miss.predict(X_test_m)
rmse_miss = mean_squared_error(y_test_m, pred_miss, squared=False)

print("\nMisspecified Model Coefficients:")
print(pd.Series(model_miss.coef_, index=X_miss.columns))
print("RMSE (misspecified model):", rmse_miss)



Misspecified Model Coefficients:
Price              -21.640021
Competitor_Price    12.043795
dtype: float64
RMSE (misspecified model): 219.63113893388834




#### Compare predictions

In [25]:
print("\nAverage Prediction (correct model):", pred_full.mean())
print("Average Prediction (misspecified model):", pred_miss.mean())
print("Difference in predictions:", pred_miss.mean() - pred_full.mean())


Average Prediction (correct model): 912.4378379934234
Average Prediction (misspecified model): 884.8207033944891
Difference in predictions: -27.617134598934285


#### Removing the advertising variable changes the coefficients of price and competitor price and increases prediction error (RMSE). Predicted sales also shift, showing that the model gives different and less reliable results when an important variable is missing.

#### If advertising is removed from the model, the effect of sales may be wrongly blamed on price or competitor actions. This can lead to wrong budget decisions, such as cutting advertising even when it is actually driving sales.

## 4. Scenario to refuse deployment

#### I would refuse to deploy this parametric model if there is a sudden market change, such as a new competitor entering or a major pricing policy change. Even if performance metrics look good, the old relationships in the data may no longer be valid, and using the model could lead to wrong budget allocation.

In [32]:

# Features and target
X = data[["Advertising_Spend", "Price", "Competitor_Price"]]
y = data["Weekly_Sales"]

# Train-test split (fixed once)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


rf_original = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

rf_original.fit(X_train, y_train)
pred_original = rf_original.predict(X_test)

# Misspecified / perturbed data
# Remove 5% of rows randomly
data_perturbed = data.sample(frac=0.95, random_state=1)

X_p = data_perturbed[["Advertising_Spend", "Price", "Competitor_Price"]]
y_p = data_perturbed["Weekly_Sales"]

# Train on perturbed data but test on SAME X_test
X_train_p, _, y_train_p, _ = train_test_split(
    X_p, y_p, test_size=0.2, random_state=42
)

rf_perturbed = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

rf_perturbed.fit(X_train_p, y_train_p)

# Predict on SAME test set
pred_perturbed = rf_perturbed.predict(X_test)

# ---------------- Compare predictions ----------------
diff = pred_perturbed - pred_original

avg_change = diff.mean()
rmse_change = mean_squared_error(pred_original, pred_perturbed, squared=False)

print("Average change in predictions:", avg_change)
print("RMSE between predictions:", rmse_change)


Average change in predictions: 7.500047777777727
RMSE between predictions: 76.05869982425753




#### Removing only 5% of the data causes noticeable change in predictions, showing the model is sensitive to small data changes. This indicates hidden risk in using a flexible non-parametric model like Random Forest for business decisions