In [38]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm

In [2]:
df = pd.read_csv('weekly_sales_dataset.csv')
df.head()


Unnamed: 0,Advertising_Spend,Price,Competitor_Price,Weekly_Sales
0,559.61,43.37,57.3,1191.85
1,483.41,45.52,45.54,750.17
2,577.72,55.98,58.09,938.41
3,682.76,54.88,61.49,1360.39
4,471.9,49.83,54.89,839.55


In [3]:
X = df.drop('Weekly_Sales', axis=1)
y = df['Weekly_Sales']


In [None]:
###QUESTION 1A

In [35]:
X = df[['Advertising_Spend', 'Price', 'Competitor_Price']]
y = df['Weekly_Sales']

model = LinearRegression()
model.fit(X, y)

coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
})

coefficients


Unnamed: 0,Feature,Coefficient
0,Advertising_Spend,1.899227
1,Price,-20.12495
2,Competitor_Price,13.372099


In [36]:
X_simple = df[['Advertising_Spend', 'Price', 'Competitor_Price']]
y = df['Weekly_Sales']

simple_model = LinearRegression()
simple_model.fit(X_simple, y)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [39]:
X = df[['Advertising_Spend', 'Price', 'Competitor_Price']]
X = sm.add_constant(X)
y = df['Weekly_Sales']

ols_model = sm.OLS(y, X).fit()
print(ols_model.summary())


                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.848
Method:                 Least Squares   F-statistic:                     556.9
Date:                Sat, 31 Jan 2026   Prob (F-statistic):          2.38e-121
Time:                        10:16:13   Log-Likelihood:                -1868.3
No. Observations:                 300   AIC:                             3745.
Df Residuals:                     296   BIC:                             3760.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               200.1779     80.09

In [None]:
##QUESTION 1B

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [6]:
y_pred = model.predict(X_test)


In [7]:
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)

r2, mse, rmse, mae


(0.8005903658927166, 14216.907135372776, 119.23467253853963, 93.46242431692242)

In [8]:
pd.Series(model.coef_, index=X.columns), model.intercept_


(Advertising_Spend     1.901454
 Price               -20.089307
 Competitor_Price     12.857481
 dtype: float64,
 229.98048171239145)

'''The model explains about 80% of the changes in weekly sales, which means it captures most of what drives sales.
When advertising spend increases by ₹1, weekly sales increase by about 2 units.
When the product price increases by ₹1, weekly sales drop by about 20 units, showing customers are highly price-sensitive.
When competitors increase their price by ₹1, our weekly sales increase by about 13 units, as customers shift towards our product.
On average, the model’s sales predictions differ from actual sales by around 90–120 units per week, which is reasonable given that weekly sales are typically in the range of several hundred to over a thousand units.
Overall, this model gives a clear and measurable understanding of how pricing and advertising affect sales and serves as a reliable starting point for further analysis.'''


In [12]:
#Counterfactual Experiment

In [14]:
X_test_cf = X_test.copy()
X_test_cf['Advertising_Spend'] = X_test_cf['Advertising_Spend'] * 1.20


In [15]:
y_pred_original = model.predict(X_test)
y_pred_cf = model.predict(X_test_cf)


In [16]:
sales_change = y_pred_cf - y_pred_original
sales_change.mean(), sales_change.min(), sales_change.max()


(195.16831861570665, 104.93366072943502, 302.5556160610706)

In [17]:
percentage_change = (sales_change / y_pred_original) * 100
percentage_change.mean()


22.099521245704942

#Advertising spend was increased by 20%, keeping all other factors constant.
#Weekly sales increased by ~195 units on average.
#The increase ranged from ~105 to ~303 units, depending on the case.
#This corresponds to an average sales increase of ~22%.
#The change is economically plausible, as the sales increase is roughly proportional to the increase in advertising spend and not unrealistically large.

In [None]:
#MISSPECIFICATION

In [18]:
X_miss = df.drop(['Weekly_Sales', 'Price'], axis=1)
y = df['Weekly_Sales']

In [19]:
X_train_m, X_test_m, y_train, y_test = train_test_split(
    X_miss, y, test_size=0.2, random_state=42
)

In [20]:
model_miss = LinearRegression()
model_miss.fit(X_train_m, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [21]:
y_pred_miss = model_miss.predict(X_test_m)

r2_miss = r2_score(y_test, y_pred_miss)
rmse_miss = mean_squared_error(y_test, y_pred_miss) ** 0.5
r2_miss, rmse_miss

(0.47019899294151213, 194.35059956054076)

In [22]:
pd.Series(model_miss.coef_, index=X_miss.columns), model_miss.intercept_

(Advertising_Spend     1.966493
 Competitor_Price     14.628596
 dtype: float64,
 -908.0394508277429)

#Scenario to Refuse Deployment
#This model should not be deployed if the business plans to use it for pricing or advertising decisions during a major market change—for example, a sudden price hike, heavy discount campaign, new competitor entry, or regulatory change.
#Even if the metrics look strong, the model is built on past, stable relationships. In a changing market, customer behaviour may no longer respond to price or advertising in the same way. Deploying the model in such a situation risks making confident but wrong decisions, such as increasing advertising when customers are actually reacting to price shocks or external factors the model cannot see.

In [None]:
#QUESTION 2 (A)

In [33]:
median_sales = df["Weekly_Sales"].median()
df["High_Sales"] = (df["Weekly_Sales"] > median_sales).astype(int)


X = df[["Advertising_Spend", "Price", "Competitor_Price"]]
y = df["High_Sales"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

deep_tree = DecisionTreeClassifier(max_depth=None, random_state=42)
deep_tree.fit(X_train, y_train)

deep_train_acc = accuracy_score(y_train, deep_tree.predict(X_train))
deep_test_acc = accuracy_score(y_test, deep_tree.predict(X_test))

shallow_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
shallow_tree.fit(X_train, y_train)

shallow_train_acc = accuracy_score(y_train, shallow_tree.predict(X_train))
shallow_test_acc = accuracy_score(y_test, shallow_tree.predict(X_test))

deep_train_acc, deep_test_acc, shallow_train_acc, shallow_test_acc


(1.0, 0.84, 0.8488888888888889, 0.7866666666666666)

'''
Model fit collapsed
R² fell from 0.80 → 0.47
More than half of the model’s explanatory power is lost
Prediction error increased sharply

RMSE rose from 119 → 194 units
→Predictions are now wrong by ~75 more units per week

Coefficients became distorted
Advertising_Spend: 1.90 → 1.97
Competitor_Price: 12.86 → 14.63 
Intercept turned −908 '''

'''Business risk created

Sales forecasts become unreliable
Advertising looks more powerful than it really is
Leads to overspending on ads
Price decisions are made without understanding demand loss
High risk of wrong growth and profit decisions'''

In [None]:
##question 2(b)

In [24]:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=600, n_features=6, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train, y_train)

accuracy_score(y_train, tree.predict(X_train)), accuracy_score(y_test, tree.predict(X_test))


(0.9422222222222222, 0.9)

In [26]:
X = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

tree = DecisionTreeRegressor(max_depth=3, random_state=42)
tree.fit(X_train, y_train)

pred_tree = tree.predict(X_test)
mae_tree = mean_absolute_error(y_test, pred_tree)

mae_tree


158.84056400687945

#MAE ≈ 158.8 - On average, weekly sales predictions are off by about 159 units.
#The decision tree captures non-linear patterns, but its errors are still fairly large. This suggests the model is learning rules that fit the data unevenly.

In [28]:
perturbed_df = df.sample(frac=0.95, random_state=1)

Xp = perturbed_df.drop("Weekly_Sales", axis=1)
yp = perturbed_df["Weekly_Sales"]

Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    Xp, yp, test_size=0.2, random_state=42
)

tree_perturbed = DecisionTreeRegressor(max_depth=3, random_state=42)
tree_perturbed.fit(Xp_train, yp_train)

pred_tree_perturbed = tree_perturbed.predict(X_test)

avg_prediction_change = abs(pred_tree - pred_tree_perturbed).mean()
avg_prediction_change


109.56199693502386

'''The decision tree was first trained on the original dataset and used to generate baseline predictions.
Next, 5% of the data was removed, and the model was retrained from scratch using the same hyperparameters.
Predictions from the original model and the retrained model were then compared on the same test set.
The average difference between the two sets of predictions was approximately 110 units.
This means that even without changing the model structure, a small change in the training data led to material changes in predicted sales.
This clearly indicates that the decision tree is data-sensitive and unstable, as its decision rules shift when the training sample changes slightly.''

In [29]:
lr = LinearRegression()
lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)
mae_lr = mean_absolute_error(y_test, pred_lr)

mae_lr


93.46242431692242

'''Predictive stability:
The decision tree shows low stability. After retraining on a dataset with 5% of rows removed, predictions changed by about 110 units on average, indicating high sensitivity to small data changes.
Linear regression is more stable, as small data perturbations lead to minimal changes in predictions.

Interpretability:
Decision tree rules and split points change when the model is retrained, making explanations inconsistent over time.
Linear regression provides clear and consistent interpretation through stable coefficients with direct economic meaning.''

##4th answer 
#Even though the decision tree can adapt to complex patterns, its predictions change noticeably when the data changes slightly. In a regulated business, this is risky because the same situation could lead to different outcomes at different times, making decisions hard to justify during audits. Linear regression behaves more consistently and is easier to explain, which makes it safer and more reliable for regulatory and governance needs.