In [6]:

# a) model design under constraints 
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_csv("D:\\bigg data\\weekly_sales_dataset.csv")

print(df.columns) 

X = df.drop('Weekly_Sales', axis=1)
y = df['Weekly_Sales']

model = LinearRegression()
model.fit(X, y)


Index(['Advertising_Spend', 'Price', 'Competitor_Price', 'Weekly_Sales'], dtype='object')


In [5]:
coefficients = pd.DataFrame({
    "Feature": X.columns,
    "Impact_on_Sales": model.coef_
})

coefficients


Unnamed: 0,Feature,Impact_on_Sales
0,Advertising_Spend,1.899227
1,Price,-20.12495
2,Competitor_Price,13.372099


In [7]:

#Train a Baseline Model & Report Performance

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split


df = pd.read_csv("D:\\bigg data\\weekly_sales_dataset.csv")

# Define features and target
X = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Performance metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2
#Interpretation : The baseline Linear Regression model provides stable and understandable predictions, 
#making it suitable for business budgeting decisions where consistency is more important than extreme precision.



(93.46242431692248, 0.8005903658927165)

In [8]:
#Counterfactual Experiment

X_counterfactual = X_test.copy()

# Increase advertising spend by 20%
X_counterfactual["Advertising_Spend"] = (
    X_counterfactual["Advertising_Spend"] * 1.20
)

# Predict sales under counterfactual scenario
baseline_pred = model.predict(X_test)
counterfactual_pred = model.predict(X_counterfactual)

# Average change in sales
avg_change = (counterfactual_pred - baseline_pred).mean()
avg_change
#A moderate increase in advertising budget resulting in higher sales is economically reasonable.
#The model does not show unrealistically large jumps, indicating stable and sensible behavior.

np.float64(195.1683186157067)

In [10]:
#Intentional Model Misspecification
# Remove a key variable
X_misspecified = X.drop("Advertising_Spend", axis=1)

# Train-test split again
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_misspecified, y, test_size=0.2, random_state=42
)

# Train misspecified model
model_miss = LinearRegression()
model_miss.fit(X_train_m, y_train_m)

# Predictions
y_pred_miss = model_miss.predict(X_test_m)

# Performance comparison
mae_miss = mean_absolute_error(y_test_m, y_pred_miss)
r2_miss = r2_score(y_test_m, y_pred_miss)

mae_miss, r2_miss
#Removing an important variable reduces the model’s accuracy and forces other variables to behave unrealistically.
#As a result, the sales predictions become less reliable.
#The model may underestimate the value of advertising and suggest lower budgets than required.
#This can lead to lost sales and reduced trust in data-driven decisions.

(175.69033044243196, 0.32340491694348117)

In [13]:
#Question 2: Non-Parametric Algorithms 
#a) Algorithmic Accountability 
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load dataset
df = pd.read_csv("D:\\bigg data\\weekly_sales_dataset.csv")

# Define features and target
X = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train non-parametric model
model = DecisionTreeRegressor(max_depth=4)
model.fit(X_train, y_train)

# Predictions and performance
y_pred = model.predict(X_test)
mean_absolute_error(y_test, y_pred)

#Interpretation:
#The notebook output demonstrates that the Decision Tree can achieve reasonable accuracy, 
#but its flexibility requires careful control (such as limiting tree depth) to reduce business risk and maintain accountability.


149.3580598999215

In [15]:
#Train the Non-Parametric Model & Tune One Hyperparameter

# Features and target
X = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tune hyperparameter: max_depth
tree_model = DecisionTreeRegressor(max_depth=4)
tree_model.fit(X_train, y_train)

# Predictions
tree_pred = tree_model.predict(X_test)

# Performance
tree_mae = mean_absolute_error(y_test, tree_pred)
tree_mae
#The Decision Tree model is trained with a controlled depth to reduce overfitting.
#Tuning the max_depth parameter balances flexibility and stability, which is important for business us

149.3580598999215

In [19]:
#Stability Test
#a) Slightly Perturb the Data (Remove 5% Rows)
# Remove 5% of rows to simulate data perturbation
df_perturbed = df.sample(frac=0.95, random_state=42)

X_p = df_perturbed.drop("Weekly_Sales", axis=1)
y_p = df_perturbed["Weekly_Sales"]

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_p, y_p, test_size=0.2, random_state=42
)
#b) Retrain the Model on Perturbed Data
tree_model_perturbed = DecisionTreeRegressor(max_depth=4)
tree_model_perturbed.fit(X_train_p, y_train_p)

tree_pred_perturbed = tree_model_perturbed.predict(X_test_p)
#c) Compare Predictions Before and After
# Compare average predictions
avg_original = tree_pred.mean()
avg_perturbed = tree_pred_perturbed.mean()

#avg_original, avg_perturbed
#After a small change in data, the predictions show noticeable variation.
#This indicates that the non-parametric model is sensitive to data changes, which raises stability concerns for business deployment.

(np.float64(915.0257622742668), np.float64(779.0169326904752))

In [20]:
#Comparison with Linear Regression
# Train Linear Regression model
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

lin_pred = lin_model.predict(X_test)

lin_mae = mean_absolute_error(y_test, lin_pred)
lin_mae
#a) Predictive Stability:
#Linear Regression shows more stable predictions when data changes, while the Decision Tree reacts more strongly to small perturbations.

#b) Interpretability:
#Linear Regression is easier to explain since coefficients directly show impact.
#Decision Trees are harder to interpret as complexity increases.

93.46242431692248