In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_absolute_error, r2_score

# Load the datasets
train_df = pd.read_csv('/mnt/data/train.csv')
features_df = pd.read_csv('/mnt/data/features.csv')
stores_df = pd.read_csv('/mnt/data/stores.csv')
test_df = pd.read_csv('/mnt/data/test.csv')

# Merge the datasets.
# Join the feature table on every key column it shares with the training
# data. If both tables carry 'date', joining on 'store' alone pairs each
# training row with every feature row of that store (row explosion) and
# leaves 'date_x'/'date_y' instead of the 'date' column used further down.
# When 'date' is not shared this is the same store-only join as before.
feature_keys = [
    key for key in ('store', 'date')
    if key in train_df.columns and key in features_df.columns
]
# Non-key columns present in both tables would be renamed with _x/_y
# suffixes by merge; keep the training copy so later lookups by the plain
# column name still resolve.
duplicate_cols = [
    col for col in features_df.columns
    if col in train_df.columns and col not in feature_keys
]
train_merged = train_df.merge(
    features_df.drop(columns=duplicate_cols), on=feature_keys, how='left'
)
train_merged = train_merged.merge(stores_df, on='store', how='left')

# Check for missing values and fill them
print(f"Missing values in merged data: \n{train_merged.isnull().sum()}")
# Only numeric columns have a mean. Without numeric_only=True recent pandas
# raises TypeError as soon as the frame contains a string column.
train_merged.fillna(train_merged.mean(numeric_only=True), inplace=True)

# Feature Engineering: Convert the 'date' to datetime and create 'week_of_year' feature
train_merged['date'] = pd.to_datetime(train_merged['date'])
train_merged['week_of_year'] = train_merged['date'].dt.isocalendar().week

# Encoding categorical features (e.g., 'store_type' and 'holiday')
from sklearn.preprocessing import LabelEncoder

# One encoder per column: refitting a shared encoder would discard the
# first column's label mapping, making inverse_transform impossible for it.
le = LabelEncoder()
train_merged['store_type_encoded'] = le.fit_transform(train_merged['store_type'])

# 'holiday_encoded' is required below (feature list) and by the weighted
# error metric, so it must exist even when the raw 'holiday' column is absent.
if 'holiday' in train_merged.columns:
    holiday_le = LabelEncoder()
    train_merged['holiday_encoded'] = holiday_le.fit_transform(train_merged['holiday'])
else:
    # Fallback: treat every row as a non-holiday week instead of failing
    # later with a KeyError on the missing column.
    print("Column 'holiday' not found; treating all rows as non-holiday weeks.")
    train_merged['holiday_encoded'] = 0

# Define features and target variable
features = ['store_type_encoded', 'week_of_year', 'temperature', 'fuel_price', 'CPI', 'unemployment', 'holiday_encoded']
X = train_merged[features]
y = train_merged['sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree regressor on the training portion
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Unweighted error metrics: MAE and R-squared
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

# Weighted MAE: rows whose holiday_encoded equals 1 count five times, all others once
weights = X_test['holiday_encoded'].eq(1).map({True: 5, False: 1})
abs_errors = (y_pred - y_test).abs()
wmae = (weights * abs_errors).sum() / weights.sum()
print(f"Weighted Mean Absolute Error (WMAE): {wmae}")

# Line plot of actual vs. predicted sales over the test rows (in test-set order)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='True Sales', color='blue')
ax.plot(y_pred, label='Predicted Sales', color='red', linestyle='dashed')
ax.set_title('True vs Predicted Sales')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

# Boolean selectors for the two groups of test rows
holiday_mask = X_test['holiday_encoded'].eq(1)
non_holiday_mask = X_test['holiday_encoded'].eq(0)


def _masked_wmae(mask):
    """Return the weighted mean absolute error over the test rows selected by mask."""
    group_weights = weights[mask]
    group_errors = abs(y_pred[mask] - y_test[mask])
    return (group_weights * group_errors).sum() / group_weights.sum()


# WMAE reported separately for holiday and non-holiday weeks
holiday_wmae = _masked_wmae(holiday_mask)
non_holiday_wmae = _masked_wmae(non_holiday_mask)

print(f"Holiday Week WMAE: {holiday_wmae}")
print(f"Non-Holiday Week WMAE: {non_holiday_wmae}")

# Visualizing the Decision Tree (Optional)
# The model is grown without a depth limit, so the fitted tree can contain a
# very large number of nodes. Drawing all of them is slow and unreadable, so
# only the top levels are rendered. class_names is omitted because it applies
# to classifiers, not to a regressor.
plt.figure(figsize=(20, 10))
plot_tree(model, max_depth=3, filled=True, feature_names=features, rounded=True)
plt.show()

# Feature Importance: Which features are most important for the model
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(feature_importance_df)


KeyboardInterrupt: 