**Step 1: Problem Definition and Data Understanding**

In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset
data = pd.read_csv('Walmart_Store_sales.csv')  # Replace with the actual file path

# Inspect the columns, data types, and initial statistics.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" to the output.
data.info()
print(data.describe())

# Focus on key features
key_columns = ['Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
print(data[key_columns].head())


**Step 2: Data Cleaning and Preprocessing**

In [None]:
from statsmodels.tsa.stattools import adfuller

# Check for stationarity using Augmented Dickey-Fuller test
def check_stationarity(series, alpha=0.05):
    """Run the Augmented Dickey-Fuller test on a series and print the verdict.

    Args:
        series: pandas Series to test; missing values are dropped first.
        alpha: significance level the p-value is compared against
            (defaults to 0.05).

    Returns:
        bool: False when the p-value is greater than ``alpha`` (reported as
        non-stationary), True otherwise (reported as stationary).
    """
    result = adfuller(series.dropna())
    adf_statistic = result[0]
    p_value = result[1]
    print(f'ADF Statistic: {adf_statistic}')
    print(f'p-value: {p_value}')

    non_stationary = p_value > alpha
    if non_stationary:
        print("Series is non-stationary")
    else:
        print("Series is stationary")
    return not non_stationary

check_stationarity(data['Weekly_Sales'])

# Apply transformations if non-stationary
# First difference of the sales series. The first row has no predecessor and is
# NaN; column assignment aligns on the index, so chaining .dropna() here would
# not remove that row. Call .dropna() where the column is consumed instead.
data['Weekly_Sales_diff'] = data['Weekly_Sales'].diff()


Feature Engineering

In [None]:
# Create new time-based features
# NOTE(review): pd.to_datetime infers the date format here; if the CSV stores
# dates day-first, pass dayfirst=True or an explicit format - confirm against the file.
data['Date'] = pd.to_datetime(data['Date'])
data['WeekOfYear'] = data['Date'].dt.isocalendar().week
data['Month'] = data['Date'].dt.month

# Meteorological seasons keyed by month number.
month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
data['Season'] = data['Month'].map(month_to_season)

# Expand Holiday_Flag to specific holiday flags (example shown with placeholders)
data['Super_Bowl'] = data['Holiday_Flag'] & (data['Date'].isin(['YYYY-MM-DD']))  # Replace with actual Super Bowl dates
data['Labor_Day'] = data['Holiday_Flag'] & (data['Date'].isin(['YYYY-MM-DD']))  # Replace with actual Labor Day dates

# Generate rolling averages and lagged features for Weekly_Sales
# Both features look strictly backwards: Weekly_Sales is the prediction target,
# so the rolling mean is taken over the four PREVIOUS weeks (shift by one first).
# Averaging a window that includes the current week would leak the target.
# NOTE(review): shift/rolling follow row order; this assumes rows form one
# chronologically sorted series. If the file holds several stores, compute
# these per store with groupby - confirm against the data.
previous_sales = data['Weekly_Sales'].shift(1)
data['Weekly_Sales_Rolling'] = previous_sales.rolling(window=4).mean()
data['Weekly_Sales_Lag1'] = previous_sales


Encoding Categorical Features

In [None]:
# Encode categorical features
# Only 'Season' is one-hot encoded. 'Holiday_Flag' is left untouched because
# get_dummies replaces every listed column with its dummy columns, and the
# holiday box plot and the WMAE weighting later read 'Holiday_Flag' by name.
data = pd.get_dummies(data, columns=['Season'], drop_first=True)


Scaling Continuous Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous predictors in place
# NOTE(review): the scaler is fitted on every row, before any train/test split,
# so test-period statistics influence the scaling - consider fitting on the
# training window only.
continuous_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[continuous_cols])
data[continuous_cols] = scaled_values


**Step 3: Exploratory Data Analysis (EDA)**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

# Seasonality and trend analysis
# Build a date-indexed series WITHOUT mutating `data`: the train/test split
# later filters on the 'Date' column, which an in-place set_index would remove.
# Summing per date yields one sorted observation per week; it collapses any
# repeated dates and is a no-op when dates are already unique.
weekly_sales = data.groupby('Date')['Weekly_Sales'].sum()
decomposition = seasonal_decompose(weekly_sales, model='additive', period=52)
decomposition.plot()
plt.show()

# Holiday impact analysis
sns.boxplot(x='Holiday_Flag', y='Weekly_Sales', data=data)
plt.title("Impact of Holidays on Weekly Sales")
plt.show()

# Correlation analysis
# numeric_only=True keeps the datetime 'Date' column out of the matrix.
corr = data.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()


**Step 4: Data Preparation for Modeling**

In [None]:
from sklearn.model_selection import train_test_split

# Train-test split
# Chronological split: rows dated before split_date train the models, the rest
# are held out for evaluation.
split_date = 'YYYY-MM-DD'  # Define split date

# Works whether 'Date' is a regular column or the index of `data`.
dated = data if 'Date' in data.columns else data.reset_index()

# .copy() gives each split its own frame, so the column assignments below do
# not write into a filtered view of `data` (SettingWithCopyWarning).
train_data = dated[dated['Date'] < split_date].copy()
test_data = dated[dated['Date'] >= split_date].copy()

# Handling Non-Stationarity
# The first row of each split has no predecessor and stays NaN; consumers of
# this column should call .dropna() themselves.
train_data['Weekly_Sales_diff'] = train_data['Weekly_Sales'].diff()
test_data['Weekly_Sales_diff'] = test_data['Weekly_Sales'].diff()

# Features and target used by the models below
features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'WeekOfYear', 'Super_Bowl', 'Labor_Day', 'Weekly_Sales_Lag1', 'Weekly_Sales_Rolling']
target = 'Weekly_Sales'

# The lag and rolling features are NaN for their warm-up rows; drop rows with
# missing model inputs so fitting and scoring see complete cases only.
train_data = train_data.dropna(subset=features + [target])
test_data = test_data.dropna(subset=features + [target])


**Step 5: Model Selection and Training**

Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a random forest on the training window
X_train = train_data[features]
y_train = train_data[target]
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out window with mean absolute error
predictions = rf_model.predict(test_data[features])
mae = mean_absolute_error(test_data[target], predictions)
print("MAE:", mae)


ARIMA/SARIMA

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Train ARIMA model
# order=(1, 1, 1) already applies one round of differencing (d=1), so the model
# is fitted on the raw sales. Feeding it the pre-differenced column would
# difference the series twice.
arima_model = ARIMA(train_data['Weekly_Sales'], order=(1, 1, 1))
arima_results = arima_model.fit()
print(arima_results.summary())


**Step 6: Model Evaluation**

Weighted Mean Absolute Error (WMAE)

In [None]:
import numpy as np

# Calculate WMAE
def weighted_mae(y_true, y_pred, weights):
    """Return the weighted mean absolute error of predictions.

    Computes ``sum(weights * |y_true - y_pred|) / sum(weights)``.

    Args:
        y_true: observed values (array-like, e.g. list, ndarray or Series).
        y_pred: predicted values, in the same order as ``y_true``.
        weights: per-observation weights, in the same order as ``y_true``.

    Returns:
        The weighted mean absolute error as a NumPy float.
    """
    # Convert to plain arrays so elements are paired by position. Subtracting
    # two pandas Series pairs them by index label instead, which silently
    # mis-pairs values when the indexes differ. Plain lists also work this way.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    w = np.asarray(weights, dtype=float)
    return np.sum(w * np.abs(actual - predicted)) / np.sum(w)

# Holiday weeks weigh five times as much as ordinary weeks in the error
HOLIDAY_WEIGHT, REGULAR_WEIGHT = 5, 1
holiday_weeks = test_data['Holiday_Flag'] == 1
weights = np.where(holiday_weeks, HOLIDAY_WEIGHT, REGULAR_WEIGHT)

wmae = weighted_mae(test_data[target], predictions, weights)
print("WMAE:", wmae)


Additional Evaluation Metrics

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the forest predictions on the held-out window
mse = mean_squared_error(test_data[target], predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)


**Step 7: Insights and Business Recommendations**

Use the findings above to give Walmart actionable recommendations:

In [None]:
# Insights Summary
# One takeaway per line, printed in order.
insights = [
    "Holiday weeks have significantly higher sales, with strong seasonality around major holidays.",
    "Consider stocking more inventory and increasing promotions around Super Bowl and Labor Day.",
    "Accurate demand predictions can reduce stockouts, optimize inventory, and boost revenue during peak periods.",
]
for insight in insights:
    print(insight)
