In [14]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.statespace.sarimax import SARIMAX

In [15]:
# Directory that holds the raw CSV inputs (train.csv, stores.csv, features.csv).
# Set the RETAIL_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset, the original local folder is used.
data_path = os.environ.get(
    "RETAIL_DATA_DIR",
    "C:/Users/sarim/Python data cleaning/Personal projects/Machine Learning project Retail sales forecast/Data/",
)

# Load the three raw tables; they are combined into a single frame in the next cell.
train = pd.read_csv(os.path.join(data_path, "train.csv"))
stores = pd.read_csv(os.path.join(data_path, "stores.csv"))
features = pd.read_csv(os.path.join(data_path, "features.csv"))

In [16]:

# Combine the three tables: store attributes are attached by Store, then the
# feature rows are attached by (Store, Date).
# NOTE(review): pandas suffixes any non-key column that exists on both sides of
# a merge with _x/_y — confirm whether train/stores and features share columns.
sales_with_stores = pd.merge(train, stores, on='Store')
df = pd.merge(sales_with_stores, features, on=['Store', 'Date'])

# Turn the Date column into datetime values
df['Date'] = pd.to_datetime(df['Date'])

# Put the rows in chronological order
df = df.sort_values(by='Date')


In [17]:
# Write the merged, date-sorted frame next to the raw inputs (row index omitted).
cleaned_csv_path = os.path.join(data_path, "cleaned_retail_data.csv")
df.to_csv(cleaned_csv_path, index=False)


In [18]:
# The merged data is now sorted by date and saved to disk as cleaned_retail_data.csv.


In [19]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session: no category, message or
# module filter is passed, so nothing is exempt.
# NOTE(review): presumably meant to quiet noisy model-fitting warnings further
# down — confirm, and consider a narrower filter so unrelated warnings stay visible.
warnings.filterwarnings("ignore")

In [20]:

# Every distinct (Store, Dept) combination that occurs in the merged data
store_dept_pairs = df.loc[:, ['Store', 'Dept']].drop_duplicates()

# Accumulator for forecast records; it is a plain list and starts out empty
forecast_results = list()

print(forecast_results)


[]


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd

# Settings for the per-series SARIMAX fits, gathered here so they can be tuned in one place.
MIN_OBSERVATIONS = 30                   # series with fewer rows than this are skipped
SARIMA_ORDER = (1, 1, 1)                # non-seasonal (p, d, q)
SARIMA_SEASONAL_ORDER = (1, 1, 1, 52)   # seasonal (P, D, Q, s); s=52 presumably means one year of weekly rows — TODO confirm
FORECAST_STEPS = 12                     # number of future periods to predict

# One record per successfully fitted (Store, Dept) pair:
# {'Store': ..., 'Dept': ..., 'Forecast': numpy array of FORECAST_STEPS values}
forecast_results = []

# Loop through all store and department combinations.
# itertuples keeps each column's own dtype (iterrows builds a mixed-dtype row
# Series, which can upcast the ids).
for store_id, dept_id in store_dept_pairs[['Store', 'Dept']].itertuples(index=False, name=None):
    print(f"🔄 Training on Store {store_id}, Dept {dept_id}")

    # Sales history for this pair, ordered by date and indexed by Date.
    # A dedicated local name is used so the raw `train` DataFrame loaded at the
    # top of the notebook is not overwritten (earlier cells stay re-runnable).
    pair_mask = (df['Store'] == store_id) & (df['Dept'] == dept_id)
    weekly_sales = (
        df.loc[pair_mask]
        .sort_values('Date')
        .set_index('Date')['Weekly_Sales']
    )

    # Skip short series. This also covers the empty case, so no separate
    # emptiness check is needed.
    if len(weekly_sales) < MIN_OBSERVATIONS:
        print(f"⚠️ Not enough data for Store {store_id}, Dept {dept_id}")
        continue

    try:
        # Fit on the full history of the pair.
        # NOTE(review): the Date index has no declared frequency here, so any
        # gaps between rows are invisible to the model — confirm the rows are
        # evenly spaced for every pair.
        model = SARIMAX(
            weekly_sales,
            order=SARIMA_ORDER,
            seasonal_order=SARIMA_SEASONAL_ORDER,
        )
        result = model.fit(disp=False)

        # Predict the next FORECAST_STEPS periods
        forecast = result.forecast(steps=FORECAST_STEPS)

        # Keep the raw values; the later aggregation cell adds these arrays together
        forecast_results.append({
            'Store': store_id,
            'Dept': dept_id,
            'Forecast': forecast.values
        })

        print(f"✅ Done: Store {store_id}, Dept {dept_id}")

    except Exception as e:
        # Best effort: a failed fit for one pair must not stop the whole run;
        # the error is reported and the pair is left out of forecast_results.
        print(f"❌ Skipped Store {store_id}, Dept {dept_id} - Error: {e}")

🔄 Training on Store 1, Dept 1
✅ Done: Store 1, Dept 1
🔄 Training on Store 35, Dept 3
✅ Done: Store 35, Dept 3
🔄 Training on Store 35, Dept 4
✅ Done: Store 35, Dept 4
🔄 Training on Store 35, Dept 5
✅ Done: Store 35, Dept 5
🔄 Training on Store 35, Dept 6
✅ Done: Store 35, Dept 6
🔄 Training on Store 35, Dept 7
✅ Done: Store 35, Dept 7
🔄 Training on Store 35, Dept 8
✅ Done: Store 35, Dept 8
🔄 Training on Store 35, Dept 9
✅ Done: Store 35, Dept 9
🔄 Training on Store 35, Dept 10
✅ Done: Store 35, Dept 10
🔄 Training on Store 35, Dept 2
✅ Done: Store 35, Dept 2
🔄 Training on Store 35, Dept 11
✅ Done: Store 35, Dept 11
🔄 Training on Store 35, Dept 13
✅ Done: Store 35, Dept 13
🔄 Training on Store 35, Dept 14
✅ Done: Store 35, Dept 14
🔄 Training on Store 35, Dept 16
✅ Done: Store 35, Dept 16
🔄 Training on Store 35, Dept 17
✅ Done: Store 35, Dept 17
🔄 Training on Store 35, Dept 18
✅ Done: Store 35, Dept 18
🔄 Training on Store 35, Dept 20
✅ Done: Store 35, Dept 20
🔄 Training on Store 35, Dept 21
✅ 

In [None]:
# Collect the per-pair forecast records into a table. The columns are named
# explicitly so 'Store', 'Dept' and 'Forecast' exist even when forecast_results
# is empty (e.g. every fit was skipped); without this an empty list yields a
# frame with no columns and the later groupby('Store') fails.
forecast_df = pd.DataFrame(forecast_results, columns=['Store', 'Dept', 'Forecast'])


In [None]:
# Each 'Forecast' cell holds a whole array of values. Writing that column
# directly would store the array's string representation, which cannot be read
# back as numbers, so the array is spread over one numeric column per step
# (Forecast_Week_1, Forecast_Week_2, ...) before export.
if 'Forecast' in forecast_df.columns:
    horizon_values = pd.DataFrame(forecast_df['Forecast'].tolist(), index=forecast_df.index)
    horizon_values.columns = [
        f"Forecast_Week_{step}" for step in range(1, horizon_values.shape[1] + 1)
    ]
    forecast_export = pd.concat(
        [forecast_df.drop(columns='Forecast'), horizon_values], axis=1
    )
else:
    # Nothing was forecast: export the (empty) table unchanged.
    forecast_export = forecast_df

# index=False matches the cleaned-data export and avoids a meaningless counter column.
forecast_export.to_csv(os.path.join(data_path, "store_dept_forecasts.csv"), index=False)


In [None]:
import matplotlib.pyplot as plt

# Use the first stored forecast record as the example to visualise
# (any other entry of forecast_results could be chosen instead)
example = forecast_results[0]

# All observed rows for that same store/department, ordered by date
is_example_pair = (df['Store'] == example['Store']) & (df['Dept'] == example['Dept'])
actual_sales = df[is_example_pair].sort_values('Date')


In [None]:

# Plot the last 50 observed weeks followed by the forecast
recent_actuals = actual_sales.tail(50)
forecast_values = example['Forecast']

# Dates for the forecast points: one step every 7 days, starting one week after
# the last observed date. A plain 7-day step is used instead of freq='W'
# because 'W' is anchored on Sundays: a start date on any other weekday is
# snapped forward to the next Sunday, and dropping the first element then
# shifts every forecast point away from the true weekly grid.
last_actual_date = actual_sales['Date'].max()
forecast_dates = pd.date_range(
    start=last_actual_date + pd.Timedelta(weeks=1),
    periods=len(forecast_values),  # follows the forecast length; no hard-coded 13
    freq='7D',
)

plt.figure(figsize=(12,6))
plt.plot(recent_actuals['Date'], recent_actuals['Weekly_Sales'], label='Actual Sales')
plt.plot(forecast_dates, forecast_values, label='Forecast', linestyle='--')

plt.title(f"Store {example['Store']}, Dept {example['Dept']}")
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

def _sum_across_departments(dept_forecasts):
    """Add together all 'Forecast' entries belonging to one store.

    The entries are combined with the built-in sum, i.e. with ``+`` applied
    to the stored forecast objects one after another.
    """
    return sum(dept_forecasts.tolist())


# One row per store: the combined forecast of all its departments
store_forecasts = (
    forecast_df.groupby('Store')['Forecast']
    .apply(_sum_across_departments)
    .reset_index()
)

# Draw one line per store on a shared figure
plt.figure(figsize=(14, 8))
for store, combined_forecast in zip(store_forecasts['Store'], store_forecasts['Forecast']):
    plt.plot(combined_forecast, label=f"Store {store}")

plt.title("Sales Forecast for All Stores (Summed Across Depts)")
plt.xlabel("Week")
plt.ylabel("Predicted Sales")
plt.legend(ncol=4, fontsize='small')  # multi-column, small font: there may be many stores
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Total sales per store, largest first, limited to the ten biggest
sales_per_store = df.groupby('Store')['Weekly_Sales'].sum()
top_stores = sales_per_store.sort_values(ascending=False).head(10)

# Bar chart of those ten totals
ax = top_stores.plot(kind='bar', figsize=(10,5), title='Top 10 Stores by Total Sales')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Add calendar month and year columns derived from Date (written onto df itself)
date_parts = df['Date'].dt
df['Month'] = date_parts.month
df['Year'] = date_parts.year

# Sum sales per (Year, Month), then move Year into the columns so that each
# year becomes its own line over the months
monthly_sales = (
    df.groupby(['Year', 'Month'])['Weekly_Sales']
    .sum()
    .unstack(level='Year')
)

ax = monthly_sales.plot(figsize=(12,6), title='Monthly Sales Trends (2010–2012)')
ax.set_ylabel('Sales')
ax.set_xlabel('Month')
ax.set_xticks(range(1,13))  # one tick per calendar month
ax.grid(True)
plt.show()