In [None]:
# Data handling and plotting.
import pandas as pd
import numpy as np  # NOTE(review): `np` is not referenced in the cells visible here
import matplotlib.pyplot as plt
# scikit-learn: train/test split helper, the two regressors, and the error metric.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Reached only if every import above succeeded.
print("[INFO] Libraries imported successfully")

In [None]:
# Read the store-sales CSV from the Kaggle input directory; the file is decoded
# as latin1.
csv_path = '/kaggle/input/store-sales-forecasting-dataset/stores_sales_forecasting.csv'
data = pd.read_csv(csv_path, encoding="latin1")

# Sanity check: (rows, columns) followed by the first five records.
print("[INFO] Dataset loaded, shape:", data.shape)
print(data.head(5))

In [None]:
# List the column labels of the loaded frame; later cells rely on
# 'Order Date' and 'Sales' being present.
print(data.columns)

In [None]:
# Feature engineering for the sales models.
#
# Reads `data` (must contain 'Order Date' and 'Sales'), sorts it by date and adds
# the columns sales_lag_1, month, day_of_week, is_weekend and rolling_avg_3.
# Rows that cannot have complete, leak-free features are removed at the end.
# Re-running this cell without reloading `data` trims further rows from the
# head, so run it once per load.

# Parse Order Date so it can be sorted and decomposed with the .dt accessor
data['Order Date'] = pd.to_datetime(data['Order Date'])

# Chronological order; every lag/rolling feature below depends on it
data = data.sort_values('Order Date')

# Lag feature: Sales of the previous ROW in date order.
# NOTE(review): this is "previous day's sales" only if there is exactly one
# row per date; confirm the granularity of the dataset.
data['sales_lag_1'] = data['Sales'].shift(1)

# Calendar features derived from the order date
data['month'] = data['Order Date'].dt.month
data['day_of_week'] = data['Order Date'].dt.dayofweek  # Monday=0 ... Sunday=6
data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)

# Mean Sales of the three rows preceding the current one; the trailing
# shift(1) keeps the current row's own Sales out of its feature.
data['rolling_avg_3'] = data['Sales'].rolling(window=3).mean().shift(1)

# Drop rows with no usable history instead of back-filling them: bfill() would
# copy later values into the warm-up rows (the first row's lag would equal its
# own Sales) and would also silently fill gaps in every other column with
# future data.
required_columns = ['Order Date', 'Sales', 'sales_lag_1', 'rolling_avg_3']
data = data.dropna(subset=required_columns)

print("[INFO] Preprocessing done. Shape:", data.shape)
print("[INFO] Features created:", ['sales_lag_1', 'month', 'day_of_week', 'is_weekend', 'rolling_avg_3'])


In [None]:
# Model inputs (the five engineered columns) and the target column.
features = [
    'sales_lag_1',
    'month',
    'day_of_week',
    'is_weekend',
    'rolling_avg_3',
]
X, y = data[features], data['Sales']

# shuffle=False keeps the existing row order, so the test set is the final
# 20% of rows rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

print("[INFO] Train size:", len(X_train))
print("[INFO] Test size:", len(X_test))


In [None]:
# Fit two regressors on the training split and score each on the held-out
# rows with mean squared error.

# --- Linear regression ---
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f"[RESULT] Linear Regression MSE: {mse_lr}")

# --- Random forest: 100 trees, seeded for repeatable results ---
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"[RESULT] Random Forest MSE: {mse_rf}")


In [None]:
# Naive baseline: predict each test row's Sales with its lag feature, i.e. the
# value carried over from the preceding row.
baseline_pred = X_test.loc[:, 'sales_lag_1']
mse_baseline = mean_squared_error(y_test, baseline_pred)
print(f"[RESULT] Baseline MSE: {mse_baseline}")

# Relative reduction in MSE achieved by the random forest, as a percentage of
# the baseline error (positive means the forest is better).
relative_gain = (mse_baseline - mse_rf) / mse_baseline
improvement = relative_gain * 100
print(f"[RESULT] Improvement over baseline: {improvement:.2f}%")


In [None]:
# Overlay the actual test-set Sales and the random-forest predictions; the
# x axis is simply the position within the test split.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values, label='Actual Sales')
ax.plot(y_pred_rf, label='Predicted Sales (Random Forest)')
ax.legend()
ax.set_title('Sales Forecasting Model')
ax.set_xlabel('Time Steps')
ax.set_ylabel('Sales')
plt.show()
