# Sales Forecasting Model Training

This notebook trains and compares three sales forecasting models (ARIMA, Prophet, and Linear Regression) on a synthetic daily sales time series.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set style: apply the 'seaborn-v0_8' matplotlib style, then seaborn's "husl" palette.
# NOTE(review): both calls presumably affect the default colour cycle, so the
# later one likely wins — confirm before reordering them.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Generate Synthetic Sales Data

Create realistic sales data with trends, seasonality, and noise.

In [None]:
def generate_sales_data(n_days=730, start_date='2022-01-01', seed=42, n_promo_days=20):
    """Generate a synthetic daily sales series.

    The series is the sum of a linear trend, three sine-wave seasonal terms
    (7-, 30- and 365-day periods) and Gaussian noise, floored at 100, with a
    handful of randomly placed promotional days boosted by 50%.

    Parameters
    ----------
    n_days : int
        Number of consecutive daily observations to generate.
    start_date : str or datetime-like
        Date of the first observation.
    seed : int or None
        Seed passed to ``np.random.seed`` before any random draw. The default
        (42) reproduces the notebook's original data. Note that this reseeds
        NumPy's *global* generator. Pass ``None`` to leave the global state
        untouched and get a different series on every call.
    n_promo_days : int
        Number of promotional days. Capped at ``n_days`` so that short series
        do not fail when sampling without replacement.

    Returns
    -------
    pandas.DataFrame
        Columns ``'date'`` (daily timestamps) and ``'sales'`` (float).

    Raises
    ------
    ValueError
        If ``n_days`` or ``n_promo_days`` is negative.
    """
    if n_days < 0:
        raise ValueError("n_days must be non-negative")
    if n_promo_days < 0:
        raise ValueError("n_promo_days must be non-negative")

    if seed is not None:
        np.random.seed(seed)

    # Create date range
    dates = pd.date_range(start=start_date, periods=n_days, freq='D')
    day_index = np.arange(n_days)

    # Upward trend from 1000 to 2000 across the whole series
    trend = np.linspace(1000, 2000, n_days)

    # Weekly cycle: a 7-day sine wave. Its phase is fixed by the position in
    # the series, so which weekday peaks depends on start_date.
    weekly_seasonality = 200 * np.sin(2 * np.pi * day_index / 7)

    # Approximate monthly cycle (30-day period)
    monthly_seasonality = 100 * np.sin(2 * np.pi * day_index / 30)

    # Yearly cycle (365-day period); like the weekly term, where in the
    # calendar year it peaks depends on start_date.
    yearly_seasonality = 300 * np.sin(2 * np.pi * day_index / 365)

    # Random noise
    noise = np.random.normal(0, 50, n_days)

    # Combine components and keep sales at or above a floor of 100
    sales = trend + weekly_seasonality + monthly_seasonality + yearly_seasonality + noise
    sales = np.maximum(sales, 100)

    # Promotional spikes: sampling without replacement needs size <= n_days,
    # otherwise np.random.choice raises ValueError.
    promo_count = min(n_promo_days, n_days)
    if promo_count > 0:
        promo_days = np.random.choice(n_days, size=promo_count, replace=False)
        sales[promo_days] *= 1.5

    return pd.DataFrame({
        'date': dates,
        'sales': sales
    })

# Generate data: 730 daily rows, using the function's default start date
df = generate_sales_data(730)
# Sanity report: frame size, covered period, value range and mean of the series
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Sales range: ${df['sales'].min():.2f} to ${df['sales'].max():.2f}")
print(f"Average daily sales: ${df['sales'].mean():.2f}")

## 2. Exploratory Data Analysis

Visualize sales patterns and trends.

In [None]:
# Basic statistics (count, mean, std, quantiles) of the daily sales column
print("Sales Statistics:")
print(df['sales'].describe())

# Check for missing values: total number of null cells across all columns
print(f"\nMissing values: {df.isnull().sum().sum()}")

# Set date as index for time series analysis.
# df_ts is the sales column as a date-indexed Series; the decomposition,
# stationarity test and train/test split below all read from it.
df_ts = df.set_index('date')['sales']

In [None]:
# Four-panel overview of the sales series
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_daily, ax_monthly), (ax_weekday, ax_hist) = axes

# Top-left: the raw daily series
ax_daily.plot(df['date'], df['sales'], alpha=0.7)
ax_daily.set(title='Daily Sales Over Time', xlabel='Date', ylabel='Sales ($)')
ax_daily.tick_params(axis='x', rotation=45)

# Top-right: sales summed per calendar month
# NOTE(review): newer pandas releases reportedly deprecate the 'M' alias in
# favour of 'ME' — confirm against the pandas version this notebook targets.
monthly_sales = df.set_index('date').resample('M')['sales'].sum()
ax_monthly.plot(monthly_sales.index, monthly_sales.values, marker='o')
ax_monthly.set(title='Monthly Sales', xlabel='Date', ylabel='Sales ($)')
ax_monthly.tick_params(axis='x', rotation=45)

# Bottom-left: mean sales per weekday (this adds a 'day_of_week' column to df)
df['day_of_week'] = df['date'].dt.dayofweek
weekly_pattern = df.groupby('day_of_week')['sales'].mean()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax_weekday.bar(day_names, weekly_pattern.values)
ax_weekday.set(title='Average Sales by Day of Week', xlabel='Day of Week', ylabel='Average Sales ($)')

# Bottom-right: histogram of daily sales values
ax_hist.hist(df['sales'], bins=30, alpha=0.7)
ax_hist.set(title='Distribution of Daily Sales', xlabel='Sales ($)', ylabel='Frequency')

plt.tight_layout()
plt.show()

## 3. Time Series Decomposition

Decompose the time series into trend, seasonal, and residual components.

In [None]:
# Additive decomposition of the full series using a 365-day seasonal period
decomposition = seasonal_decompose(df_ts, model='additive', period=365)

# One stacked panel per component, in the order observed/trend/seasonal/residual
fig, axes = plt.subplots(4, 1, figsize=(15, 12))
decomposition_panels = [
    (decomposition.observed, 'Original Sales Data'),
    (decomposition.trend, 'Trend Component'),
    (decomposition.seasonal, 'Seasonal Component'),
    (decomposition.resid, 'Residual Component'),
]
for panel_ax, (component, panel_title) in zip(axes, decomposition_panels):
    panel_ax.plot(component)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Sales ($)')

# Only the bottom panel carries the x-axis label
axes[3].set_xlabel('Date')

plt.tight_layout()
plt.show()

print("Time series decomposition completed!")

## 4. ARIMA Model

Train an ARIMA model for sales forecasting.

In [None]:
# Check stationarity
from statsmodels.tsa.stattools import adfuller

def check_stationarity(timeseries, significance_level=0.05):
    """Run the Augmented Dickey-Fuller test and report whether the series looks stationary.

    Prints the ADF statistic, the p-value, the critical values and a one-line
    verdict, then returns that verdict.

    Parameters
    ----------
    timeseries : array-like
        Observations to test; passed unchanged to ``adfuller``.
    significance_level : float
        p-value threshold at or below which the series is reported as
        stationary. Defaults to 0.05, the value the notebook used originally.

    Returns
    -------
    bool
        True when the p-value is at or below ``significance_level``.
    """
    result = adfuller(timeseries)
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f"\t{key}: {value}")

    # Evaluate the decision once so the printed verdict and the return value
    # cannot drift apart.
    is_stationary = bool(p_value <= significance_level)
    if is_stationary:
        print("Series is stationary")
    else:
        print("Series is not stationary")

    return is_stationary

# Check stationarity of the full sales series (this runs before the train/test split)
is_stationary = check_stationarity(df_ts)
print(f"\nIs series stationary? {is_stationary}")

In [None]:
# Chronological 80/20 split: the earliest 80% of days are used for training,
# the remaining most recent days are held out for evaluation.
train_size = int(len(df) * 0.8)
train_data, test_data = df_ts.iloc[:train_size], df_ts.iloc[train_size:]

print(f"Training data: {len(train_data)} days")
print(f"Test data: {len(test_data)} days")
print(f"Train period: {train_data.index[0]} to {train_data.index[-1]}")
print(f"Test period: {test_data.index[0]} to {test_data.index[-1]}")

In [None]:
# Train ARIMA model on the training split only.
# order=(1, 1, 1) is the (p, d, q) specification; no seasonal order is passed.
print("Training ARIMA model...")
arima_model = ARIMA(train_data, order=(1, 1, 1))
arima_result = arima_model.fit()
print("ARIMA model training completed!")

# Generate forecast: one step per held-out day so it lines up with test_data
forecast_steps = len(test_data)
arima_forecast = arima_result.forecast(steps=forecast_steps)

# Calculate metrics on the held-out period (RMSE is the square root of MSE)
arima_mse = mean_squared_error(test_data, arima_forecast)
arima_mae = mean_absolute_error(test_data, arima_forecast)
arima_rmse = np.sqrt(arima_mse)

print(f"ARIMA RMSE: {arima_rmse:.2f}")
print(f"ARIMA MAE: {arima_mae:.2f}")
print(f"ARIMA MSE: {arima_mse:.2f}")

In [None]:
# ARIMA results: training history, held-out actuals and the forecast on one axis
plt.figure(figsize=(12, 6))
arima_plot_series = [
    (train_data.index, train_data.values, {'label': 'Training Data', 'color': 'blue'}),
    (test_data.index, test_data.values, {'label': 'Actual Test Data', 'color': 'green'}),
    (test_data.index, arima_forecast, {'label': 'ARIMA Forecast', 'color': 'red', 'linestyle': '--'}),
]
for x_values, y_values, line_kwargs in arima_plot_series:
    plt.plot(x_values, y_values, **line_kwargs)
plt.gca().set(title='ARIMA Model Forecast', xlabel='Date', ylabel='Sales ($)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Prophet Model

Train a Prophet model for sales forecasting.

In [None]:
# Prepare data for Prophet: turn the date-indexed training Series into a
# two-column frame and rename the columns to 'ds' (date) and 'y' (sales).
prophet_train = train_data.reset_index()
prophet_train.columns = ['ds', 'y']

# Initialize and train Prophet with yearly and weekly seasonality enabled,
# daily seasonality disabled, and additive seasonality.
print("Training Prophet model...")
prophet_model = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    seasonality_mode='additive'
)
prophet_model.fit(prophet_train)
print("Prophet model training completed!")

# Create future dataframe extended by one period per held-out day
future = prophet_model.make_future_dataframe(periods=len(test_data))

# Generate forecast for every row of `future`
prophet_forecast = prophet_model.predict(future)

# Extract forecast for test period: the last len(test_data) rows.
# NOTE(review): presumably `future` holds the training dates followed by the
# new periods, so the tail aligns with test_data — confirm.
prophet_test_forecast = prophet_forecast.tail(len(test_data))['yhat'].values

# Calculate metrics on the held-out period (RMSE is the square root of MSE)
prophet_mse = mean_squared_error(test_data, prophet_test_forecast)
prophet_mae = mean_absolute_error(test_data, prophet_test_forecast)
prophet_rmse = np.sqrt(prophet_mse)

print(f"Prophet RMSE: {prophet_rmse:.2f}")
print(f"Prophet MAE: {prophet_mae:.2f}")
print(f"Prophet MSE: {prophet_mse:.2f}")

In [None]:
# Prophet results: training history, held-out actuals and the forecast on one axis
plt.figure(figsize=(12, 6))
prophet_plot_series = [
    (train_data.index, train_data.values, {'label': 'Training Data', 'color': 'blue'}),
    (test_data.index, test_data.values, {'label': 'Actual Test Data', 'color': 'green'}),
    (test_data.index, prophet_test_forecast, {'label': 'Prophet Forecast', 'color': 'orange', 'linestyle': '--'}),
]
for x_values, y_values, line_kwargs in prophet_plot_series:
    plt.plot(x_values, y_values, **line_kwargs)
plt.gca().set(title='Prophet Model Forecast', xlabel='Date', ylabel='Sales ($)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Prophet components: one stacked panel per column of the forecast frame
fig, axes = plt.subplots(3, 1, figsize=(12, 10))
component_panels = [
    ('trend', 'Trend Component'),
    ('yearly', 'Yearly Seasonality'),
    ('weekly', 'Weekly Seasonality'),
]
for panel_ax, (component_column, panel_title) in zip(axes, component_panels):
    panel_ax.plot(prophet_forecast['ds'], prophet_forecast[component_column])
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Sales ($)')

# Only the bottom panel carries the x-axis label
axes[2].set_xlabel('Date')

plt.tight_layout()
plt.show()

## 6. Linear Regression Model

Train a Linear Regression model with polynomial features.

In [None]:
# Prepare features for Linear Regression
def _add_calendar_features(frame):
    """Return a copy of `frame` with calendar columns derived from its 'date' column."""
    date_parts = frame['date'].dt
    return frame.assign(
        day_of_week=date_parts.dayofweek,
        month=date_parts.month,
        year=date_parts.year,
        day_of_year=date_parts.dayofyear,
    )


# The longest lag/rolling window is 30 days, so at least 31 training days are
# needed for one complete feature row to survive dropna().
if len(train_data) <= 30:
    raise ValueError(
        f"Need more than 30 training days to build lag features, got {len(train_data)}"
    )

df_train = _add_calendar_features(train_data.reset_index())

# Lag features: sales observed 1, 7 and 30 days before the row's date
df_train['sales_lag_1'] = df_train['sales'].shift(1)
df_train['sales_lag_7'] = df_train['sales'].shift(7)
df_train['sales_lag_30'] = df_train['sales'].shift(30)

# Rolling averages over the days *before* the row's date. Shifting by one
# first keeps the row's own sales (the regression target) out of its
# features; a plain rolling mean would include it and leak the target.
prior_sales = df_train['sales'].shift(1)
df_train['sales_ma_7'] = prior_sales.rolling(window=7).mean()
df_train['sales_ma_30'] = prior_sales.rolling(window=30).mean()

# Drop the leading rows whose lag/rolling windows are incomplete
df_train = df_train.dropna()

# Prepare test data with the same calendar features
df_test = _add_calendar_features(test_data.reset_index())

# Actual sales are unknown during the forecast horizon, so the history-based
# features are frozen at their forecast-origin values, i.e. what each feature
# would be for the first test day. Each one is taken from the matching
# position in the training history so it means the same thing as in training.
# As a result the test predictions vary only through the calendar features.
df_test['sales_lag_1'] = train_data.iloc[-1]
df_test['sales_lag_7'] = train_data.iloc[-7]
df_test['sales_lag_30'] = train_data.iloc[-30]
df_test['sales_ma_7'] = train_data.iloc[-7:].mean()
df_test['sales_ma_30'] = train_data.iloc[-30:].mean()

print("Data preparation completed!")
print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")

In [None]:
# Prepare features and target: calendar, lag and rolling-average columns
# built in the previous cell; the target is the same day's sales.
feature_cols = [
    'day_of_week', 'month', 'year', 'day_of_year',
    'sales_lag_1', 'sales_lag_7', 'sales_lag_30',
    'sales_ma_7', 'sales_ma_30'
]

X_train = df_train[feature_cols]
y_train = df_train['sales']
X_test = df_test[feature_cols]
y_test = df_test['sales']

# Create polynomial features (degree 2, no bias column). The transformer is
# fitted on the training features only and then applied unchanged to the
# test features.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Train Linear Regression model on the expanded training features
print("Training Linear Regression model...")
lr_model = LinearRegression()
lr_model.fit(X_train_poly, y_train)
print("Linear Regression model training completed!")

# Make predictions for the held-out period
lr_predictions = lr_model.predict(X_test_poly)

# Calculate metrics on the held-out period (RMSE is the square root of MSE)
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)

print(f"Linear Regression RMSE: {lr_rmse:.2f}")
print(f"Linear Regression MAE: {lr_mae:.2f}")
print(f"Linear Regression MSE: {lr_mse:.2f}")

In [None]:
# Linear Regression results: training history, held-out actuals and the forecast
plt.figure(figsize=(12, 6))
lr_plot_series = [
    (train_data.index, train_data.values, {'label': 'Training Data', 'color': 'blue'}),
    (test_data.index, test_data.values, {'label': 'Actual Test Data', 'color': 'green'}),
    (test_data.index, lr_predictions, {'label': 'Linear Regression Forecast', 'color': 'purple', 'linestyle': '--'}),
]
for x_values, y_values, line_kwargs in lr_plot_series:
    plt.plot(x_values, y_values, **line_kwargs)
plt.gca().set(title='Linear Regression Model Forecast', xlabel='Date', ylabel='Sales ($)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Model Comparison

Compare the performance of all three models.

In [None]:
# One row per model, one column per error metric (all measured on the test period)
comparison_df = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet', 'Linear Regression'],
    'RMSE': [arima_rmse, prophet_rmse, lr_rmse],
    'MAE': [arima_mae, prophet_mae, lr_mae],
    'MSE': [arima_mse, prophet_mse, lr_mse]
})

print("Model Performance Comparison:")
print(comparison_df.round(2))

# Side-by-side bar charts, one per metric; the bar colours match the forecast
# line colours used in the per-model plots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
bar_colors = ['red', 'orange', 'purple']
metric_panels = [
    ('RMSE', 'RMSE Comparison'),
    ('MAE', 'MAE Comparison'),
]
for panel_ax, (metric_column, panel_title) in zip(axes, metric_panels):
    panel_ax.bar(comparison_df['Model'], comparison_df[metric_column], color=bar_colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(metric_column)

plt.tight_layout()
plt.show()

# The best model is the row with the smallest RMSE
best_row_label = comparison_df['RMSE'].idxmin()
best_model = comparison_df.loc[best_row_label, 'Model']
print(f"\nBest model based on RMSE: {best_model}")

In [None]:
# All three forecasts against the training history and held-out actuals
plt.figure(figsize=(14, 8))
combined_plot_series = [
    (train_data.index, train_data.values, {'label': 'Training Data', 'color': 'blue', 'alpha': 0.7}),
    (test_data.index, test_data.values, {'label': 'Actual Test Data', 'color': 'black', 'linewidth': 2}),
    (test_data.index, arima_forecast, {'label': 'ARIMA Forecast', 'color': 'red', 'linestyle': '--'}),
    (test_data.index, prophet_test_forecast, {'label': 'Prophet Forecast', 'color': 'orange', 'linestyle': '--'}),
    (test_data.index, lr_predictions, {'label': 'Linear Regression Forecast', 'color': 'purple', 'linestyle': '--'}),
]
for x_values, y_values, line_kwargs in combined_plot_series:
    plt.plot(x_values, y_values, **line_kwargs)
plt.gca().set(title='Sales Forecasting Models Comparison', xlabel='Date', ylabel='Sales ($)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 8. Save Models

Save the trained models for deployment.

In [None]:
import joblib

# Each artifact bundles a fitted object with a 'model_type' tag.
arima_data = {
    'model': arima_result,
    'model_type': 'arima'
}
prophet_data = {
    'model': prophet_model,
    'training_data': prophet_train,
    'model_type': 'prophet'
}
# Note: the 'scaler' key holds the PolynomialFeatures transformer, not a scaler.
lr_data = {
    'model': lr_model,
    'scaler': poly_features,
    'feature_names': feature_cols,
    'model_type': 'linear_regression'
}

# Write the artifacts to the parent directory in the order ARIMA, Prophet,
# Linear Regression, confirming each one as it is saved.
saved_artifacts = [
    (arima_data, '../sales_forecast_arima.pkl', "ARIMA model saved!"),
    (prophet_data, '../sales_forecast_prophet.pkl', "Prophet model saved!"),
    (lr_data, '../sales_forecast_linear.pkl', "Linear Regression model saved!"),
]
for artifact, target_path, done_message in saved_artifacts:
    joblib.dump(artifact, target_path)
    print(done_message)

print("\nAll models saved successfully!")

## Summary

This notebook demonstrated:
1. Generation of synthetic sales data with realistic patterns
2. Exploratory data analysis of the sales series
3. Time series decomposition to understand components
4. ARIMA model training and evaluation
5. Prophet model training and component analysis
6. Linear Regression with polynomial features
7. Model comparison and performance evaluation
8. Model saving for deployment

The trained models can now be deployed as an API for sales forecasting.