In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.seasonal import seasonal_decompose
import os

class DemandForecasting:
    def __init__(self):
        self.raw_data = self.generate_synthetic_sales_data()
        self.preprocessed_data = None
        self.model = None

    def generate_synthetic_sales_data(self, n_periods=1000):
        """Generate synthetic time series sales data"""
        np.random.seed(42)

        # Create date range
        dates = pd.date_range(start='2020-01-01', periods=n_periods)

        # Synthetic features
        product_categories = ['Organic Food', 'Supplements', 'Health Drinks']
        data = {
            'date': dates,
            'product_category': np.random.choice(product_categories, n_periods),
            'base_sales': np.random.normal(1000, 200, n_periods),
            'seasonality_factor': np.sin(np.linspace(0, 4*np.pi, n_periods)) * 200 + 200,
            'economic_indicator': np.random.normal(100, 20, n_periods),
            'marketing_spend': np.random.normal(5000, 1000, n_periods)
        }

        # Calculate total sales with noise and factors
        df = pd.DataFrame(data)
        df['total_sales'] = (df['base_sales'] +
                              df['seasonality_factor'] +
                              df['economic_indicator']/10 +
                              df['marketing_spend']/100 +
                              np.random.normal(0, 100, n_periods))

        # Ensure directories exist
        os.makedirs('data', exist_ok=True)
        os.makedirs('output', exist_ok=True)

        # Export synthetic dataset
        df.to_csv('data/healthypact_sales_data.csv', index=False)
        return df

    def create_preprocessing_pipeline(self):
        # Identify numeric and categorical columns
        numeric_features = [
            'base_sales',
            'seasonality_factor',
            'economic_indicator',
            'marketing_spend'
        ]
        categorical_features = ['product_category']

        # Create preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(drop='first'), categorical_features)
            ])

        return preprocessor

    def preprocess_data(self):
        # Feature engineering
        self.raw_data['month'] = self.raw_data['date'].dt.month
        self.raw_data['quarter'] = self.raw_data['date'].dt.quarter

        # Prepare features and target
        X = self.raw_data[['product_category', 'base_sales', 'seasonality_factor',
                            'economic_indicator', 'marketing_spend', 'month', 'quarter']]
        y = self.raw_data['total_sales']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create preprocessing pipeline
        preprocessor = self.create_preprocessing_pipeline()

        return X_train, X_test, y_train, y_test, preprocessor

    def train_model(self, X_train, y_train, preprocessor):
        # Create full pipeline with preprocessing and model
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
        ])

        # Fit the pipeline
        pipeline.fit(X_train, y_train)
        return pipeline

    def evaluate_model(self, pipeline, X_test, y_test):
        # Predict and score
        predictions = pipeline.predict(X_test)
        score = pipeline.score(X_test, y_test)

        # Plotting
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, predictions)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.title('Actual vs Predicted Sales')
        plt.xlabel('Actual Sales')
        plt.ylabel('Predicted Sales')
        plt.savefig('output/sales_prediction_scatter.png')
        plt.close()

        return score

    def plot_seasonality(self):
        # Seasonal decomposition
        decomposition = seasonal_decompose(self.raw_data['total_sales'], period=12)

        plt.figure(figsize=(12, 8))
        plt.subplot(411)
        plt.title('Seasonal Decomposition of Sales Data')
        plt.plot(decomposition.observed)
        plt.subplot(412)
        plt.plot(decomposition.trend)
        plt.subplot(413)
        plt.plot(decomposition.seasonal)
        plt.subplot(414)
        plt.plot(decomposition.resid)
        plt.tight_layout()
        plt.savefig('output/sales_seasonality_decomposition.png')
        plt.close()

def main():
    forecast = DemandForecasting()

    # Preprocess data
    X_train, X_test, y_train, y_test, preprocessor = forecast.preprocess_data()

    # Train model
    pipeline = forecast.train_model(X_train, y_train, preprocessor)

    # Evaluate model
    model_performance = forecast.evaluate_model(pipeline, X_test, y_test)
    print(f"Model R² Score: {model_performance}")

    # Plot seasonality
    forecast.plot_seasonality()

if __name__ == "__main__":
    main()

Model R² Score: 0.8263391802397241
