# **Demo: Techniques for Constructing Accurate Forecasting Models**

## **Step 1: Load the Dataset**

In [3]:
import warnings
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sqlalchemy import create_engine
from statsmodels.tsa.statespace.sarimax import SARIMAX
warnings.filterwarnings("ignore")

# Connection settings. The URL below uses trusted_connection=yes, so no
# username/password is embedded in the notebook.
server = 'DESKTOP'                 # Server name
database = 'Transactions'          # Database name
driver = 'SQL Server'              # ODBC driver name (contains a space)

# SQLAlchemy connection string. The driver name goes into the URL query
# string, so it is URL-encoded first (space -> '+'); a raw space is not a
# valid URL character and driver names such as "ODBC Driver 17 for SQL Server"
# would otherwise be passed through unescaped.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    f'?driver={quote_plus(driver)}&trusted_connection=yes'
)

# Create the engine (no connection is opened until the first query runs)
engine = create_engine(connection_string)

# Pull every transaction row; date and time are cast separately server-side
# and recombined into a single timestamp during feature engineering.
query = '''
SELECT transaction_id, customer_id, 
       CAST(date AS DATE) as date, 
       CAST(time AS TIME) as time, 
       product_name, category, quantity, price
FROM dbo.Transactions
'''

# Use the engine to connect and execute the query
df = pd.read_sql_query(query, engine)

## **Step 2: Feature Engineering**

In [4]:
# Build a single timestamp from the separate date and time columns.
# Depending on the ODBC driver, DATE/TIME columns may arrive as strings or as
# datetime.date / datetime.time objects; the latter cannot be concatenated
# with ' ' directly. Stringify each value first, leaving missing values
# untouched so they still become NaT instead of the text "None"/"nan".
date_text = df['date'].map(str, na_action='ignore')
time_text = df['time'].map(str, na_action='ignore')
df['datetime'] = pd.to_datetime(date_text + ' ' + time_text)

# Calendar features derived from the timestamp.
df['day_of_week'] = df['datetime'].dt.dayofweek   # Monday=0 ... Sunday=6
df['hour'] = df['datetime'].dt.hour

# Encode the hour cyclically (sine/cosine of its angle on a 24-hour circle)
# so that 23:00 and 00:00 end up close together in feature space.
hour_angle = 2 * np.pi * df['hour'] / 24.0
df['sin_hour'] = np.sin(hour_angle)
df['cos_hour'] = np.cos(hour_angle)

# Model inputs and target.
features = ['customer_id', 'product_name', 'category', 'quantity', 'day_of_week', 'sin_hour', 'cos_hour']
X = df[features]
y = df['price']

# Column groups consumed by the preprocessing step: categorical columns are
# one-hot encoded, numerical columns receive the polynomial expansion.
categorical_features = ['customer_id', 'product_name', 'category', 'day_of_week']
numerical_features = ['quantity', 'sin_hour', 'cos_hour']

## **Step 3:** **Feature Preprocessing**

In [5]:
# Categorical columns -> one-hot indicators; categories not seen during fit
# are ignored (encoded as all zeros) instead of raising at predict time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Numerical columns -> degree-2 polynomial expansion (squares and pairwise
# products), without the constant bias column.
numerical_expander = PolynomialFeatures(degree=2, include_bias=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_features),
        ('num', numerical_expander, numerical_features),
    ]
)

## **Step 4:** **Splitting Data**

In [6]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# (shuffled) split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=44,
)

## **Step 5:** **Initialize Model**

In [7]:
# Baseline regressor; n_estimators is a starting value that the grid search
# in the tuning step overrides. The seed fixes the forest's randomness.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=44,
)

## **Step 6:** **Pipeline Creation**

In [8]:
# Chain preprocessing and the regressor so that encoders are fitted only on
# the data passed to fit(). The step names are the prefixes used for
# hyperparameter keys (e.g. 'regressor__max_depth').
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', model),
]
model_pipeline = Pipeline(steps=pipeline_steps)

## **Step 7:** **Hyperparameter Tuning**

In [9]:
# Candidate hyperparameters for the 'regressor' pipeline step
# (2 x 3 = 6 combinations); max_depth=None lets trees grow unrestricted.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20, None],
)

# Exhaustive search with 3-fold cross-validation on the training split.
# scikit-learn maximises scores, hence the negated MSE.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 200}


## **Step 8:** **Model Prediction**

In [10]:
# Take the pipeline that GridSearchCV refitted on the full training split
# with the winning hyperparameters ...
best_model = grid_search.best_estimator_

# ... and score the held-out rows with it.
predictions = best_model.predict(X_test)

## **Step 9:** **Evaluation**

In [11]:
# Compute all three hold-out metrics with the same (y_true, y_pred) arguments.
mae, mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# NOTE(review): if R^2 comes out extremely close to 1, check for target
# leakage — e.g. price may be almost fully determined by product_name; confirm.
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Absolute Error: 0.6605000020861626
Mean Squared Error: 4.517302517137527
R^2 Score: 0.9999351223356407
