In [None]:
!pip install pytorch-forecasting pytorch-lightning optuna dagshub mlflow==2.2.2
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install pytorch-lightning==1.9.5 pytorch-forecasting

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
import warnings
warnings.filterwarnings('ignore')

import getpass
import os
import pickle
import sys
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
import mlflow
import mlflow.pytorch
import dagshub
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

print("All packages imported successfully")


All packages imported successfully


In [None]:
# Tracking credentials are read from the environment; the secret token is never stored in
# the notebook. When the token is missing it is requested interactively (input is hidden).
# NOTE(review): the access token that used to be hard-coded in this cell has been exposed
# in source control and should be revoked/rotated.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'nipkha21')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass.getpass("DagsHub access token: ")

dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/TomC333/ml-walmart-recruiting.mlflow')

print("MLflow setup completed")

experiment_name = "TFT_Training"
# set_experiment activates the experiment and creates it first when it does not exist,
# so no create/except/lookup dance (and no bare `except`) is needed.
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id
print(f"Experiment set: {experiment_name}")

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=cf9435b1-2864-4037-9cdf-477ad232d17c&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=c78af72ee203c48170db5733e2ace5b289ee00bcd1ce90e5eac92cb79cdfd383




MLflow setup completed
Experiment set: TFT_Training


In [None]:
print("Loading raw CSV files...")

# Read the three raw inputs from the working directory in one pass.
features_data, train_data, stores = (
    pd.read_csv(csv_name) for csv_name in ('features.csv', 'train.csv', 'stores.csv')
)

# Report the (rows, columns) shape of each loaded frame.
print(f"Features data shape: {features_data.shape}")
print(f"Train data shape: {train_data.shape}")
print(f"Stores data shape: {stores.shape}")

Loading raw CSV files...
Features data shape: (8190, 12)
Train data shape: (421570, 5)
Stores data shape: (45, 3)


In [None]:
print("Merging datasets...")

# Join sales with the per-store/date features, then with the store attributes.
# Both inputs carry an IsHoliday column, so the merge suffixes them (_x / _y);
# the copy from the features table is discarded and the sales one keeps the plain name.
df = (
    train_data
    .merge(features_data, how='inner', on=['Store', 'Date'])
    .merge(stores, how='inner', on=['Store'])
    .drop(columns=['IsHoliday_y'])
    .rename(columns={'IsHoliday_x': 'IsHoliday'})
)

print(f"Merged data shape: {df.shape}")

Merging datasets...
Merged data shape: (421570, 16)


In [None]:
print("Creating holiday features...")

# One 0/1 indicator column per holiday; Date is still a string here, so the
# week dates are matched as ISO-formatted strings.
df = df.assign(**{
    flag_name: np.where(df['Date'].isin(week_dates), 1, 0)
    for flag_name, week_dates in {
        'Is_SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
        'Is_LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
        'Is_Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
        'Is_Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'],
    }.items()
})

# Parse dates only after the string comparison above, then order rows per series.
df = (
    df
    .assign(Date=pd.to_datetime(df['Date']))
    .sort_values(by=['Store', 'Dept', 'Date'])
)

print("Holiday features created")

Creating holiday features...
Holiday features created


In [None]:
print("Creating lag features...")

lags = [1, 2, 3, 4, 52]

# Shift Weekly_Sales within each (Store, Dept) series by every requested number of rows.
df = df.assign(**{
    f'Weekly_Sales_lag_{lag_steps}': df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag_steps)
    for lag_steps in lags
})

# Row-to-row difference of sales inside each series.
df = df.assign(Sales_diff1=df.groupby(['Store', 'Dept'])['Weekly_Sales'].diff())

print("Lag features created")

Creating lag features...
Lag features created


In [None]:
print("Creating time features...")

# Calendar components derived from the parsed Date column.
df = df.assign(
    Year=df['Date'].dt.year,
    Month=df['Date'].dt.month,
    Week=df['Date'].dt.isocalendar().week,
    Quarter=df['Date'].dt.quarter,
)

print("Time features created")

Creating time features...
Time features created


In [None]:
print("Creating external factor features...")

# Absolute and relative row-to-row change of each economic indicator, per series.
for col in ('Fuel_Price', 'CPI', 'Unemployment'):
    if col not in df.columns:
        continue
    df[f'{col}_change'] = df.groupby(['Store', 'Dept'])[col].diff()
    df[f'{col}_pct_change'] = df.groupby(['Store', 'Dept'])[col].pct_change()

# Lagged copies of every external factor; columns are added lag by lag,
# and within each lag in the order the factors are listed.
df = df.assign(**{
    f'{factor}_lag_{steps}': df.groupby(['Store', 'Dept'])[factor].shift(steps)
    for steps in (1, 2, 4, 8, 12)
    for factor in ('Temperature', 'Fuel_Price', 'CPI', 'Unemployment')
    if factor in df.columns
})

print("External factor features created")

Creating external factor features...
External factor features created


In [None]:
print("Creating outlier features...")

# Flag rows whose sales fall outside fixed bounds.
# NOTE(review): the two thresholds are hard-coded magic numbers — presumably taken from an
# earlier exploratory analysis; confirm their origin and that they still flag any rows.
df['is_outlier'] = ((df['Weekly_Sales'] < -25108.67) | (df['Weekly_Sales'] > 847494.61)).astype(int)

# Share of flagged rows per store / per department, broadcast back onto every row.
outlier_propensity_store = df.groupby('Store')['is_outlier'].mean()
outlier_propensity_dept = df.groupby('Dept')['is_outlier'].mean()
df['store_outlier_propensity'] = df['Store'].map(outlier_propensity_store)
df['dept_outlier_propensity'] = df['Dept'].map(outlier_propensity_dept)

# Previous row's flag within the same (Store, Dept) series.
df['is_outlier_lag1'] = df.groupby(['Store', 'Dept'])['is_outlier'].shift(1)

# Number of flagged rows among the 4 preceding rows of the SAME series.
# The shift must happen inside each group: shifting the concatenated rolling result
# (as done previously) pushed the last value of one series into the first row of the next.
df['outlier_count_last_4weeks'] = (
    df.groupby(['Store', 'Dept'])['is_outlier']
    .transform(lambda flags: flags.shift(1).rolling(4).sum())
)

# The raw flag is derived from the current target value, so it is not kept as a feature.
df.drop('is_outlier', axis=1, inplace=True)

print("Outlier features created")
print(f"Final feature engineering shape: {df.shape}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Stores: {df['Store'].nunique()}, Departments: {df['Dept'].nunique()}")

Creating outlier features...
Outlier features created
Final feature engineering shape: (421570, 60)
Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
Stores: 45, Departments: 81


In [None]:
print("Handling missing values...")

# Cleans `df` in six logged steps and records summary metrics in a dedicated MLflow run.
with mlflow.start_run(run_name="TFT_Data_Cleaning"):
    mlflow.log_param("preprocessing_step", "missing_value_handling")

    # --- 1. MarkDown columns are dropped unconditionally; their missing share is only reported.
    markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    print("1. Dropping markdown columns (missing share shown per column):")
    present_markdown_cols = [col for col in markdown_cols if col in df.columns]
    for col in present_markdown_cols:
        missing_pct = df[col].isnull().mean() * 100
        print(f"{col}: {missing_pct:.1f}% missing")
    df = df.drop(columns=present_markdown_cols)

    print("Markdown columns dropped")

    # --- 2. Diagnostic only: compare NaN counts per series with what the lag size explains.
    print("2. Fixing lag feature missingness:")

    lag_cols = [col for col in df.columns if 'lag' in col.lower()]
    # Count NaNs per (Store, Dept) for all lag columns in a single pass instead of
    # re-sorting and re-grouping the whole frame once per column.
    missing_by_series = df[lag_cols].isnull().groupby([df['Store'], df['Dept']]).sum()
    for col in lag_cols:
        # A trailing integer in the column name is taken as the lag size; otherwise assume 1.
        suffix = col.split('_')[-1]
        expected_missing = int(suffix) if suffix.isdigit() else 1
        problematic_series = int((missing_by_series[col] > expected_missing * 2).sum())

        if problematic_series > 0:
            print(f"{col}: {problematic_series} series with unexpected missing pattern")
        else:
            print(f"{col}: Missing pattern looks normal")

    # --- 3. Change / pct_change features: forward-fill within a series, then zero-fill.
    print("3. Handling external factor derived features:")

    external_factors = ['Fuel_Price', 'CPI', 'Unemployment', 'Temperature']
    external_derived_cols = [
        col for col in df.columns
        if 'change' in col and any(factor in col for factor in external_factors)
    ]

    for col in external_derived_cols:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            print(f"{col}: {missing_count} missing values")
            df[col] = df.groupby(['Store', 'Dept'])[col].ffill()
            df[col] = df[col].fillna(0)
            print(f"Fixed {col}")

    # --- 4. Outlier-derived features: missing means "no history yet", encoded as 0.
    print("4. Checking outlier features:")
    outlier_cols = [col for col in df.columns if 'outlier' in col.lower()]
    for col in outlier_cols:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            print(f"{col}: {missing_count} missing - needs attention")
            df[col] = df[col].fillna(0)
        else:
            print(f"{col}: No missing values")

    # --- 5. Drop columns that are mostly empty, then impute whatever is left.
    print("5. Final missing value cleanup:")

    missing_share = df.isnull().mean() * 100
    high_missing_cols = missing_share[missing_share > 50].index.tolist()

    if high_missing_cols:
        print(f"Dropping columns with >50% missing: {high_missing_cols}")
        df = df.drop(columns=high_missing_cols)

    remaining_missing = df.isnull().sum()
    remaining_missing = remaining_missing[remaining_missing > 0]

    if len(remaining_missing) > 0:
        print("Remaining missing values:")
        print(remaining_missing)

        for col in remaining_missing.index:
            # Any numeric dtype (not just int64/float64) takes the numeric imputation path.
            if pd.api.types.is_numeric_dtype(df[col]):
                # NOTE(review): the backward fill copies later values of a series into its
                # earliest rows; for lagged sales this may expose values the model should
                # not see at those time steps — confirm this is acceptable.
                df[col] = df.groupby(['Store', 'Dept'])[col].ffill()
                df[col] = df.groupby(['Store', 'Dept'])[col].bfill()
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df.groupby(['Store', 'Dept'])[col].ffill()
                col_modes = df[col].mode()
                df[col] = df[col].fillna(col_modes[0] if len(col_modes) > 0 else 'Unknown')

            print(f"Fixed {col}")

    # --- 6. Verify nothing is left and log the summary.
    print("6. Final verification:")
    final_missing = int(df.isnull().sum().sum())
    if final_missing == 0:
        print("All missing values handled successfully!")
    else:
        print(f"{final_missing} missing values still remain")
        still_missing = df.isnull().sum()
        print(still_missing[still_missing > 0])

    print(f"Final dataset shape: {df.shape}")

    mlflow.log_metric("total_observations", len(df))
    mlflow.log_metric("total_stores", df['Store'].nunique())
    mlflow.log_metric("total_departments", df['Dept'].nunique())
    mlflow.log_metric("total_features", len(df.columns))
    mlflow.log_metric("missing_values_final", final_missing)

print("Data preprocessing completed successfully!")
def prepare_tft_data(df):
    """Shape the engineered frame into the layout the TFT dataset builder consumes.

    Works on a sorted copy of ``df``: adds a weekly integer ``time_idx`` and a
    ``group_id`` per (Store, Dept), label-encodes Store/Dept/Type into string
    categoricals, floors ``Weekly_Sales`` at 0, casts the time-varying categoricals
    to strings, keeps only series with at least 30 rows and removes ``Sales_diff1``.

    Returns:
        tuple: ``(df_tft, static_categoricals, static_reals, time_varying_categoricals,
        time_varying_reals_known, time_varying_reals_unknown)`` where the last five
        items are lists of column names.
    """
    # Feature role lists handed back to the caller.
    static_categoricals = ['Store_encoded', 'Dept_encoded', 'Type_encoded']
    static_reals = ['Size', 'store_outlier_propensity', 'dept_outlier_propensity']
    time_varying_categoricals = ['IsHoliday', 'Is_SuperBowl', 'Is_LaborDay',
                                'Is_Thanksgiving', 'Is_Christmas', 'Year', 'Month', 'Quarter']
    time_varying_reals_known = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                               'Week', 'time_idx']
    time_varying_reals_unknown = ['Weekly_Sales_lag_1', 'Weekly_Sales_lag_2',
                             'Weekly_Sales_lag_3', 'Weekly_Sales_lag_4', 'Weekly_Sales_lag_52',
                             'Temperature_lag_1', 'Fuel_Price_lag_1', 'CPI_lag_1', 'Unemployment_lag_1',
                             'Fuel_Price_change', 'CPI_change', 'Unemployment_change',
                             'is_outlier_lag1', 'outlier_count_last_4weeks']

    df_tft = df.sort_values(['Store', 'Dept', 'Date']).copy()

    # Whole weeks elapsed since the earliest date in the data.
    elapsed = df_tft['Date'] - df_tft['Date'].min()
    df_tft['time_idx'] = elapsed.dt.days // 7

    # One series identifier per store/department pair.
    df_tft['group_id'] = df_tft['Store'].astype(str) + '_' + df_tft['Dept'].astype(str)

    # Integer label codes for the static identifiers (converted to strings further down).
    for source_col in ('Store', 'Dept', 'Type'):
        df_tft[f'{source_col}_encoded'] = LabelEncoder().fit_transform(df_tft[source_col])

    # Negative sales are replaced by 0 in the target.
    df_tft['Weekly_Sales'] = df_tft['Weekly_Sales'].clip(lower=0)

    # Categorical inputs are passed on as strings.
    for encoded_col in static_categoricals:
        df_tft[encoded_col] = df_tft[encoded_col].astype(str)

    for col in time_varying_categoricals:
        if col not in df_tft.columns:
            continue
        values = df_tft[col]
        if values.dtype == 'bool':
            # Booleans become '0' / '1' rather than 'False' / 'True'.
            values = values.astype(int)
        df_tft[col] = values.astype(str)

    # Keep only series with at least 30 observations.
    series_lengths = df_tft['group_id'].value_counts()
    valid_groups = series_lengths.index[series_lengths >= 30]
    df_tft = df_tft[df_tft['group_id'].isin(valid_groups)]

    # Sales_diff1 is removed when present.
    df_tft = df_tft.drop(columns=['Sales_diff1'], errors='ignore')

    print(f"Data prepared for TFT. Shape: {df_tft.shape}")
    print(f"Valid groups: {len(valid_groups)}")
    print(f"Time range: {df_tft['time_idx'].min()} to {df_tft['time_idx'].max()}")

    return (
        df_tft,
        static_categoricals,
        static_reals,
        time_varying_categoricals,
        time_varying_reals_known,
        time_varying_reals_unknown,
    )

Handling missing values...
Dropping markdown columns with >90% missing values:
Markdown columns dropped
2. Fixing lag feature missingness:
Weekly_Sales_lag_1: Missing pattern looks normal
Weekly_Sales_lag_2: Missing pattern looks normal
Weekly_Sales_lag_3: Missing pattern looks normal
Weekly_Sales_lag_4: Missing pattern looks normal
Weekly_Sales_lag_52: Missing pattern looks normal
Temperature_lag_1: Missing pattern looks normal
Fuel_Price_lag_1: Missing pattern looks normal
CPI_lag_1: Missing pattern looks normal
Unemployment_lag_1: Missing pattern looks normal
Temperature_lag_2: Missing pattern looks normal
Fuel_Price_lag_2: Missing pattern looks normal
CPI_lag_2: Missing pattern looks normal
Unemployment_lag_2: Missing pattern looks normal
Temperature_lag_4: Missing pattern looks normal
Fuel_Price_lag_4: Missing pattern looks normal
CPI_lag_4: Missing pattern looks normal
Unemployment_lag_4: Missing pattern looks normal
Temperature_lag_8: Missing pattern looks normal
Fuel_Price_lag_

In [None]:
# Run the TFT data preparation and record the resulting dataset size in its own MLflow run.
with mlflow.start_run(run_name="TFT_Data_Preparation"):
    mlflow.log_param("preprocessing_step", "data_preparation")

    (
        df_tft,
        static_categoricals,
        static_reals,
        time_varying_categoricals,
        time_varying_reals_known,
        time_varying_reals_unknown,
    ) = prepare_tft_data(df)

    # Number of distinct series and rows that survived preparation.
    group_total = df_tft['group_id'].nunique()
    row_total = len(df_tft)

    mlflow.log_metric("total_groups", group_total)
    mlflow.log_metric("total_observations", row_total)
    mlflow.log_metric("avg_observations_per_group", row_total / group_total)

    print("Data preparation completed and logged to MLflow")

Data prepared for TFT. Shape: (419022, 59)
Valid groups: 3053
Time range: 0 to 142
Data preparation completed and logged to MLflow


In [None]:
def create_tft_dataset(df_tft, static_categoricals, static_reals, time_varying_categoricals,
                      time_varying_reals_known, time_varying_reals_unknown,
                      max_encoder_length=8, max_prediction_length=1):
    """Build the training and validation ``TimeSeriesDataSet`` objects.

    Feature lists are first restricted to columns that exist in ``df_tft``. The
    training set uses rows with ``time_idx`` up to the maximum minus
    ``max_prediction_length``; the validation set is derived from the training
    set definition over the full frame with ``predict=True``.

    Returns:
        tuple: ``(training, validation)``.
    """
    existing_columns = set(df_tft.columns)

    def only_existing(column_names):
        # Drop any requested feature that is absent from the frame.
        return [name for name in column_names if name in existing_columns]

    # Everything after the cutoff is held back from training.
    training_cutoff = df_tft['time_idx'].max() - max_prediction_length
    training_rows = df_tft.loc[df_tft['time_idx'] <= training_cutoff]

    # The target is normalised per series with a softplus transformation.
    target_scaler = GroupNormalizer(groups=['group_id'], transformation="softplus")

    training = TimeSeriesDataSet(
        training_rows,
        time_idx='time_idx',
        target='Weekly_Sales',
        group_ids=['group_id'],
        min_encoder_length=max_encoder_length // 2,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=only_existing(static_categoricals),
        static_reals=only_existing(static_reals),
        time_varying_known_categoricals=only_existing(time_varying_categoricals),
        time_varying_known_reals=only_existing(time_varying_reals_known),
        time_varying_unknown_reals=only_existing(time_varying_reals_unknown),
        target_normalizer=target_scaler,
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True,
    )

    # Reuse the training set's configuration for the validation samples.
    validation = TimeSeriesDataSet.from_dataset(
        training, df_tft, predict=True, stop_randomization=True
    )

    print(f"Training dataset: {len(training)} samples")
    print(f"Validation dataset: {len(validation)} samples")

    return training, validation

In [None]:
# Build the datasets and dataloaders and log their configuration in a dedicated MLflow run.
with mlflow.start_run(run_name="TFT_Dataset_Creation"):
    mlflow.log_param("preprocessing_step", "dataset_creation")
    mlflow.log_param("max_encoder_length", 8)
    mlflow.log_param("max_prediction_length", 1)

    training, validation = create_tft_dataset(
        df_tft,
        static_categoricals,
        static_reals,
        time_varying_categoricals,
        time_varying_reals_known,
        time_varying_reals_unknown,
        max_encoder_length=8,
        max_prediction_length=1,
    )

    batch_size = 128
    # Validation runs without gradients, so it uses a batch twice as large.
    train_dataloader = training.to_dataloader(
        train=True, batch_size=batch_size, num_workers=4
    )
    val_dataloader = validation.to_dataloader(
        train=False, batch_size=2 * batch_size, num_workers=4
    )

    mlflow.log_param("batch_size", batch_size)
    mlflow.log_metric("train_batches", len(train_dataloader))
    mlflow.log_metric("val_batches", len(val_dataloader))

    print("Datasets and dataloaders created successfully")


Training dataset: 418352 samples
Validation dataset: 3024 samples
Datasets and dataloaders created successfully


In [None]:
def create_tft_model(training_dataset, learning_rate=0.001, hidden_size=32,
                     attention_head_size=1, dropout=0.1, hidden_continuous_size=16,
                     weight_decay=1e-2, reduce_on_plateau_patience=3, loss=None,
                     **model_kwargs):
    """Create a Temporal Fusion Transformer configured from a training dataset.

    Called with only ``training_dataset`` (and optionally ``learning_rate``) this builds
    exactly the baseline configuration; the remaining keyword arguments let other
    experiments vary the architecture without duplicating the constructor call.

    Args:
        training_dataset: dataset passed to ``TemporalFusionTransformer.from_dataset``.
        learning_rate: optimizer learning rate.
        hidden_size: ``hidden_size`` forwarded to the model.
        attention_head_size: ``attention_head_size`` forwarded to the model.
        dropout: ``dropout`` forwarded to the model.
        hidden_continuous_size: ``hidden_continuous_size`` forwarded to the model.
        weight_decay: ``weight_decay`` forwarded to the model.
        reduce_on_plateau_patience: ``reduce_on_plateau_patience`` forwarded to the model.
        loss: loss module; a fresh ``QuantileLoss()`` is used when omitted.
        **model_kwargs: extra keyword arguments forwarded to ``from_dataset``; they
            override the logging defaults (``log_interval=50``, ``log_val_interval=10``).

    Returns:
        The model returned by ``TemporalFusionTransformer.from_dataset``.
    """
    # Logging defaults of the baseline; caller-supplied values take precedence.
    forwarded_kwargs = {"log_interval": 50, "log_val_interval": 10}
    forwarded_kwargs.update(model_kwargs)

    model = TemporalFusionTransformer.from_dataset(
        training_dataset,
        hidden_size=hidden_size,
        attention_head_size=attention_head_size,
        dropout=dropout,
        hidden_continuous_size=hidden_continuous_size,
        # Instantiate the default loss per call so models never share a loss module.
        loss=QuantileLoss() if loss is None else loss,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        reduce_on_plateau_patience=reduce_on_plateau_patience,
        **forwarded_kwargs,
    )

    return model

In [None]:
# Phase 1: train the baseline TFT, keep the best checkpoint (lowest val_loss) and
# produce validation predictions for evaluation.
with mlflow.start_run(run_name="TFT_Phase1_Baseline"):
    mlflow.log_param("phase", "Phase1_Baseline")
    mlflow.log_param("model_type", "TFT")
    mlflow.log_param("feature_set", "minimal")

    print("Phase 1: Training baseline TFT model...")

    tft_model = create_tft_model(training, learning_rate=0.001)
    print(f"Model type: {type(tft_model)}")
    print(f"Is LightningModule: {isinstance(tft_model, pl.LightningModule)}")
    mlflow.log_param("hidden_size", 32)
    mlflow.log_param("attention_head_size", 1)
    mlflow.log_param("dropout", 0.1)
    mlflow.log_param("learning_rate", 0.001)

    # Stop when val_loss has not improved by at least 1e-4 for 5 validation checks.
    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=1e-4,
        patience=5,
        verbose=False,
        mode="min"
    )

    # Keep only the single best checkpoint according to val_loss.
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=False
    )

    trainer = pl.Trainer(
        max_epochs=25,
        accelerator="gpu",
        devices=1,
        gradient_clip_val=0.1,
        callbacks=[early_stop_callback, checkpoint_callback],
        enable_checkpointing=True,
        logger=False,
    )

    # A single fit call: the previous try/except retried an equivalent call after
    # swallowing the error, which could only hide the original failure and train twice.
    trainer.fit(
        model=tft_model,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )

    # Reload the best weights rather than using the last-epoch model.
    best_model_path = checkpoint_callback.best_model_path
    best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

    val_predictions = best_tft.predict(val_dataloader, return_y=True)


Phase 1: Training baseline TFT model...
Model type: <class 'pytorch_forecasting.models.temporal_fusion_transformer._tft.TemporalFusionTransformer'>
Is LightningModule: True


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:lightning.pytorch.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for p

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AttributeError: 'Tensor' object has no attribute 'prediction'

In [None]:
# Evaluate the Phase 1 validation predictions and log metrics + model to MLflow.
y_true = val_predictions.y[0].cpu().numpy()
# Depending on the prediction mode the output is either a plain tensor or an object
# exposing the tensor as `.prediction`.
if hasattr(val_predictions.output, 'prediction'):
    y_pred = val_predictions.output.prediction.cpu().numpy()
else:
    y_pred = val_predictions.output.cpu().numpy()

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

# Logging outside of a run context makes MLflow open an anonymous run that stays active.
# Instead, close whatever is active (no-op when nothing is) and re-open the most recent
# run of this session — the training run when cells are executed in order — so that
# metrics and model sit next to its parameters and the run is closed again afterwards.
mlflow.end_run()
previous_run = mlflow.last_active_run()
if previous_run is not None:
    run_kwargs = {"run_id": previous_run.info.run_id}
else:
    run_kwargs = {"run_name": "TFT_Phase1_Baseline"}

with mlflow.start_run(**run_kwargs):
    mlflow.log_metric("val_mae", mae)
    mlflow.log_metric("val_rmse", rmse)
    mlflow.log_metric("epochs_trained", trainer.current_epoch)

    mlflow.pytorch.log_model(best_tft, "tft_model")

print(f"Phase 1 Results - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

phase1_results = {"mae": mae, "rmse": rmse, "model": best_tft}



Phase 1 Results - MAE: 2039.10, RMSE: 4102.40


In [None]:
print("Loading Phase 1 results from previous run...")

# MLflow run that holds the logged Phase 1 model and metrics.
phase1_run_id = "cb450db0a58948dc98b432b7d3eb65b6"

try:
    phase1_model = mlflow.pytorch.load_model(f"runs:/{phase1_run_id}/tft_model")

    phase1_run = mlflow.get_run(phase1_run_id)
    logged_metrics = phase1_run.data.metrics
    phase1_mae = logged_metrics.get('val_mae', None)
    phase1_rmse = logged_metrics.get('val_rmse', None)

    # Recompute both metrics from the loaded model when either one was not logged.
    if not (phase1_mae is not None and phase1_rmse is not None):
        print("Warning: Phase 1 metrics not found in MLflow, will re-evaluate...")
        val_predictions_phase1 = phase1_model.predict(val_dataloader, return_y=True)
        y_true_phase1 = val_predictions_phase1.y[0].cpu().numpy()
        y_pred_phase1 = (
            val_predictions_phase1.output.prediction.cpu().numpy()
            if hasattr(val_predictions_phase1.output, 'prediction')
            else val_predictions_phase1.output.cpu().numpy()
        )

        phase1_mae = mean_absolute_error(y_true_phase1, y_pred_phase1)
        phase1_rmse = np.sqrt(mean_squared_error(y_true_phase1, y_pred_phase1))

    phase1_results = {"mae": phase1_mae, "rmse": phase1_rmse, "model": phase1_model}
    print(f"✅ Phase 1 loaded: MAE = {phase1_mae:.2f}, RMSE = {phase1_rmse:.2f}")

except Exception as load_error:
    # Best effort: report the problem and leave a None marker instead of stopping the notebook.
    print(f"❌ Error loading Phase 1: {load_error}")
    print("You'll need to re-run Phase 1 or check the run_id")
    phase1_results = None

Loading Phase 1 results from previous run...
✅ Phase 1 loaded: MAE = 2039.10, RMSE = 4102.40


In [None]:
# End whichever MLflow run is currently active (if any) before the next cell opens
# a new run with `mlflow.start_run(...)`.
mlflow.end_run()

In [None]:
# Phase 2: train a second TFT (learning rate 0.0008) and evaluate its best checkpoint.
# NOTE(review): this phase is labelled "enhanced features" but is built from the same
# `training` dataset as the baseline — confirm whether a different feature set was intended.
with mlflow.start_run(run_name="TFT_Phase2_Enhanced_Features"):
    mlflow.log_param("phase", "Phase2_Enhanced_Features")
    mlflow.log_param("model_type", "TFT")
    mlflow.log_param("feature_set", "enhanced_with_lags")

    print("Phase 2: Training TFT with enhanced features...")

    tft_model_enhanced = create_tft_model(training, learning_rate=0.0008)

    mlflow.log_param("hidden_size", 32)
    mlflow.log_param("learning_rate", 0.0008)
    mlflow.log_param("features_used", "all_available")

    # Callbacks are stateful (best score, patience counter, best checkpoint path).
    # Reusing the Phase 1 instances makes training stop almost immediately and leaves
    # `best_model_path` pointing at the Phase 1 checkpoint, so this phase would just
    # re-evaluate the baseline. Each phase therefore gets its own fresh instances.
    early_stop_enhanced = EarlyStopping(
        monitor="val_loss",
        min_delta=1e-4,
        patience=5,
        verbose=False,
        mode="min",
    )
    checkpoint_enhanced = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=False,
    )

    trainer_enhanced = pl.Trainer(
        max_epochs=20,
        accelerator="gpu",
        devices=1,
        gradient_clip_val=0.1,
        callbacks=[early_stop_enhanced, checkpoint_enhanced],
        enable_checkpointing=True,
        logger=False,
    )

    trainer_enhanced.fit(
        tft_model_enhanced,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )

    # Reload the best weights saved during THIS phase.
    best_model_path_enhanced = checkpoint_enhanced.best_model_path
    best_tft_enhanced = TemporalFusionTransformer.load_from_checkpoint(best_model_path_enhanced)

    val_predictions_enhanced = best_tft_enhanced.predict(val_dataloader, return_y=True)

    y_true_enhanced = val_predictions_enhanced.y[0].cpu().numpy()
    # The output is either a plain tensor or an object exposing it as `.prediction`.
    if hasattr(val_predictions_enhanced.output, 'prediction'):
        y_pred_enhanced = val_predictions_enhanced.output.prediction.cpu().numpy()
    else:
        y_pred_enhanced = val_predictions_enhanced.output.cpu().numpy()

    mae_enhanced = mean_absolute_error(y_true_enhanced, y_pred_enhanced)
    rmse_enhanced = np.sqrt(mean_squared_error(y_true_enhanced, y_pred_enhanced))

    mlflow.log_metric("val_mae", mae_enhanced)
    mlflow.log_metric("val_rmse", rmse_enhanced)
    mlflow.log_metric("epochs_trained", trainer_enhanced.current_epoch)

    mlflow.pytorch.log_model(best_tft_enhanced, "tft_model")

    print(f"Phase 2 Results - MAE: {mae_enhanced:.2f}, RMSE: {rmse_enhanced:.2f}")

    phase2_results = {"mae": mae_enhanced, "rmse": rmse_enhanced, "model": best_tft_enhanced}

Phase 2: Training TFT with enhanced features...


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 2.1 K  | train
3  | prescalers       

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Phase 2 Results - MAE: 2039.10, RMSE: 4102.40


In [None]:
# Phase 3: train a larger TFT (hidden size 64, 2 attention heads) and evaluate its
# best checkpoint on the validation set.
with mlflow.start_run(run_name="TFT_Phase3_Optimized"):
    mlflow.log_param("phase", "Phase3_Optimized")
    mlflow.log_param("model_type", "TFT")
    mlflow.log_param("feature_set", "optimized_architecture")

    print("Phase 3: Training TFT with optimized architecture...")

    tft_model_optimized = TemporalFusionTransformer.from_dataset(
        training,
        hidden_size=64,
        attention_head_size=2,
        dropout=0.15,
        hidden_continuous_size=32,
        loss=QuantileLoss(),
        learning_rate=0.0005,
        weight_decay=1e-2,
        log_interval=50,
        reduce_on_plateau_patience=4,
    )

    mlflow.log_param("hidden_size", 64)
    mlflow.log_param("attention_head_size", 2)
    mlflow.log_param("learning_rate", 0.0005)
    mlflow.log_param("dropout", 0.15)

    # Callbacks are stateful (best score, patience counter, best checkpoint path).
    # Reusing instances from an earlier phase makes training stop almost immediately and
    # leaves `best_model_path` pointing at the earlier checkpoint, so the reported metrics
    # would belong to the old model. Each phase therefore gets its own fresh instances.
    early_stop_optimized = EarlyStopping(
        monitor="val_loss",
        min_delta=1e-4,
        patience=5,
        verbose=False,
        mode="min",
    )
    checkpoint_optimized = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=False,
    )

    trainer_optimized = pl.Trainer(
        max_epochs=25,
        accelerator="gpu",
        devices=1,
        gradient_clip_val=0.1,
        callbacks=[early_stop_optimized, checkpoint_optimized],
        enable_checkpointing=True,
        logger=False,
    )

    trainer_optimized.fit(
        tft_model_optimized,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )

    # Reload the best weights saved during THIS phase.
    best_model_path_optimized = checkpoint_optimized.best_model_path
    best_tft_optimized = TemporalFusionTransformer.load_from_checkpoint(best_model_path_optimized)

    val_predictions_optimized = best_tft_optimized.predict(val_dataloader, return_y=True)

    y_true_optimized = val_predictions_optimized.y[0].cpu().numpy()
    # The output is either a plain tensor or an object exposing it as `.prediction`.
    if hasattr(val_predictions_optimized.output, 'prediction'):
        y_pred_optimized = val_predictions_optimized.output.prediction.cpu().numpy()
    else:
        y_pred_optimized = val_predictions_optimized.output.cpu().numpy()

    mae_optimized = mean_absolute_error(y_true_optimized, y_pred_optimized)
    rmse_optimized = np.sqrt(mean_squared_error(y_true_optimized, y_pred_optimized))

    mlflow.log_metric("val_mae", mae_optimized)
    mlflow.log_metric("val_rmse", rmse_optimized)
    mlflow.log_metric("epochs_trained", trainer_optimized.current_epoch)

    mlflow.pytorch.log_model(best_tft_optimized, "tft_model")

    print(f"Phase 3 Results - MAE: {mae_optimized:.2f}, RMSE: {rmse_optimized:.2f}")

    phase3_results = {"mae": mae_optimized, "rmse": rmse_optimized, "model": best_tft_optimized}



Phase 3: Training TFT with optimized architecture...


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 2.1 K  | train
3  | prescalers       

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Phase 3 Results - MAE: 2039.10, RMSE: 4102.40


In [None]:
# Phase 4: train a wider TFT with a higher learning rate and heavier regularisation,
# then evaluate the best checkpoint on the validation dataloader and log it to MLflow.
#
# The hyperparameters live in dictionaries so that the values used to build the
# model/trainer and the values logged to MLflow cannot drift apart.
phase4_model_params = {
    "hidden_size": 128,
    "attention_head_size": 4,
    "dropout": 0.3,
    "hidden_continuous_size": 64,
    "learning_rate": 0.01,
    "weight_decay": 1e-3,
    "log_interval": 50,
    "reduce_on_plateau_patience": 2,
}
phase4_trainer_params = {
    "max_epochs": 50,
    "gradient_clip_val": 1.0,
}
phase4_early_stopping_params = {
    "min_delta": 1e-3,
    "patience": 15,
}

with mlflow.start_run(run_name="TFT_Phase4_Aggressive_Learning"):
    mlflow.log_param("phase", "Phase4_Aggressive_Learning")
    mlflow.log_param("model_type", "TFT")
    mlflow.log_param("feature_set", "aggressive_hyperparams")

    # Log every setting that shapes this run (superset of the keys logged before:
    # hidden_size, learning_rate, dropout, attention_head_size).
    mlflow.log_params(phase4_model_params)
    mlflow.log_params(phase4_trainer_params)
    mlflow.log_params(
        {f"early_stopping_{key}": value for key, value in phase4_early_stopping_params.items()}
    )

    print("Phase 4: Training TFT with aggressive hyperparameters...")

    tft_model_aggressive = TemporalFusionTransformer.from_dataset(
        training,
        loss=QuantileLoss(),
        **phase4_model_params,
    )

    # Fresh callback instances for this phase: both callbacks keep state
    # (best score, best checkpoint path) between calls to fit().
    early_stop_aggressive = EarlyStopping(
        monitor="val_loss",
        verbose=True,
        mode="min",
        **phase4_early_stopping_params,
    )

    checkpoint_aggressive = ModelCheckpoint(
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        verbose=True
    )

    trainer_aggressive = pl.Trainer(
        accelerator="gpu",
        devices=1,
        callbacks=[early_stop_aggressive, checkpoint_aggressive],
        enable_checkpointing=True,
        logger=False,
        **phase4_trainer_params,
    )

    trainer_aggressive.fit(
        tft_model_aggressive,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )

    best_model_path_aggressive = checkpoint_aggressive.best_model_path
    if not best_model_path_aggressive:
        # best_model_path is an empty string when no checkpoint was ever written.
        raise RuntimeError("Phase 4 produced no checkpoint; cannot evaluate the best model.")
    best_tft_aggressive = TemporalFusionTransformer.load_from_checkpoint(best_model_path_aggressive)

    val_predictions_aggressive = best_tft_aggressive.predict(val_dataloader, return_y=True)

    # y is indexed with [0] to get the target tensor; the prediction may be wrapped
    # in an object exposing `.prediction` or be the tensor itself, so handle both.
    y_true_aggressive = val_predictions_aggressive.y[0].cpu().numpy()
    if hasattr(val_predictions_aggressive.output, 'prediction'):
        y_pred_aggressive = val_predictions_aggressive.output.prediction.cpu().numpy()
    else:
        y_pred_aggressive = val_predictions_aggressive.output.cpu().numpy()

    mae_aggressive = mean_absolute_error(y_true_aggressive, y_pred_aggressive)
    rmse_aggressive = np.sqrt(mean_squared_error(y_true_aggressive, y_pred_aggressive))

    mlflow.log_metric("val_mae", mae_aggressive)
    mlflow.log_metric("val_rmse", rmse_aggressive)
    mlflow.log_metric("epochs_trained", trainer_aggressive.current_epoch)

    mlflow.pytorch.log_model(best_tft_aggressive, "tft_model")

    print(f"Phase 4 Results - MAE: {mae_aggressive:.2f}, RMSE: {rmse_aggressive:.2f}")

    phase4_results = {"mae": mae_aggressive, "rmse": rmse_aggressive, "model": best_tft_aggressive}

# No explicit mlflow.end_run() is needed: leaving the `with` block closes the run.

Phase 4: Training TFT with aggressive hyperparameters...


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 2.1 K  | train
3  | prescalers       

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved. New best score: 1162.591
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved. New best score: 1162.591
INFO: Epoch 0, global step 3268: 'val_loss' reached 1162.59070 (best 1162.59070), saving model to '/content/checkpoints/epoch=0-step=3268.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 0, global step 3268: 'val_loss' reached 1162.59070 (best 1162.59070), saving model to '/content/checkpoints/epoch=0-step=3268.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 80.146 >= min_delta = 0.001. New best score: 1082.445
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 80.146 >= min_delta = 0.001. New best score: 1082.445
INFO: Epoch 1, global step 6536: 'val_loss' reached 1082.44495 (best 1082.44495), saving model to '/content/checkpoints/epoch=1-step=6536-v1.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 1, global step 6536: 'val_loss' reached 1082.44495 (best 1082.44495), saving model to '/content/checkpoints/epoch=1-step=6536-v1.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 62.017 >= min_delta = 0.001. New best score: 1020.428
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 62.017 >= min_delta = 0.001. New best score: 1020.428
INFO: Epoch 2, global step 9804: 'val_loss' reached 1020.42798 (best 1020.42798), saving model to '/content/checkpoints/epoch=2-step=9804.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 2, global step 9804: 'val_loss' reached 1020.42798 (best 1020.42798), saving model to '/content/checkpoints/epoch=2-step=9804.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 101.290 >= min_delta = 0.001. New best score: 919.138
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 101.290 >= min_delta = 0.001. New best score: 919.138
INFO: Epoch 3, global step 13072: 'val_loss' reached 919.13812 (best 919.13812), saving model to '/content/checkpoints/epoch=3-step=13072.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 3, global step 13072: 'val_loss' reached 919.13812 (best 919.13812), saving model to '/content/checkpoints/epoch=3-step=13072.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 4, global step 16340: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 4, global step 16340: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 5, global step 19608: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 5, global step 19608: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 6, global step 22876: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 6, global step 22876: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 7, global step 26144: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 7, global step 26144: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 21.855 >= min_delta = 0.001. New best score: 897.284
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 21.855 >= min_delta = 0.001. New best score: 897.284
INFO: Epoch 8, global step 29412: 'val_loss' reached 897.28357 (best 897.28357), saving model to '/content/checkpoints/epoch=8-step=29412.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 8, global step 29412: 'val_loss' reached 897.28357 (best 897.28357), saving model to '/content/checkpoints/epoch=8-step=29412.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 9, global step 32680: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 9, global step 32680: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 10, global step 35948: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 10, global step 35948: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 11, global step 39216: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 11, global step 39216: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 68.669 >= min_delta = 0.001. New best score: 828.615
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 68.669 >= min_delta = 0.001. New best score: 828.615
INFO: Epoch 12, global step 42484: 'val_loss' reached 828.61481 (best 828.61481), saving model to '/content/checkpoints/epoch=12-step=42484.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 12, global step 42484: 'val_loss' reached 828.61481 (best 828.61481), saving model to '/content/checkpoints/epoch=12-step=42484.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 13, global step 45752: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 13, global step 45752: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 14, global step 49020: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 14, global step 49020: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 15, global step 52288: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 15, global step 52288: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 16, global step 55556: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 16, global step 55556: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 17, global step 58824: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 17, global step 58824: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Metric val_loss improved by 4.611 >= min_delta = 0.001. New best score: 824.003
INFO:lightning.pytorch.callbacks.early_stopping:Metric val_loss improved by 4.611 >= min_delta = 0.001. New best score: 824.003
INFO: Epoch 18, global step 62092: 'val_loss' reached 824.00342 (best 824.00342), saving model to '/content/checkpoints/epoch=18-step=62092.ckpt' as top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 18, global step 62092: 'val_loss' reached 824.00342 (best 824.00342), saving model to '/content/checkpoints/epoch=18-step=62092.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 19, global step 65360: 'val_loss' was not in top 1
INFO:lightning.pytorch.utilities.rank_zero:Epoch 19, global step 65360: 'val_loss' was not in top 1
INFO: 
Detected KeyboardInterrupt, attempting graceful shutdown ...
INFO:lightning.pytorch.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# Recovery cell: finish the Phase 4 evaluation after training was stopped by hand.
#
# It reuses `checkpoint_aggressive` and `trainer_aggressive` from the Phase 4
# training cell. The MLflow run opened there is closed as soon as that cell raises,
# so bare mlflow.log_* calls here would silently create a new, unnamed run with no
# parameters. An explicit, named run keeps the results traceable.
print("Completing Phase 4 evaluation after manual stop...")

best_model_path_aggressive = checkpoint_aggressive.best_model_path
if not best_model_path_aggressive:
    # best_model_path is an empty string when no checkpoint was ever written.
    raise RuntimeError("No Phase 4 checkpoint was saved; rerun the Phase 4 training cell.")

if mlflow.active_run() is not None:
    # start_run() refuses to open a second top-level run, so close any leftover one.
    mlflow.end_run()

with mlflow.start_run(run_name="TFT_Phase4_Aggressive_Learning_Recovery"):
    mlflow.log_param("phase", "Phase4_Aggressive_Learning")
    mlflow.log_param("model_type", "TFT")
    mlflow.log_param("feature_set", "aggressive_hyperparams")
    mlflow.log_param("resumed_after_interrupt", True)
    mlflow.log_param("checkpoint_path", best_model_path_aggressive)

    best_tft_aggressive = TemporalFusionTransformer.load_from_checkpoint(best_model_path_aggressive)

    val_predictions_aggressive = best_tft_aggressive.predict(val_dataloader, return_y=True)

    # y is indexed with [0] to get the target tensor; the prediction may be wrapped
    # in an object exposing `.prediction` or be the tensor itself, so handle both.
    y_true_aggressive = val_predictions_aggressive.y[0].cpu().numpy()
    if hasattr(val_predictions_aggressive.output, 'prediction'):
        y_pred_aggressive = val_predictions_aggressive.output.prediction.cpu().numpy()
    else:
        y_pred_aggressive = val_predictions_aggressive.output.cpu().numpy()

    mae_aggressive = mean_absolute_error(y_true_aggressive, y_pred_aggressive)
    rmse_aggressive = np.sqrt(mean_squared_error(y_true_aggressive, y_pred_aggressive))

    mlflow.log_metric("val_mae", mae_aggressive)
    mlflow.log_metric("val_rmse", rmse_aggressive)
    mlflow.log_metric("epochs_trained", trainer_aggressive.current_epoch)

    mlflow.pytorch.log_model(best_tft_aggressive, "tft_model")

    print(f"Phase 4 Results - MAE: {mae_aggressive:.2f}, RMSE: {rmse_aggressive:.2f}")

    phase4_results = {"mae": mae_aggressive, "rmse": rmse_aggressive, "model": best_tft_aggressive}

print("✅ Phase 4 completed successfully!")

Completing Phase 4 evaluation after manual stop...


INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Phase 4 Results - MAE: 1495.59, RMSE: 2899.15
✅ Phase 4 completed successfully!


In [None]:
# Collect the result dict of every phase under a readable label, print a
# MAE/RMSE summary line per phase, and remember the phase with the lowest MAE.
all_results = dict(
    Phase1_Baseline=phase1_results,
    Phase2_Enhanced_Features=phase2_results,
    Phase3_Optimized=phase3_results,
    Phase4_Aggressive=phase4_results,
)

print("\nComparison of all phases:")
print("-" * 50)

# best_phase stays None if no phase beats the initial +inf sentinel.
best_phase, best_mae = None, float('inf')

for phase_name, results in all_results.items():
    mae, rmse = results['mae'], results['rmse']
    print(f"{phase_name}: MAE = {mae:.2f}, RMSE = {rmse:.2f}")

    # Strict comparison: on a tie the phase listed first remains the winner.
    if mae < best_mae:
        best_phase, best_mae = phase_name, mae

print(f"\nBest performing phase: {best_phase} with MAE: {best_mae:.2f}")


Comparison of all phases:
--------------------------------------------------
Phase1_Baseline: MAE = 2039.10, RMSE = 4102.40
Phase2_Enhanced_Features: MAE = 2039.10, RMSE = 4102.40
Phase3_Optimized: MAE = 2039.10, RMSE = 4102.40
Phase4_Aggressive: MAE = 1495.59, RMSE = 2899.15

Best performing phase: Phase4_Aggressive with MAE: 1495.59


In [None]:
# Register the best-performing phase's model in MLflow together with the
# preprocessing configuration and the pickled training dataset object.
# The success messages are printed only when a model was actually registered.
if best_phase:
    best_model = all_results[best_phase]['model']
    best_rmse = all_results[best_phase]['rmse']

    with mlflow.start_run(run_name=f"TFT_Best_Model_{best_phase}"):
        mlflow.log_param("phase_name", best_phase)
        mlflow.log_param("model_type", "TFT")
        mlflow.log_param("is_best_model", True)

        # NOTE(review): the encoder/prediction lengths and the normalizer label are
        # hard-coded here; confirm they match how `training` was actually built.
        preprocessing_config = {
            "max_encoder_length": 8,
            "max_prediction_length": 1,
            "batch_size": batch_size,
            "target_normalizer": "GroupNormalizer_softplus",
            "static_categoricals": static_categoricals,
            "static_reals": static_reals,
            "time_varying_categoricals": time_varying_categoricals,
            "time_varying_reals_known": time_varying_reals_known,
            "time_varying_reals_unknown": time_varying_reals_unknown
        }

        mlflow.log_dict(preprocessing_config, "preprocessing_config.json")

        mlflow.log_metric("best_mae", best_mae)
        mlflow.log_metric("best_rmse", best_rmse)

        # Logging with registered_model_name also creates a new version in the
        # MLflow model registry.
        mlflow.pytorch.log_model(
            best_model,
            "tft_model",
            registered_model_name="TFT_Sales_Forecasting"
        )

        # Persist the dataset object next to the model as a run artifact.
        # Only unpickle this artifact from a trusted tracking server:
        # pickle.load executes arbitrary code.
        with open('/tmp/training_dataset.pkl', 'wb') as f:
            pickle.dump(training, f)
        mlflow.log_artifact('/tmp/training_dataset.pkl', 'dataset')

        run_id = mlflow.active_run().info.run_id
        print(f"Best TFT model registered with run_id: {run_id}")
        print(f"Configuration: {best_phase}")
        print(f"Performance: MAE = {best_mae:.2f}, RMSE = {best_rmse:.2f}")

    print("\nTFT experiment completed successfully!")
    print("Model saved and registered in MLflow for inference")
else:
    # Previously the success messages were printed even on this path.
    print("\nNo best phase was selected; no model was registered in MLflow.")

Successfully registered model 'TFT_Sales_Forecasting'.
2025/08/02 11:02:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: TFT_Sales_Forecasting, version 1
Created version '1' of model 'TFT_Sales_Forecasting'.


Best TFT model registered with run_id: 8754a74108674177bc0fa5e3dfd992be
Configuration: Phase4_Aggressive
Performance: MAE = 1495.59, RMSE = 2899.15

TFT experiment completed successfully!
Model saved and registered in MLflow for inference
