## Package Installation

In [1]:
import os
import sys
import types

# Install core dependencies (includes upgrades from Task3)
!pip install easy-tpp lightning pytorch-lightning hydra-core omegaconf torchmetrics stribor -q

print("✓ Core dependencies installed")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m846.0/846.0 kB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.5/849.5 kB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h✓ Core dependencies installed


In [2]:
# Import required libraries
import os

# Set PyTorch memory allocation config BEFORE importing torch
# This helps prevent OOM errors with large models like AttNHP
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import yaml
import pandas as pd
import numpy as np
from datetime import datetime
from easy_tpp.config_factory import Config
from easy_tpp.runner import Runner
from google.colab import drive
import json
from IPython.display import display, clear_output
import time

In [3]:
# ==============================================================================
# FULLYNN GRADIENT FIX - Monkey Patch
# ==============================================================================
# FullyNN uses torch.autograd.grad() to compute intensity derivatives, but during
# validation/evaluation, EasyTPP runs under torch.no_grad() context which disables
# gradient tracking. This patch wraps the computation in torch.enable_grad().
# ==============================================================================

import torch
from easy_tpp.model.torch_model import torch_fullynn

def patched_compute_intensities_at_sample_times(self, time_seqs, time_delta_seqs, type_seqs, sample_dtimes, **kwargs):
    """Evaluate intensities at sampled inter-event times with autograd forced on.

    The whole computation runs inside ``torch.enable_grad()`` and the sample
    times are replaced by a detached copy that requires grad, so the call
    also works when the caller is inside ``torch.no_grad()``.

    Args:
        self: Object providing ``forward(...)`` and ``layer_intensity.forward(...)``.
        time_seqs, time_delta_seqs, type_seqs: Forwarded unchanged to ``self.forward``.
        sample_dtimes: Tensor whose last dimension indexes the sample times.
        **kwargs: ``compute_last_step_only`` (default ``False``) keeps only the
            final sequence position of the result.

    Returns:
        The second output of ``self.layer_intensity.forward``, detached from
        the graph; sliced to ``[:, -1:, :, :]`` when ``compute_last_step_only``.
    """
    last_step_only = kwargs.get('compute_last_step_only', False)

    with torch.enable_grad():
        # Encode the history once
        encoded = self.forward(
            time_seqs=time_seqs,
            time_delta_seqs=time_delta_seqs,
            type_seqs=type_seqs,
        )

        n_samples = sample_dtimes.size()[-1]
        n_batch, n_steps, n_hidden = encoded.shape

        # Broadcast every hidden state across the sample axis (no data copy)
        tiled_states = encoded.unsqueeze(-2).expand(n_batch, n_steps, n_samples, n_hidden)

        # Fresh leaf tensor: the caller's tensor is never flagged as requiring grad
        dtimes_leaf = sample_dtimes.clone().detach().requires_grad_(True)

        _, intensities = self.layer_intensity.forward(
            hidden_states=tiled_states,
            time_delta_seqs=dtimes_leaf,
        )

    # Callers only need the values, so cut the result out of the graph
    intensities = intensities.detach()

    return intensities[:, -1:, :, :] if last_step_only else intensities

# Apply the monkey patch: rebind the method on the FullyNN class itself, so the
# replacement is what gets called on FullyNN instances from here on.
torch_fullynn.FullyNN.compute_intensities_at_sample_times = patched_compute_intensities_at_sample_times
print("✓ FullyNN patched for gradient computation during evaluation")
print("  This fixes the 'element 0 of tensors does not require grad' error")

✓ FullyNN patched for gradient computation during evaluation
  This fixes the 'element 0 of tensors does not require grad' error


In [4]:
# Mount Google Drive (must happen before any path below is touched)
drive.mount('/content/drive')

# Base directory (your Drive location); every other path is derived from it
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/MilestoneFall2025'
DATASET_DIR = os.path.join(BASE_DIR, 'Datasets')        # holds <dataset>/{train,dev,test}.pkl
CHECKPOINT_DIR = os.path.join(BASE_DIR, 'checkpoints')  # passed to EasyTPP as base_dir
RESULTS_DIR = os.path.join(BASE_DIR, 'results/Task1')   # CSV outputs of this notebook

# Create results directory if it doesn't exist.
# Only RESULTS_DIR is created here; the other directories are not.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Echo the resolved locations so a wrong Drive layout is visible immediately
print(f"✓ Base Directory: {BASE_DIR}")
print(f"✓ Dataset Directory: {DATASET_DIR}")
print(f"✓ Checkpoint Directory: {CHECKPOINT_DIR}")
print(f"✓ Results Directory: {RESULTS_DIR}")

Mounted at /content/drive
✓ Base Directory: /content/drive/MyDrive/Colab Notebooks/MilestoneFall2025
✓ Dataset Directory: /content/drive/MyDrive/Colab Notebooks/MilestoneFall2025/Datasets
✓ Checkpoint Directory: /content/drive/MyDrive/Colab Notebooks/MilestoneFall2025/checkpoints
✓ Results Directory: /content/drive/MyDrive/Colab Notebooks/MilestoneFall2025/results/Task1


## Configuration Dictionary

### Dataset Configuration

In [5]:
# Data specifications for each dataset.
# Only two values differ between datasets: the number of event types and the
# maximum sequence length. The pad token id always equals the number of event
# types, and every dataset is stored as train/dev/test pickles.
_DATASET_SHAPES = {
    # name: (num_event_types, max_seq_len)
    "taxi": (10, 100),
    "amazon": (16, 100),
    "taobao": (17, 150),
    "stackoverflow": (22, 100),
    "retweet": (3, 100),
}


def _build_data_spec(name, num_event_types, max_seq_len):
    """Return the data section for dataset `name` located under DATASET_DIR."""
    def split_file(filename):
        return os.path.join(DATASET_DIR, name, filename)

    return {
        "data_format": "pkl",
        "train_dir": split_file("train.pkl"),
        "valid_dir": split_file("dev.pkl"),
        "test_dir": split_file("test.pkl"),
        "data_specs": {
            "num_event_types": num_event_types,
            "pad_token_id": num_event_types,
            "padding_side": "right",
            "max_seq_len": max_seq_len,
            "strict_pad_leng": True
        }
    }


data_spec_dict = {
    name: _build_data_spec(name, n_types, seq_len)
    for name, (n_types, seq_len) in _DATASET_SHAPES.items()
}

print("✓ Data specifications loaded for 5 datasets")

✓ Data specifications loaded for 5 datasets


### Model Configuration

In [6]:
# Model specifications.
# All seven models share the same backbone sizes, seed and thinning-sampler
# settings; the builders below hold those shared values and each entry only
# states what is specific to that model.
def _thinning_spec(**extra):
    """Return a fresh thinning-sampler config, optionally extended with `extra`."""
    spec = {
        "num_seq": 10,
        "num_sample": 1,
        "num_exp": 500,
        "look_ahead_time": 10,
        "patience_counter": 5,
        "over_sample_rate": 5,
        "num_samples_boundary": 5,
        "dtime_max": 5
    }
    spec.update(extra)
    return spec


def _model_spec(model_id, *, num_heads=2, thinning_extra=None, **overrides):
    """Return the config for `model_id`: shared settings plus `overrides`."""
    spec = {
        "model_id": model_id,
        "hidden_size": 32,
        "time_embed_size": 16,
        "num_layers": 2,
        "num_heads": num_heads,
        "dropout": 0.0,
        "use_ln": False,
        "seed": 2019
    }
    spec.update(overrides)
    # A new dict per model, so no two models share one thinning object
    spec["thinning"] = _thinning_spec(**(thinning_extra or {}))
    return spec


# Monte-Carlo sample counts used by every model except FullyNN
_MC_SAMPLING = {
    "loss_integral_num_sample_per_step": 20,
    "mc_num_sample_per_step": 20
}
# Same, plus the layer-sharing switch set by SAHP, THP, IntensityFree and AttNHP
_MC_SAMPLING_NO_SHARING = dict(_MC_SAMPLING, sharing_param_layer=False)

model_spec_dict = {
    "RMTPP": _model_spec("RMTPP", **_MC_SAMPLING),
    "NHP": _model_spec("NHP", **_MC_SAMPLING),
    "FullyNN": _model_spec(
        "FullyNN",
        model_specs={
            "num_mlp_layers": 3,
            "proper_marked_intensities": True
        },
    ),
    "SAHP": _model_spec("SAHP", **_MC_SAMPLING_NO_SHARING),
    "THP": _model_spec("THP", **_MC_SAMPLING_NO_SHARING),
    "IntensityFree": _model_spec(
        "IntensityFree",
        num_mix_components=3,
        model_specs={"num_mix_components": 3},
        thinning_extra={"num_step_gen": 10},
        **_MC_SAMPLING_NO_SHARING,
    ),
    "AttNHP": _model_spec("AttNHP", num_heads=4, **_MC_SAMPLING_NO_SHARING),
}

print("✓ Model specifications loaded for 7 models")

✓ Model specifications loaded for 7 models


### Training Configuration

In [7]:
# Trainer configuration (shared across all experiments).
# The same dict object is handed to every experiment config.
trainer_config = dict(
    batch_size=256,
    max_epoch=200,
    shuffle=False,
    optimizer="adam",
    learning_rate=1e-3,
    valid_freq=1,
    use_tfb=False,
    metrics=["acc", "rmse"],
    seed=2019,
    gpu=0,
)

# Echo the settings most likely to be tuned between runs
print("✓ Trainer configuration loaded")
print(f"  - Batch size: {trainer_config['batch_size']}")
print(f"  - Max epochs: {trainer_config['max_epoch']}")
print(f"  - Learning rate: {trainer_config['learning_rate']}")

✓ Trainer configuration loaded
  - Batch size: 256
  - Max epochs: 200
  - Learning rate: 0.001


## Helper Functions

In [8]:
def create_experiment_config(model_id, data_id, data_spec, model_spec, trainer_cfg):
    """Assemble the config dictionary for one model/dataset training run.

    Args:
        model_id: Model identifier, stored in the base config.
        data_id: Dataset identifier; used as the key of the ``data`` section.
        data_spec: Dataset settings placed under ``data[data_id]``.
        model_spec: Model settings placed under ``model_config``.
        trainer_cfg: Trainer settings placed under ``trainer_config``.

    Returns:
        ``(config, experiment_id)`` where ``experiment_id`` is
        ``"{model_id}_{data_id}_train"`` and is also the key of the experiment
        section inside ``config``. The three spec arguments are referenced,
        not copied. Checkpoints are rooted at the module-level CHECKPOINT_DIR.
    """
    experiment_id = f"{model_id}_{data_id}_train"

    # Where and how this experiment runs
    base_section = {
        "stage": "train",
        "backend": "torch",
        "dataset_id": data_id,
        "runner_id": "std_tpp",
        "model_id": model_id,
        "base_dir": CHECKPOINT_DIR
    }
    experiment_section = {
        "base_config": base_section,
        "trainer_config": trainer_cfg,
        "model_config": model_spec
    }

    config = {"pipeline_config_id": "runner_config", "data": {data_id: data_spec}}
    config[experiment_id] = experiment_section

    return config, experiment_id

def extract_results_from_logs(log_path):
    """Extract the most recent test-set metrics from a training log.

    The log is scanned for lines containing ``test loglike is``; the metrics of
    the LAST such line that can be parsed are returned. These are the results
    of the final evaluated epoch, which is not necessarily the epoch with the
    best validation score.

    All three metrics are taken from the same line, so the returned values
    never mix epochs. Numbers are matched with a regular expression, so
    trailing text such as ANSI colour reset codes does not break parsing.

    Args:
        log_path: Either a log file, or a directory containing ``.log`` files.
            For a directory, the most recently modified ``.log`` file is read
            (``os.listdir`` order is arbitrary, so "first entry" is not stable).

    Returns:
        ``{'log_likelihood': float, 'accuracy': float | None, 'rmse': float | None}``,
        or ``None`` when the path does not exist, the directory has no ``.log``
        file, no test line could be parsed, or reading fails.
    """
    import re

    # A float as printed by Python: plain/scientific notation, nan or inf
    number = r'([-+]?(?:(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?|nan|inf))'
    ll_pattern = re.compile(r'test loglike is\s*' + number)
    acc_pattern = re.compile(r'acc is\s*' + number)
    rmse_pattern = re.compile(r'rmse is\s*' + number)

    def _value(pattern, text):
        """Return the float captured by `pattern` in `text`, or None."""
        found = pattern.search(text)
        return float(found.group(1)) if found else None

    try:
        # Determine if log_path is a file or directory
        if os.path.isfile(log_path):
            log_file = log_path
        elif os.path.isdir(log_path):
            candidates = [
                os.path.join(log_path, name)
                for name in os.listdir(log_path)
                if name.endswith('.log')
            ]
            candidates = [path for path in candidates if os.path.isfile(path)]
            if not candidates:
                return None
            # Newest log wins; the path breaks mtime ties deterministically
            log_file = max(candidates, key=lambda path: (os.path.getmtime(path), path))
        else:
            print(f"  Warning: Log path does not exist: {log_path}")
            return None

        latest = None
        # Stream the file; undecodable bytes must not abort the whole parse
        with open(log_file, 'r', errors='replace') as f:
            for line in f:
                if 'test loglike is' not in line:
                    continue

                log_likelihood = _value(ll_pattern, line)
                if log_likelihood is None:
                    print(f"  Warning: Could not parse line: {line.strip()}")
                    continue

                # Build the whole record from this one line before replacing
                # the previous one, so metrics always belong to the same epoch
                latest = {
                    'log_likelihood': log_likelihood,
                    'accuracy': _value(acc_pattern, line),
                    'rmse': _value(rmse_pattern, line)
                }

        return latest
    except Exception as e:
        print(f"  Warning: Could not extract results from logs: {e}")
        return None

def find_latest_checkpoint_dir(base_dir, model_id, data_id):
    """Locate the newest run directory belonging to one model/dataset pair.

    A sub-directory of ``base_dir`` belongs to the experiment when it directly
    contains ``{model_id}_{data_id}_train_output.yaml``. Among all matches, the
    one with the latest directory modification time is returned.

    Args:
        base_dir: Directory whose immediate sub-directories are inspected.
        model_id: Model identifier used in the marker file name.
        data_id: Dataset identifier used in the marker file name.

    Returns:
        Path of the matching directory, or ``None`` when nothing matches or
        any error occurs (a warning is printed in the latter case).
    """
    try:
        # Run directories have random names; the output yaml identifies them
        marker_name = f"{model_id}_{data_id}_train_output.yaml"

        entry_paths = (os.path.join(base_dir, entry) for entry in os.listdir(base_dir))
        run_dirs = [
            path
            for path in entry_paths
            if os.path.isdir(path) and os.path.exists(os.path.join(path, marker_name))
        ]

        return max(run_dirs, key=os.path.getmtime) if run_dirs else None
    except Exception as e:
        print(f"  Warning: Error finding checkpoint dir: {e}")
        return None
print("✓ Helper functions loaded")

✓ Helper functions loaded


In [9]:
def run_model_on_all_datasets(
    model_id,
    model_number,
    total_models,
    aggressive_gpu_clearing=False
):
    """
    Run a single TPP model on all configured datasets.

    This function encapsulates the complete workflow for running a model:
    - Creates and saves experiment configs for each dataset
    - Runs training and evaluation
    - Extracts metrics from logs
    - Saves results (both intermediate and model-specific)
    - Handles errors gracefully (a failing dataset is recorded, not raised)
    - Manages GPU memory

    Parameters
    ----------
    model_id : str
        The model identifier (e.g., 'RMTPP', 'NHP', 'AttNHP')
        Must exist as a key in model_spec_dict
    model_number : int
        Current model number (1-7) for progress tracking
    total_models : int
        Total number of models (typically 7) for progress tracking
    aggressive_gpu_clearing : bool, default=False
        If True, performs GPU memory clearing after EACH dataset.
        Use this for memory-intensive models like AttNHP.
        If False, only clears GPU memory after all datasets complete.

    Returns
    -------
    model_results : list of dict
        List containing result dictionaries for each dataset.
        Each dict contains: model, dataset, log_likelihood, accuracy,
        rmse, status, and time_seconds (plus 'error' when status is 'failed').
        status is one of 'success', 'completed_no_metrics', 'no_checkpoint',
        'failed'. time_seconds is the wall-clock duration of that single
        model/dataset experiment.

    Side Effects
    ------------
    - Appends results to the global results_list
    - Saves intermediate results to 'intermediate_results.csv' after each dataset
    - Saves model-specific results to '{model_id}_results.csv'
    - Creates experiment config files in the configs directory
    - Clears GPU memory (timing depends on aggressive_gpu_clearing flag)

    Notes
    -----
    This function relies on several global variables and functions:
    - data_spec_dict: Dictionary of dataset configurations
    - model_spec_dict: Dictionary of model configurations
    - trainer_config: Training configuration
    - results_list: Global list to store all results
    - total_experiments: Total number of experiments
    - BASE_DIR, CHECKPOINT_DIR, RESULTS_DIR: Directory paths
    - create_experiment_config(): Function to create config dicts
    - find_latest_checkpoint_dir(): Function to locate checkpoints
    - extract_results_from_logs(): Function to parse training logs

    Metrics are read from the newest checkpoint directory that matches the
    model/dataset pair, so a run that produces no new directory would pick up
    the logs of an earlier run.

    Example
    -------
    >>> # Run RMTPP on all datasets (standard memory management)
    >>> rmtpp_results = run_model_on_all_datasets('RMTPP', 1, 7)

    >>> # Run AttNHP with aggressive memory clearing
    >>> attnhp_results = run_model_on_all_datasets('AttNHP', 7, 7,
    ...                                             aggressive_gpu_clearing=True)
    """
    import time
    import os
    import yaml
    import pandas as pd
    import torch
    import gc
    from easy_tpp.config_factory import Config
    from easy_tpp.runner import Runner

    def _make_result(data_id, status, started_at, metrics=None, error=None):
        """Build one result row; metric fields are None when unavailable."""
        metrics = metrics or {}
        row = {
            'model': model_id,
            'dataset': data_id,
            'log_likelihood': metrics.get('log_likelihood'),
            'accuracy': metrics.get('accuracy'),
            'rmse': metrics.get('rmse'),
            'status': status,
        }
        if error is not None:
            row['error'] = error
        row['time_seconds'] = time.time() - started_at
        return row

    print("\n" + "=" * 70)
    print(f"[MODEL {model_number}/{total_models}] Running {model_id} on all datasets")
    print("=" * 70)

    model_start_time = time.time()
    model_results = []

    for data_id, data_spec in data_spec_dict.items():
        experiment_name = f"{model_id}_{data_id}"
        experiment_num = len(results_list) + 1
        # Timed per experiment so time_seconds does not include earlier datasets
        experiment_start_time = time.time()
        # Reset each iteration: a failure before these are built must not leave
        # them unbound or pointing at the previous dataset's objects
        runner = None
        config = None

        print(f"\n[{experiment_num}/{total_experiments}] Running {model_id} on {data_id} dataset")
        print("=" * 70)

        try:
            # Create config
            config_dict, experiment_id = create_experiment_config(
                model_id=model_id,
                data_id=data_id,
                data_spec=data_spec,
                model_spec=model_spec_dict[model_id],
                trainer_cfg=trainer_config
            )

            # Save config (kept inside the try so an unavailable Drive is
            # recorded as a failed experiment instead of aborting the model)
            CONFIG_DIR = os.path.join(BASE_DIR, 'configs')
            os.makedirs(CONFIG_DIR, exist_ok=True)
            config_path = os.path.join(CONFIG_DIR, f"{experiment_name}.yaml")
            with open(config_path, 'w') as f:
                yaml.dump(config_dict, f)

            # Build and run
            config = Config.build_from_yaml_file(config_path, experiment_id=experiment_id)
            runner = Runner.build_from_config(config)
            runner.run()

            # Find checkpoint directory
            checkpoint_dir = find_latest_checkpoint_dir(CHECKPOINT_DIR, model_id, data_id)

            # Extract results from logs
            if checkpoint_dir:
                log_dir = os.path.join(checkpoint_dir, 'log')
                metrics = extract_results_from_logs(log_dir)

                if metrics:
                    result = _make_result(data_id, 'success', experiment_start_time, metrics=metrics)
                    print(f"✓ {model_id} on {data_id} completed successfully!")
                    print(f"  - Log-Likelihood: {metrics['log_likelihood']:.4f}")
                    if metrics['accuracy'] and metrics['accuracy'] > 0:
                        print(f"  - Accuracy: {metrics['accuracy']:.4f}")
                    if metrics['rmse'] and metrics['rmse'] > 0:
                        print(f"  - RMSE: {metrics['rmse']:.4f}")
                else:
                    result = _make_result(data_id, 'completed_no_metrics', experiment_start_time)
                    print(f"⚠ {model_id} on {data_id} completed but no metrics found")
            else:
                result = _make_result(data_id, 'no_checkpoint', experiment_start_time)
                print(f"⚠ {model_id} on {data_id} - no checkpoint directory found")

        except Exception as e:
            result = _make_result(data_id, 'failed', experiment_start_time, error=str(e))
            print(f"✗ Error running {model_id} on {data_id}: {str(e)}")

        results_list.append(result)
        model_results.append(result)

        # Save intermediate results after each experiment
        intermediate_df = pd.DataFrame(results_list)
        intermediate_df.to_csv(os.path.join(RESULTS_DIR, 'intermediate_results.csv'), index=False)

        # Aggressive GPU memory clearing (for memory-intensive models like AttNHP)
        if aggressive_gpu_clearing:
            # Drop the references first so the collector can reclaim the model
            runner = None
            config = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

    # Save model-specific results
    model_df = pd.DataFrame(model_results)
    model_csv_path = os.path.join(RESULTS_DIR, f'{model_id}_results.csv')
    model_df.to_csv(model_csv_path, index=False)

    # Model summary ("Failed" counts only runs that raised an exception)
    model_elapsed = time.time() - model_start_time
    successful = sum(1 for r in model_results if r['status'] == 'success')
    failed = sum(1 for r in model_results if r['status'] == 'failed')

    print(f"\n" + "=" * 70)
    print(f"✓ {model_id} completed on all datasets")
    print(f"  - Time: {model_elapsed/60:.1f} minutes")
    print(f"  - Successful: {successful}/{len(data_spec_dict)}")
    print(f"  - Failed: {failed}/{len(data_spec_dict)}")
    print(f"  - Results saved to: {model_csv_path}")
    print("=" * 70)

    # Clear GPU memory for next model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"✓ GPU memory cleared for next model")

    return model_results

## Run Models On Data

In [10]:
# Initialize results storage.
# NOTE: re-running this cell rebinds results_list to a new empty list, so rows
# collected earlier in the session are no longer in results_list.
results_list = []
# One experiment per (model, dataset) pair; used for the "[k/N]" progress label
total_experiments = len(model_spec_dict) * len(data_spec_dict)

print(f"Starting {total_experiments} experiments...")
print(f"Running {len(model_spec_dict)} models on {len(data_spec_dict)} datasets")
print(f"Results will be saved incrementally to: {RESULTS_DIR}")
print("=" * 70)

Starting 35 experiments...
Running 7 models on 5 datasets
Results will be saved incrementally to: /content/drive/MyDrive/Colab Notebooks/MilestoneFall2025/results/Task1


In [11]:
# Execution order of the models; each name must be a key of model_spec_dict
model_ids = ['RMTPP', 'NHP', 'FullyNN', 'SAHP', 'THP', 'IntensityFree', 'AttNHP']
# model_id -> list of per-dataset result dicts, filled as each model finishes
all_model_results = {}

for i, model_id in enumerate(model_ids, start=1):
    # Only AttNHP gets GPU memory cleared after every dataset
    aggressive = (model_id == 'AttNHP')
    all_model_results[model_id] = run_model_on_all_datasets(
        model_id, i, len(model_ids), aggressive
    )

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[38;20m2025-12-27 20:45:03,866 - tpp_runner.py[pid:5428;line:96:_train_model] - INFO: [ Epoch 191 (train) ]: train loglike is 0.45573251441398116, num_events is 281923[0m
[38;20m2025-12-27 20:45:03,917 - tpp_runner.py[pid:5428;line:107:_train_model] - INFO: [ Epoch 191 (valid) ]:  valid loglike is 0.4618953040096355, num_events is 40073, acc is 0.34197589399346195, rmse is 0.4643988676383742[0m
[38;20m2025-12-27 20:45:04,022 - tpp_runner.py[pid:5428;line:122:_train_model] - INFO: [ Epoch 191 (test) ]: test loglike is 0.4625595009051235, num_events is 82197, acc is 0.34962346557660257, rmse is 0.4632546300784409[0m
[31;1m2025-12-27 20:45:04,025 - tpp_runner.py[pid:5428;line:124:_train_model] - CRITICAL: current best loglike on valid set is 0.4619 (updated at epoch-191), best updated at this epoch[0m
[38;20m2025-12-27 20:45:04,365 - tpp_runner.py[pid:5428;line:96:_train_model] - INFO: [ Epoch 192 (train) ]: train l

## Save Model Results

### Function Definition

In [12]:
"""
Result Formatting and Saving Functions for EasyTPP-style Tables

This module provides functions to format and save model evaluation results
in the style of Table 1 from the EasyTPP paper, with mean/std values for
RMSE and Type Error Rate across multiple datasets.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Union, Optional


def format_results_table(
    results_df: pd.DataFrame,
    models: Optional[List[str]] = None,
    datasets: Optional[List[str]] = None,
    rmse_col: str = 'rmse',
    accuracy_col: str = 'accuracy',
    type_error_from_accuracy: bool = True
) -> pd.DataFrame:
    """
    Format results into EasyTPP paper style with mean/std for RMSE and Type Error Rate.

    The input frame is never modified: all work happens on a private copy.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing columns: 'model', 'dataset', 'rmse', 'accuracy'
        Can have multiple runs per model-dataset combination for computing std.
        Metric values that are missing or non-numeric (e.g. None from a failed
        run) are treated as NaN.
    models : List[str], optional
        List of model names to include. If None, uses all models in results_df
    datasets : List[str], optional
        List of dataset names to include. If None, uses all datasets in results_df
    rmse_col : str, default='rmse'
        Column name for RMSE values
    accuracy_col : str, default='accuracy'
        Column name for accuracy values
    type_error_from_accuracy : bool, default=True
        If True, compute Type Error Rate as (1 - accuracy) * 100
        If False, assumes results_df has a 'type_error_rate' column

    Returns
    -------
    pd.DataFrame
        Formatted table with:
        - Rows: models (sorted)
        - Columns: datasets (sorted)
        - Values: "mean_rmse/mean_type_error% ± std_rmse/std_type_error"
          (std is shown as 0 for a single run), or 'N/A' where a
          model-dataset combination has no rows or no usable metrics.
        An empty input (after filtering) yields an empty table.

    Examples
    --------
    >>> # Single run per model-dataset
    >>> df = pd.DataFrame({
    ...     'model': ['RMTPP', 'RMTPP', 'NHP', 'NHP'],
    ...     'dataset': ['amazon', 'taxi', 'amazon', 'taxi'],
    ...     'rmse': [0.620, 22.31, 0.621, 21.90],
    ...     'accuracy': [0.319, 0.559, 0.329, 0.600]
    ... })
    >>> table = format_results_table(df)

    >>> # Multiple runs per model-dataset (for computing std)
    >>> df_multi = pd.DataFrame({
    ...     'model': ['RMTPP', 'RMTPP', 'RMTPP'],
    ...     'dataset': ['amazon', 'amazon', 'amazon'],
    ...     'rmse': [0.620, 0.619, 0.621],
    ...     'accuracy': [0.319, 0.320, 0.318]
    ... })
    >>> table = format_results_table(df_multi)
    """
    # Filter models and datasets if specified
    if models is not None:
        results_df = results_df[results_df['model'].isin(models)]
    if datasets is not None:
        results_df = results_df[results_df['dataset'].isin(datasets)]
    # Always copy: the columns written below must never reach the caller's frame
    results_df = results_df.copy()

    if results_df.empty:
        return pd.DataFrame(
            index=pd.Index([], name='model'),
            columns=pd.Index([], name='dataset'),
        )

    # Failed runs store None for their metrics, which can leave object-dtype
    # columns; coerce so arithmetic and aggregation see NaN instead
    results_df[rmse_col] = pd.to_numeric(results_df[rmse_col], errors='coerce')

    # Compute type error rate if needed
    if type_error_from_accuracy:
        accuracy = pd.to_numeric(results_df[accuracy_col], errors='coerce')
        results_df['type_error_rate'] = (1 - accuracy) * 100

    # Get unique models and datasets (sorted)
    all_models = sorted(results_df['model'].dropna().unique())
    all_datasets = sorted(results_df['dataset'].dropna().unique())

    # Compute mean and std for each model-dataset combination
    grouped = (
        results_df.groupby(['model', 'dataset'])
        .agg(
            rmse_mean=(rmse_col, 'mean'),
            rmse_std=(rmse_col, 'std'),
            type_error_mean=('type_error_rate', 'mean'),
            type_error_std=('type_error_rate', 'std'),
        )
        .reset_index()
    )

    # Fill NaN std with 0 (for single runs)
    grouped['rmse_std'] = grouped['rmse_std'].fillna(0)
    grouped['type_error_std'] = grouped['type_error_std'].fillna(0)

    # Create formatted strings for each cell
    def format_cell(row):
        """Format a single cell as: mean_rmse/mean_type% ± std_rmse/std_type"""
        rmse_mean = row['rmse_mean']
        type_mean = row['type_error_mean']

        # Nothing measured for this combination (e.g. every run failed)
        if pd.isna(rmse_mean) and pd.isna(type_mean):
            return 'N/A'

        mean_str = f"{rmse_mean:.3f}/{type_mean:.1f}%"
        std_str = f"{row['rmse_std']:.3f}/{row['type_error_std']:.3f}"

        return f"{mean_str} ± {std_str}"

    # A plain list also works when `grouped` has no rows (row-wise apply does not)
    grouped['formatted'] = [format_cell(row) for row in grouped.to_dict('records')]

    # Pivot to create the final table
    formatted_table = grouped.pivot(index='model', columns='dataset', values='formatted')

    # Ensure all models and datasets are present. reindex's fill_value only
    # applies to labels it adds, so combinations the pivot left as NaN need an
    # explicit fillna to become 'N/A'.
    formatted_table = formatted_table.reindex(index=all_models, columns=all_datasets)
    formatted_table = formatted_table.fillna('N/A')

    return formatted_table


def save_results_easytpp_format(
    results_df: pd.DataFrame,
    output_path: str,
    models: Optional[List[str]] = None,
    datasets: Optional[List[str]] = None,
    save_csv: bool = True,
    save_latex: bool = True,
    save_markdown: bool = True,
    **format_kwargs
) -> pd.DataFrame:
    """
    Save results in EasyTPP paper format to multiple file formats.

    The table is built by format_results_table() and then written next to
    ``output_path`` once per enabled format. The parent directory of
    ``output_path`` is created if it does not exist.

    Parameters
    ----------
    results_df : pd.DataFrame
        DataFrame containing columns: 'model', 'dataset', 'rmse', 'accuracy'
    output_path : str
        Base path for output files (without extension)
        E.g., 'results/task1_results' will create:
        - results/task1_results.csv
        - results/task1_results.tex
        - results/task1_results.md
    models : List[str], optional
        List of model names to include
    datasets : List[str], optional
        List of dataset names to include
    save_csv : bool, default=True
        Whether to save as CSV
    save_latex : bool, default=True
        Whether to save as LaTeX table
    save_markdown : bool, default=True
        Whether to save as Markdown table
    **format_kwargs
        Additional keyword arguments passed to format_results_table()

    Returns
    -------
    pd.DataFrame
        The formatted results table

    Notes
    -----
    - The LaTeX export escapes special characters. Formatted cells contain a
      literal ``%`` (e.g. ``0.620/68.1% ± ...``); left unescaped, LaTeX would
      treat it as the start of a comment and drop the rest of the row.
    - The .tex and .md files are written as UTF-8 because the cells and the
      caption contain the non-ASCII character ``±``.
    - DataFrame.to_markdown() relies on the optional ``tabulate`` package.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     'model': ['RMTPP', 'NHP'],
    ...     'dataset': ['amazon', 'taxi'],
    ...     'rmse': [0.620, 21.90],
    ...     'accuracy': [0.319, 0.600]
    ... })
    >>> table = save_results_easytpp_format(df, 'results/task1')
    """
    import os

    # Format the table
    formatted_table = format_results_table(
        results_df, models=models, datasets=datasets, **format_kwargs
    )

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save as CSV
    if save_csv:
        csv_path = f"{output_path}.csv"
        formatted_table.to_csv(csv_path)
        print(f"✓ Results saved to: {csv_path}")

    # Save as LaTeX
    if save_latex:
        latex_path = f"{output_path}.tex"
        latex_str = formatted_table.to_latex(
            # Must escape: every cell carries a '%' which is LaTeX's comment
            # character, and dataset/model names may contain '_' or '&'.
            escape=True,
            column_format='l' + 'c' * len(formatted_table.columns),
            caption="Model Performance: Time RMSE / Type Error Rate (Mean ± Std)",
            label="tab:results"
        )
        # Explicit UTF-8 so '±' is written correctly regardless of locale.
        with open(latex_path, 'w', encoding='utf-8') as f:
            f.write(latex_str)
        print(f"✓ LaTeX table saved to: {latex_path}")

    # Save as Markdown
    if save_markdown:
        md_path = f"{output_path}.md"
        md_str = formatted_table.to_markdown()
        # Explicit UTF-8 for the same reason as the LaTeX file.
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md_str)
        print(f"✓ Markdown table saved to: {md_path}")

    return formatted_table

def create_comparison_table(
    stationary_results_df: pd.DataFrame,
    nonstationary_results_df: pd.DataFrame,
    output_path: str,
    models: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Create a side-by-side comparison table for stationary vs non-stationary results.

    Both inputs are averaged per model (over every row of that model, i.e.
    across datasets and repeated runs) and joined on the model name. The
    result is also written to ``{output_path}_degradation.csv``; the parent
    directory is created if needed. Neither input frame is modified.

    Parameters
    ----------
    stationary_results_df : pd.DataFrame
        Results from stationary Hawkes data. Must contain the columns
        'model', 'rmse' and 'accuracy'.
    nonstationary_results_df : pd.DataFrame
        Results from non-stationary Hawkes data. Same required columns.
    output_path : str
        Base path for output files
    models : List[str], optional
        List of models to include. When given, rows of other models are
        dropped from both inputs before aggregation.

    Returns
    -------
    pd.DataFrame
        Comparison table with degradation analysis, indexed by model, with
        columns:
        - 'stat_rmse', 'nonstat_rmse': mean RMSE
        - 'stat_type_error', 'nonstat_type_error': (1 - mean accuracy) * 100
        - 'rmse_degradation_%': relative RMSE change in percent of the
          stationary RMSE
        - 'type_error_degradation_%': absolute difference of the two type
          error rates (percentage points, not a relative change)

        The join is a left join on the stationary results: a model missing
        from the non-stationary results gets NaN, a model present only in
        the non-stationary results is omitted.

    Examples
    --------
    >>> stat_df = pd.DataFrame({...})  # stationary results
    >>> nonstat_df = pd.DataFrame({...})  # non-stationary results
    >>> comparison = create_comparison_table(stat_df, nonstat_df, 'results/comparison')
    """
    import os

    def _summarize(results_df: pd.DataFrame, prefix: str) -> pd.DataFrame:
        """Per-model mean RMSE and type error rate (%), columns prefixed."""
        if models is not None:
            # Boolean indexing returns a new frame; the caller's data is untouched.
            results_df = results_df[results_df['model'].isin(models)]
        summary = results_df.groupby('model').agg({
            'rmse': 'mean',
            'accuracy': lambda x: (1 - x.mean()) * 100
        })
        return summary.rename(columns={
            'rmse': f'{prefix}_rmse',
            'accuracy': f'{prefix}_type_error'
        })

    # Compute degradation statistics
    stat_grouped = _summarize(stationary_results_df, 'stat')
    nonstat_grouped = _summarize(nonstationary_results_df, 'nonstat')

    degradation = stat_grouped.join(nonstat_grouped)
    degradation['rmse_degradation_%'] = (
        (degradation['nonstat_rmse'] - degradation['stat_rmse']) /
        degradation['stat_rmse'] * 100
    )
    degradation['type_error_degradation_%'] = (
        degradation['nonstat_type_error'] - degradation['stat_type_error']
    )

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save comparison
    degradation_path = f"{output_path}_degradation.csv"
    degradation.to_csv(degradation_path)
    print(f"✓ Degradation analysis saved to: {degradation_path}")

    return degradation

### Function Execution

In [13]:
# Model names selected for the results table (passed as `models=` below).
model_ids = [
    'RMTPP',
    'NHP',
    'SAHP',
    'THP',
    'AttNHP',
    'FullyNN',
    'IntensityFree',
]

# Dataset names selected for the results table (passed as `datasets=` below).
data_ids = [
    'amazon',
    'retweet',
    'taxi',
    'taobao',
    'stackoverflow',
]

In [14]:
import pandas as pd

def flatten_model_results(all_model_results):
    """
    Build one DataFrame with a row per result record.

    `all_model_results` maps a key to an iterable of result records. Passing
    that mapping straight to pd.DataFrame would turn the keys into columns,
    so the per-key iterables are concatenated (in mapping order) first and
    the combined list of records is handed to pd.DataFrame.
    """
    records = [
        record
        for per_model_records in all_model_results.values()
        for record in per_model_records
    ]
    return pd.DataFrame(records)

# Flatten the per-model result lists into one frame and keep only the rows
# whose status is 'success'.
results_df = flatten_model_results(all_model_results).loc[
    lambda frame: frame['status'] == 'success'
]

# Quick structural sanity check of the combined frame
print(
    "✓ DataFrame created successfully!",
    f"  Shape: {results_df.shape}",
    f"  Columns: {list(results_df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
print(results_df.head())

# Build the model x dataset table restricted to the selected models/datasets
formatted_table = format_results_table(
    results_df, models=model_ids, datasets=data_ids
)

print("\n✓ Formatted table:")
print(formatted_table)

# Persist the same selection under RESULTS_DIR
saved_table = save_results_easytpp_format(
    results_df,
    output_path=f'{RESULTS_DIR}/Task1_results',
    models=model_ids,
    datasets=data_ids,
)

✓ DataFrame created successfully!
  Shape: (30, 8)
  Columns: ['model', 'dataset', 'log_likelihood', 'accuracy', 'rmse', 'status', 'time_seconds', 'error']

First few rows:
   model        dataset  log_likelihood  accuracy       rmse   status  \
0  RMTPP           taxi        0.306759  0.911720   0.395955  success   
1  RMTPP         amazon       -2.191249  0.329525   0.460787  success   
2  RMTPP         taobao       -0.571186  0.435700   0.266665  success   
3  RMTPP  stackoverflow       -2.756299  0.425011   1.417137  success   
4  RMTPP        retweet       -4.226499  0.553392  25.647717  success   

   time_seconds error  
0     93.595191   NaN  
1    205.396454   NaN  
2    232.595419   NaN  
3    271.393489   NaN  
4    399.904712   NaN  

✓ Formatted table:
dataset                              amazon                     retweet  \
model                                                                     
AttNHP         1126.644/80.6% ± 0.000/0.000  20.787/41.0% ± 0.000/0.000   