In [1]:
# %% [markdown]
# # Transformer Hawkes Process Example
# 
# This notebook demonstrates a simplified version of the Transformer Hawkes Process. We:
# 
# - Compute a temporal encoding using cosine and sine functions.
# - Embed discrete event types (which are one-hot encoded) into a continuous space.
# - Combine these representations.
# - Use PyTorch’s multi-head self-attention to model dependencies.
# - Pass the output through a feed-forward network.
# - Compute a simple conditional intensity (using a linear layer and a softplus activation).
# 
# This basic framework can be extended (for example, by replacing the fixed time-dependent term with a nonparametric smoother or incorporating additional covariates via tree regression/BART) to model the excitation function more flexibly.

# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np

# %%
# Temporal Encoding using sine and cosine functions.
# In this implementation, we follow the standard Transformer positional encoding.
class TemporalEncoding(nn.Module):
    def __init__(self, M):
        """
        M: the dimension of the encoding.
        """
        super(TemporalEncoding, self).__init__()
        self.M = M

    def forward(self, t):
        """
        t: a tensor of shape (L,) representing timestamps.
        Returns a tensor of shape (L, M) where each row is the temporal encoding.
        """
        L = t.size(0)
        # Create a tensor of positions (we treat t as positions)
        # Following the standard positional encoding, we compute:
        # PE(pos, 2i)   = cos(pos / (10000^(2i/M)))
        # PE(pos, 2i+1) = sin(pos / (10000^(2i/M)))
        pe = torch.zeros(L, self.M)
        position = t.unsqueeze(1)  # shape (L, 1)
        div_term = torch.exp(torch.arange(0, self.M, 2, dtype=torch.float32) * 
                             (-math.log(10000.0) / self.M))
        pe[:, 0::2] = torch.cos(position * div_term)
        pe[:, 1::2] = torch.sin(position * div_term)
        return pe

# %%
# Event Type Embedding
# One-hot encoding is used to represent each discrete event type as a vector with a 1 in the coordinate corresponding to the event type.
# This vector is then mapped to a continuous space via an embedding layer.
class EventEmbedding(nn.Module):
    def __init__(self, num_event_types, M):
        """
        num_event_types: number of discrete event types.
        M: embedding dimension.
        """
        super(EventEmbedding, self).__init__()
        self.embedding = nn.Embedding(num_event_types, M)
    
    def forward(self, event_types):
        """
        event_types: tensor of shape (L,) containing integer indices for event types.
        Returns a tensor of shape (L, M) with the continuous embeddings.
        """
        return self.embedding(event_types)

# %%
# The simplified Transformer Hawkes Process module.
# This module combines the event embedding and temporal encoding, applies multi-head self-attention,
# then a feed-forward network, and finally computes an intensity using a linear layer plus softplus.
class TransformerHawkesProcess(nn.Module):
    def __init__(self, num_event_types, M, num_heads, d_ff):
        """
        num_event_types: number of discrete event types.
        M: embedding dimension.
        num_heads: number of attention heads.
        d_ff: dimension of the feed-forward layer.
        """
        super(TransformerHawkesProcess, self).__init__()
        self.M = M
        self.num_event_types = num_event_types
        self.event_embed = EventEmbedding(num_event_types, M)
        self.temporal_encoding = TemporalEncoding(M)
        # Multi-head attention (we use batch_first=True for convenience)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=M, num_heads=num_heads, batch_first=True)
        # A simple position-wise feed-forward network
        self.ffn = nn.Sequential(
            nn.Linear(M, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, M)
        )
        # Intensity computation: map the hidden representation to num_event_types outputs.
        self.intensity_linear = nn.Linear(M, num_event_types)
        self.softplus = nn.Softplus()

    def forward(self, event_types, times, attn_mask=None):
        """
        event_types: tensor of shape (B, L) of integer event types.
        times: tensor of shape (B, L) of event timestamps.
        attn_mask: optional mask for attention (e.g., to prevent attending to future events).
        Returns:
          intensity: tensor of shape (B, L, num_event_types), the computed intensities.
          H: tensor of shape (B, L, M), the hidden representations.
          attn_weights: attention weights from the multi-head attention.
        """
        B, L = event_types.shape
        
        # Get the event type embeddings: shape (B, L, M)
        emb_events = self.event_embed(event_types)
        
        # Get temporal encodings: we process each sequence in the batch separately.
        pe = []
        for b in range(B):
            pe.append(self.temporal_encoding(times[b]))
        pe = torch.stack(pe, dim=0)  # shape (B, L, M)
        
        # Combined representation: sum of event embedding and temporal encoding.
        X = emb_events + pe  # shape (B, L, M)
        
        # Apply multi-head self-attention.
        # PyTorch's MultiheadAttention expects queries, keys, values of shape (B, L, M) when batch_first=True.
        attn_output, attn_weights = self.multihead_attn(X, X, X, key_padding_mask=attn_mask)
        # Add residual connection
        X2 = X + attn_output
        
        # Apply the feed-forward network with a residual connection.
        ffn_output = self.ffn(X2)
        H = X2 + ffn_output  # Final hidden representation
        
        # Compute the intensity.
        # Here, we apply a linear transformation to each hidden state and then use softplus to ensure positivity.
        intensity_raw = self.intensity_linear(H)  # shape (B, L, num_event_types)
        intensity = self.softplus(intensity_raw)
        
        return intensity, H, attn_weights

# %%
# Example usage in a Jupyter Notebook cell.
if __name__ == '__main__':
    # Set random seed for reproducibility
    torch.manual_seed(42)
    
    # Parameters
    num_event_types = 4  # e.g., event types: 0, 1, 2, 3
    M = 32               # embedding/encoding dimension
    num_heads = 4
    d_ff = 64            # feed-forward network dimension
    batch_size = 2
    seq_length = 10

    # Generate random event sequences.
    # event_types: tensor of shape (B, L) with values in {0, 1, 2, 3}.
    event_types = torch.randint(0, num_event_types, (batch_size, seq_length))
    
    # Generate increasing timestamps for each sequence.
    times = torch.zeros(batch_size, seq_length)
    for b in range(batch_size):
        # Generate random timestamps and sort them (simulate increasing event times)
        times[b] = torch.sort(torch.rand(seq_length) * 100)[0]

    # Initialize the Transformer Hawkes Process model.
    model = TransformerHawkesProcess(num_event_types, M, num_heads, d_ff)
    
    # Forward pass.
    intensity, H, attn_weights = model(event_types, times)
    
    print("Event Types:\n", event_types)
    print("\nTimestamps:\n", times)
    print("\nIntensity (sample output):\n", intensity)
    print("\nHidden Representations (H):\n", H)
    print("\nAttention Weights:\n", attn_weights)


Event Types:
 tensor([[2, 3, 0, 2, 2, 3, 0, 0, 2, 1],
        [2, 2, 2, 2, 3, 0, 3, 3, 3, 2]])

Timestamps:
 tensor([[ 0.6160, 10.5315, 19.9364, 26.9495, 26.9632, 29.6921, 35.8813, 44.1364,
         54.7192, 83.1685],
        [ 7.5266, 33.7648, 34.2313, 55.4660, 57.7925, 58.3210, 80.8975, 88.6014,
         90.3982, 95.1555]])

Intensity (sample output):
 tensor([[[0.8378, 0.4360, 1.2342, 0.3594],
         [0.5869, 0.6222, 0.9905, 0.3738],
         [0.2763, 0.2164, 0.7110, 0.3050],
         [0.9372, 0.4644, 0.9110, 0.1692],
         [0.9372, 0.4645, 0.9102, 0.1686],
         [0.6751, 0.5743, 0.6762, 0.2003],
         [0.4894, 0.1575, 0.4467, 0.3028],
         [0.3457, 0.1447, 0.6296, 0.2348],
         [0.8346, 0.5755, 1.0201, 0.1025],
         [0.6928, 0.2060, 0.4869, 0.3033]],

        [[0.7931, 0.5812, 1.5989, 0.2888],
         [1.0565, 0.4085, 0.8247, 0.1955],
         [1.0835, 0.4010, 0.7911, 0.1923],
         [0.8851, 0.5674, 1.0196, 0.1317],
         [0.7669, 0.6580, 0.3445, 0.222

In [2]:
# %% [markdown]
# ## Model 2: Smoother Transformer Hawkes Process
# 
# This variant extends Model 1 by incorporating a nonparametric smoother on the elapsed time between events.  
# 
# For each event (except the first), we compute:
# 
# \[
# \Delta t_j = t_j - t_{j-1}.
# \]
# 
# Then we pass \(\Delta t_j\) through a small MLP (our nonparametric smoother) to obtain a scalar \(s_j\). This scalar is added to the linear intensity before applying softplus:
# 
# \[
# \lambda_j = \text{softplus}\Bigl(\text{Linear}(h(t_j)) + s_j\Bigr).
# \]

# %%
class SmootherTransformerHawkesProcess(nn.Module):
    def __init__(self, num_event_types, M, num_heads, d_ff):
        super(SmootherTransformerHawkesProcess, self).__init__()
        self.M = M
        self.event_embed = EventEmbedding(num_event_types, M)
        self.temporal_encoding = TemporalEncoding(M)
        self.multihead_attn = nn.MultiheadAttention(embed_dim=M, num_heads=num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(M, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, M)
        )
        # Linear intensity computation as in Model 1.
        self.intensity_linear = nn.Linear(M, num_event_types)
        # Nonparametric smoother: a small MLP that takes a scalar (elapsed time) and outputs a scalar.
        self.smoother = nn.Sequential(
            nn.Linear(1, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
        self.softplus = nn.Softplus()

    def forward(self, event_types, times, attn_mask=None):
        """
        event_types: tensor of shape (B, L)
        times: tensor of shape (B, L)
        Returns:
          intensity: (B, L, num_event_types)
          H: (B, L, M)
          attn_weights: from attention.
        """
        B, L = event_types.shape
        emb_events = self.event_embed(event_types)  # (B, L, M)
        pe = torch.stack([self.temporal_encoding(times[b]) for b in range(B)], dim=0)  # (B, L, M)
        X = emb_events + pe  # (B, L, M)
        
        # Self-attention.
        attn_output, attn_weights = self.multihead_attn(X, X, X, key_padding_mask=attn_mask)
        X2 = X + attn_output
        ffn_output = self.ffn(X2)
        H = X2 + ffn_output  # (B, L, M)
        
        # Compute the linear part of the intensity.
        intensity_raw = self.intensity_linear(H)  # (B, L, num_event_types)
        
        # Compute elapsed time for each event (set the first elapsed time to zero).
        # times: (B, L)
        elapsed = torch.zeros_like(times)
        elapsed[:, 1:] = times[:, 1:] - times[:, :-1]
        # Reshape elapsed to (B, L, 1) and pass through the smoother.
        elapsed_unsq = elapsed.unsqueeze(-1)
        s = self.smoother(elapsed_unsq)  # (B, L, 1)
        
        # Add the smoother output to the intensity_raw (broadcasting the scalar to each event type).
        intensity_enhanced = intensity_raw + s  # (B, L, num_event_types)
        intensity = self.softplus(intensity_enhanced)
        
        return intensity, H, attn_weights

In [3]:
# %% [markdown]
# ## Model 3: Set Aggregation Transformer Hawkes Process
# 
# In this variant, we:
# - **Concatenate** the event embedding and temporal encoding (instead of summing them) and then project the result.
# - Compute a global context vector via mean pooling over the sequence.
# - Combine the global context with the local hidden representation before computing the intensity.
# 
# This approach is inspired by set aggregation methods.

# %%
class SetAggregationTransformerHawkesProcess(nn.Module):
    def __init__(self, num_event_types, M, num_heads, d_ff):
        super(SetAggregationTransformerHawkesProcess, self).__init__()
        self.M = M
        self.event_embed = EventEmbedding(num_event_types, M)
        self.temporal_encoding = TemporalEncoding(M)
        # Instead of summing, we will concatenate embeddings of size M and temporal encoding of size M, then project to M.
        self.proj = nn.Linear(2 * M, M)
        
        self.multihead_attn = nn.MultiheadAttention(embed_dim=M, num_heads=num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(M, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, M)
        )
        # Global context: a linear layer to map the mean-pooled vector to M.
        self.global_context = nn.Linear(M, M)
        
        self.intensity_linear = nn.Linear(M, num_event_types)
        self.softplus = nn.Softplus()
        
    def forward(self, event_types, times, attn_mask=None):
        """
        event_types: (B, L)
        times: (B, L)
        Returns:
          intensity: (B, L, num_event_types)
          H: (B, L, M)
          attn_weights: from attention.
        """
        B, L = event_types.shape
        emb_events = self.event_embed(event_types)  # (B, L, M)
        pe = torch.stack([self.temporal_encoding(times[b]) for b in range(B)], dim=0)  # (B, L, M)
        # Concatenate along the last dimension: shape becomes (B, L, 2*M)
        X_cat = torch.cat([emb_events, pe], dim=-1)
        # Project back to dimension M.
        X = self.proj(X_cat)  # (B, L, M)
        
        # Self-attention.
        attn_output, attn_weights = self.multihead_attn(X, X, X, key_padding_mask=attn_mask)
        X2 = X + attn_output
        ffn_output = self.ffn(X2)
        H = X2 + ffn_output  # (B, L, M)
        
        # Compute a global context vector by mean pooling over the sequence.
        global_vec = H.mean(dim=1)  # (B, M)
        global_context = self.global_context(global_vec)  # (B, M)
        # Expand the global context to each time step.
        global_context_expanded = global_context.unsqueeze(1).expand(-1, L, -1)
        
        # Combine local and global information.
        H_combined = H + global_context_expanded
        
        # Compute intensity.
        intensity_raw = self.intensity_linear(H_combined)  # (B, L, num_event_types)
        intensity = self.softplus(intensity_raw)
        
        return intensity, H, attn_weights

In [4]:
# %% [markdown]
# ## Testing the Three Models
# 
# We now generate sample data and run a forward pass through each of the three models.

# %%
def generate_sample_data(batch_size, seq_length, num_event_types):
    # Generate random event types in {0, ..., num_event_types-1}
    event_types = torch.randint(0, num_event_types, (batch_size, seq_length))
    # Generate increasing timestamps for each sequence.
    times = torch.zeros(batch_size, seq_length)
    for b in range(batch_size):
        times[b] = torch.sort(torch.rand(seq_length) * 100)[0]
    return event_types, times

# %%
# Parameters
num_event_types = 4   # For example, event types: 0, 1, 2, 3
M = 32                # Embedding/encoding dimension
num_heads = 4
d_ff = 64             # Feed-forward network dimension
batch_size = 2
seq_length = 10

# Generate sample data.
event_types, times = generate_sample_data(batch_size, seq_length, num_event_types)

# %%
# Model 1: Basic Transformer Hawkes Process
model1 = BasicTransformerHawkesProcess(num_event_types, M, num_heads, d_ff)
intensity1, H1, attn_weights1 = model1(event_types, times)
print("=== Model 1: Basic Transformer Hawkes ===")
print("Event Types:\n", event_types)
print("Timestamps:\n", times)
print("Intensity (Model 1):\n", intensity1)
print("Hidden Representations (H1):\n", H1)
print("Attention Weights (Model 1):\n", attn_weights1)

# %%
# Model 2: Smoother Transformer Hawkes Process
model2 = SmootherTransformerHawkesProcess(num_event_types, M, num_heads, d_ff)
intensity2, H2, attn_weights2 = model2(event_types, times)
print("\n=== Model 2: Smoother Transformer Hawkes ===")
print("Intensity (Model 2):\n", intensity2)
print("Hidden Representations (H2):\n", H2)
print("Attention Weights (Model 2):\n", attn_weights2)

# %%
# Model 3: Set Aggregation Transformer Hawkes Process
model3 = SetAggregationTransformerHawkesProcess(num_event_types, M, num_heads, d_ff)
intensity3, H3, attn_weights3 = model3(event_types, times)
print("\n=== Model 3: Set Aggregation Transformer Hawkes ===")
print("Intensity (Model 3):\n", intensity3)
print("Hidden Representations (H3):\n", H3)
print("Attention Weights (Model 3):\n", attn_weights3)


NameError: name 'BasicTransformerHawkesProcess' is not defined