# Part 1: Preprocessing

In [1]:
# Import necessary libraries
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter


In [3]:
# Function to load and parse data from all files
def load_data(file_paths):
    """Parse amplitude records from text files into a single DataFrame.

    Every line is stripped and split on the literal delimiter ``' : '``.
    A line contributes a row only when the split yields exactly four fields
    (event type, diagram, amplitude, squared amplitude); any other line is
    skipped without a message.

    Args:
        file_paths: Iterable of paths to the text files to read.

    Returns:
        pandas.DataFrame with the columns ``event_type``, ``diagram``,
        ``amplitude`` and ``squared_amplitude``, one row per parsed line.
        The four columns exist even when nothing could be parsed, so
        downstream column access does not fail on an empty result.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    columns = ['event_type', 'diagram', 'amplitude', 'squared_amplitude']
    data = []
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split(' : ')
                if len(parts) == 4:
                    event_type, diagram, amplitude, squared_amplitude = parts
                    data.append({
                        'event_type': event_type,
                        'diagram': diagram,
                        # Inner fields may carry extra padding around the delimiter.
                        'amplitude': amplitude.strip(),
                        'squared_amplitude': squared_amplitude.strip()
                    })
    # Passing `columns` keeps the schema stable when `data` is empty.
    return pd.DataFrame(data, columns=columns)


In [4]:
def normalize_indices(expr):
    """Renumber ``%name_<n>`` indices so every ``%name`` counts up from 1.

    Only tokens that start with ``%`` are touched. For each distinct prefix
    (the text from ``%`` up to the underscore) the numeric suffixes found in
    ``expr`` are ranked in ascending numeric order and replaced by their
    1-based rank. Indices without a ``%`` prefix are left as they are.

    Args:
        expr: Expression string to normalize.

    Returns:
        The expression with the ``%``-prefixed indices renumbered.
    """
    index_re = re.compile(r'(%[^_]+)_(\d+)')

    # Gather the distinct numeric suffixes seen for each prefix.
    suffixes_by_prefix = {}
    for hit in index_re.finditer(expr):
        prefix, suffix = hit.groups()
        suffixes_by_prefix.setdefault(prefix, set()).add(suffix)

    # Per prefix: original suffix -> 1-based rank in numeric order.
    renumbering = {
        prefix: {
            original: str(rank)
            for rank, original in enumerate(sorted(suffixes, key=int), start=1)
        }
        for prefix, suffixes in suffixes_by_prefix.items()
    }

    return index_re.sub(
        lambda hit: f"{hit.group(1)}_{renumbering[hit.group(1)][hit.group(2)]}",
        expr,
    )





In [5]:
# Tokenization function for mathematical expressions
def tokenize_expression(expr):
    """Split an expression string into a flat list of token strings.

    The string is scanned left to right with four alternatives, tried in
    this order at each position:

    1. numbers: fractions, decimals, integers;
    2. names: letters, optionally followed by one ``_suffix`` and by ``^(*)``;
    3. single-character operators and brackets;
    4. ``%name_<n>`` indices and whole ``_{...}`` index groups.

    Characters that none of the alternatives match (whitespace, or an
    underscore that does not begin ``_{...}``) are skipped and do not appear
    in the result.

    Args:
        expr: Expression string to tokenize.

    Returns:
        List of matched substrings in order of appearance.
    """
    number = r'(\d+/\d+|\d+\.\d+|\d+)'
    name = r'([a-zA-Z]+(?:_[a-zA-Z0-9]+)?(?:\^\([*]\))?)'
    operator = r'([\+\-\*/\^\(\)\[\]\{\}])'
    index_group = r'(%[a-zA-Z]+_\d+|_{[^}]+})'

    scanner = '|'.join((number, name, operator, index_group))

    # Each alternative is wrapped in one group and cannot match the empty
    # string, so the full match is exactly the single non-empty group.
    return [hit.group(0) for hit in re.finditer(scanner, expr)]


In [None]:
# Example of how the tokenization works
# Raw string: the expression contains literal backslashes (e.g. "%\sigma"),
# and "\s" is not a valid escape sequence in a normal string literal.
example_expr = r"-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)"
tokens = tokenize_expression(example_expr)
print("Original expression:", example_expr)
print("Tokenized expression:", tokens)


Original expression: -1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)
Tokenized expression: ['-', '1/2', '*', 'i', '*', 'e', '^', '2', '*', 'gamma', '_{+%\\sigma_165,%gam_145,%gam_146}', '*', 'gamma', '_{%\\sigma_165,%gam_147,%del_137}', '*', 'e', '_{i_3,%gam_146}', '(', 'p_1', ')', 'u', '*', 'e', '_{k_3,%del_137}', '(', 'p_2', ')', 'u', '*', 'e', '_{l_3,%gam_145}', '(', 'p_3', ')', 'u^(*)', '*', 'e', '_{i_5,%gam_147}', '(', 'p_4', ')', 'u^(*)', '/', '(', 'm_e', '^', '2', '+', '-', 's_13', '+', '1/2', '*', 'reg_prop', ')']


In [7]:
# Example of how index normalization works
# Raw string: the expression contains literal backslashes (e.g. "%\sigma"),
# and "\s" is not a valid escape sequence in a normal string literal.
example_expr = r"-i*e^2*gamma_{+%\sigma_157721,%gam_166722,%eps_44575}*gamma_{%\sigma_157721,%gam_166723,%del_106099}*e_{i_36289,%del_106099}(p_3)_v*e_{k_36277,%gam_166723}(p_1)_v^(*)*mu_{l_36277,%gam_166722}(p_2)_v^(*)*mu_{j_36269,%eps_44575}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)"
normalized = normalize_indices(example_expr)
print("Original expression:", example_expr)
print("Normalized expression:", normalized)

Original expression: -i*e^2*gamma_{+%\sigma_157721,%gam_166722,%eps_44575}*gamma_{%\sigma_157721,%gam_166723,%del_106099}*e_{i_36289,%del_106099}(p_3)_v*e_{k_36277,%gam_166723}(p_1)_v^(*)*mu_{l_36277,%gam_166722}(p_2)_v^(*)*mu_{j_36269,%eps_44575}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)
Normalized expression: -i*e^2*gamma_{+%\sigma_1,%gam_1,%eps_1}*gamma_{%\sigma_1,%gam_2,%del_1}*e_{i_36289,%del_1}(p_3)_v*e_{k_36277,%gam_2}(p_1)_v^(*)*mu_{l_36277,%gam_1}(p_2)_v^(*)*mu_{j_36269,%eps_1}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)


In [8]:
# Build the input paths with os.path.join instead of a hard-coded backslash:
# "\Q" is not a valid string escape, and a literal backslash separator only
# resolves on Windows.
DATA_DIR = "SYMBA - Test Data"
file_paths = [
    os.path.join(DATA_DIR, f"QED-2-to-2-diag-TreeLevel-{i}.txt")
    for i in range(0, 10)
]

# Load the sample data
df = load_data(file_paths)

# Display the first few rows
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (15552, 4)


Unnamed: 0,event_type,diagram,amplitude,squared_amplitude
0,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), e(X_4), OffShell A(V_1), V...","-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_1...",2*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...
1,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_0:e(X_2), e(X_3), OffShell A(V_0), V...","1/2*i*e^2*gamma_{+%\sigma_172,%gam_162,%del_14...",2*e^4*(m_e^4 + -1/2*m_e^2*s_14 + -1/2*m_e^2*s_...
2,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), OffShell e(X_4), OffShell...","-1/2*i*e^2*gamma_{+%\sigma_293,%gam_358,%gam_3...",2*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...
3,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_0:e(X_2), e(X_3), OffShell A(V_0), V...","1/2*i*e^2*gamma_{+%\sigma_301,%gam_377,%del_27...",2*e^4*(m_e^4 + -1/2*m_e^2*s_14 + -1/2*m_e^2*s_...
4,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), e(X_4), OffShell A(V_1), V...","-i*e^2*gamma_{+%\sigma_435,%gam_574,%gam_575}*...",8*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...


In [9]:
# Add an index-normalized copy of each expression column
for source_col in ('amplitude', 'squared_amplitude'):
    df[f'normalized_{source_col}'] = df[source_col].apply(normalize_indices)

# Show the first amplitude before and after normalization
first_row = df.iloc[0]
print("Original amplitude:")
print(first_row['amplitude'])
print("\nNormalized amplitude:")
print(first_row['normalized_amplitude'])

Original amplitude:
-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)

Normalized amplitude:
-1/2*i*e^2*gamma_{+%\sigma_1,%gam_1,%gam_2}*gamma_{%\sigma_1,%gam_3,%del_1}*e_{i_3,%gam_2}(p_1)_u*e_{k_3,%del_1}(p_2)_u*e_{l_3,%gam_1}(p_3)_u^(*)*e_{i_5,%gam_3}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)


In [10]:
# Tokenize both normalized expression columns into lists of tokens
for stage_col in ('amplitude', 'squared_amplitude'):
    df[f'tokenized_{stage_col}'] = df[f'normalized_{stage_col}'].apply(tokenize_expression)

# Show the first normalized amplitude next to the start of its token list
first_example = df.iloc[0]
print("Normalized amplitude:")
print(first_example['normalized_amplitude'])
print("\nTokenized amplitude (first 20 tokens):")
print(first_example['tokenized_amplitude'][:20])

Normalized amplitude:
-1/2*i*e^2*gamma_{+%\sigma_1,%gam_1,%gam_2}*gamma_{%\sigma_1,%gam_3,%del_1}*e_{i_3,%gam_2}(p_1)_u*e_{k_3,%del_1}(p_2)_u*e_{l_3,%gam_1}(p_3)_u^(*)*e_{i_5,%gam_3}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)

Tokenized amplitude (first 20 tokens):
['-', '1/2', '*', 'i', '*', 'e', '^', '2', '*', 'gamma', '_{+%\\sigma_1,%gam_1,%gam_2}', '*', 'gamma', '_{%\\sigma_1,%gam_3,%del_1}', '*', 'e', '_{i_3,%gam_2}', '(', 'p_1', ')']


In [11]:
# 80/10/10 split: hold out 20% of the rows, then halve the hold-out into
# validation and test. Both calls use the same fixed seed.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} set size: {len(split_frame)}")

Train set size: 12441
Validation set size: 1555
Test set size: 1556


In [12]:
# Analyze token distribution
# Walk the rows once, taking each row's amplitude tokens followed by its
# squared-amplitude tokens, and flatten everything into one list.
all_tokens = [
    token
    for amp_tokens, sq_tokens in zip(df['tokenized_amplitude'], df['tokenized_squared_amplitude'])
    for token in amp_tokens + sq_tokens
]

token_counts = Counter(all_tokens)
print(f"Total unique tokens: {len(token_counts)}")
print(f"Most common tokens: {token_counts.most_common(10)}")


Total unique tokens: 44416
Most common tokens: [('*', 575442), ('(', 285768), (')', 285768), ('+', 272448), ('2', 215021), ('^', 198555), ('-', 168857), ('e', 70464), ('reg_prop', 62784), ('gamma', 58752)]


In [13]:
# Dump each split into its own pickle file under the output directory
outout_dir = "data"
os.makedirs(outout_dir, exist_ok=True)
for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_frame.to_pickle(os.path.join(outout_dir, f"{split_name}.pkl"))


# Part 2: Model Training and Evaluation

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the data back from the pickle files
# Imported here so this cell also works when Part 2 is run in a fresh kernel.
import os

import pandas as pd

_dir = "data"

# NOTE: unpickling can execute arbitrary code; only load files that this
# notebook's preprocessing part wrote, never pickles from an untrusted source.
train_df = pd.read_pickle(os.path.join(_dir, "train.pkl"))
val_df = pd.read_pickle(os.path.join(_dir, "val.pkl"))
test_df = pd.read_pickle(os.path.join(_dir, "test.pkl"))

In [2]:
# Define the dataset class for amplitude data
class AmplitudeDataset(Dataset):
    """Pairs of (amplitude, squared amplitude) token lists encoded for a seq2seq model.

    Each item joins the token lists with single spaces and runs both strings
    through ``tokenizer`` with padding and truncation to ``max_length``.

    Args:
        amplitudes: Indexable collection of token lists (model inputs).
        squared_amplitudes: Indexable collection of token lists (targets),
            aligned with ``amplitudes`` by position.
        tokenizer: Callable accepting ``(text, max_length=..., padding=...,
            truncation=..., return_tensors=...)`` and returning an object
            with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length both sequences are padded/truncated to.
    """

    def __init__(self, amplitudes, squared_amplitudes, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.amplitudes = amplitudes
        self.squared_amplitudes = squared_amplitudes
        self.max_length = max_length

    def __len__(self):
        """Return the number of amplitude examples."""
        return len(self.amplitudes)

    def _encode(self, tokens):
        """Join ``tokens`` with spaces and encode them to fixed-length tensors."""
        return self.tokenizer(
            ' '.join(tokens),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the encoded example at ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` for the amplitude
            and ``labels`` holding the encoded squared amplitude. ``labels``
            still contains the tokenizer's padding ids; nothing is masked
            here.
        """
        input_encoding = self._encode(self.amplitudes[idx])
        target_encoding = self._encode(self.squared_amplitudes[idx])

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also collapse the
        # sequence dimension when max_length == 1.
        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': target_encoding.input_ids.squeeze(0)
        }

In [3]:
# Initialize tokenizer and model from the same pretrained checkpoint
checkpoint_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

# Register extra tokens for mathematical expressions, then resize the model's
# embedding table to the new tokenizer length.
special_tokens = [
    '*', '/', '+', '-', '^',
    '(', ')', '{', '}', '_',
    'gamma', 'sigma', 'e^2',
]
tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Display tokenizer information
print(f"Vocabulary size after adding special tokens: {len(tokenizer)}")
print(f"Special tokens added: {special_tokens}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Vocabulary size after adding special tokens: 32106
Special tokens added: ['*', '/', '+', '-', '^', '(', ')', '{', '}', '_', 'gamma', 'sigma', 'e^2']


In [17]:
# Create datasets
def _dataset_from_frame(frame):
    """Wrap one split's tokenized columns in an AmplitudeDataset."""
    return AmplitudeDataset(
        frame['tokenized_amplitude'].tolist(),
        frame['tokenized_squared_amplitude'].tolist(),
        tokenizer
    )


train_dataset = _dataset_from_frame(train_df)
val_dataset = _dataset_from_frame(val_df)
test_dataset = _dataset_from_frame(test_df)

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

for split_label, split_loader in (
    ("training", train_loader),
    ("validation", val_loader),
    ("test", test_loader),
):
    print(f"Number of {split_label} batches: {len(split_loader)}")


Number of training batches: 1556
Number of validation batches: 195
Number of test batches: 195


In [18]:
# Check a sample batch to verify data loading
def _clip(text, limit=100):
    """Return ``text`` cut to ``limit`` characters plus "..." when it is longer."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


sample_batch = next(iter(train_loader))
for field_label, field_key in (
    ("Input", 'input_ids'),
    ("Attention mask", 'attention_mask'),
    ("Labels", 'labels'),
):
    print(f"{field_label} shape: {sample_batch[field_key].shape}")

# Decode the first input and the first target of the batch
sample_input = tokenizer.decode(sample_batch['input_ids'][0], skip_special_tokens=True)
sample_output = tokenizer.decode(sample_batch['labels'][0], skip_special_tokens=True)

print("\nSample input:")
print(_clip(sample_input))
print("\nSample output:")
print(_clip(sample_output))


Input shape: torch.Size([8, 512])
Attention mask shape: torch.Size([8, 512])
Labels shape: torch.Size([8, 512])

Sample input:
2/9 * i * e ^ 2 * gamma _ { +%nu_1,%eta_1,%eps_1 } * gamma _ { %nu_1,%gam_1,%eta_2 } * c _ { k_19330...

Sample output:
4/81 * e ^ 4 * ( 16 * m_c ^ 2 * m_d ^ 2 + 8 * m_d ^ 2 * s_12 + 8 * s_14 * s_23 + 8 * s_13 * s_24 + 8...


In [19]:
def _mask_label_padding(labels, pad_token_id):
    """Return ``labels`` with padding positions replaced by -100.

    -100 is the label value the Hugging Face seq2seq loss ignores. The input
    tensor is not modified. With ``pad_token_id`` of None the labels are
    returned unchanged.
    """
    if pad_token_id is None:
        return labels
    return labels.masked_fill(labels == pad_token_id, -100)


def _mean_or_nan(total, count):
    """Average ``total`` over ``count`` batches; NaN when there were none."""
    return total / count if count else float('nan')


# Training function
def train_model(model, train_loader, val_loader, epochs=3, lr=5e-5):
    """Fine-tune ``model`` with AdamW and track the mean loss per epoch.

    Batches are dicts with ``input_ids``, ``attention_mask`` and ``labels``
    tensors. Before each forward pass, label positions equal to
    ``model.config.pad_token_id`` are replaced by -100 so that padding does
    not contribute to the loss. This applies to training and validation
    alike. If the model exposes no such id, labels are passed through as-is.

    Args:
        model: Module whose call accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Sized iterable of training batches.
        val_loader: Sized iterable of validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.

    Returns:
        Tuple ``(model, history)``; ``history`` maps ``'train_loss'`` and
        ``'val_loss'`` to lists with one mean loss per epoch (NaN for an
        empty loader).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    # Training history
    history = {
        'train_loss': [],
        'val_loss': []
    }

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = _mask_label_padding(batch['labels'].to(device), pad_token_id)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Print progress every 10 batches
            if (batch_idx + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{epochs} | Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")

        avg_train_loss = _mean_or_nan(train_loss, len(train_loader))
        history['train_loss'].append(avg_train_loss)

        # Validation phase
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = _mask_label_padding(batch['labels'].to(device), pad_token_id)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                val_loss += outputs.loss.item()

        avg_val_loss = _mean_or_nan(val_loss, len(val_loader))
        history['val_loss'].append(avg_val_loss)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train loss: {avg_train_loss:.4f}")
        print(f"Validation loss: {avg_val_loss:.4f}")
        print("-" * 50)

    return model, history

In [20]:
# Visualize training history
def plot_training_history(history):
    """Plot the per-epoch training and validation loss curves on one figure.

    Args:
        history: Mapping with ``'train_loss'`` and ``'val_loss'`` sequences.
    """
    import matplotlib.pyplot as plt

    _, ax = plt.subplots(figsize=(10, 6))
    for history_key, legend_label in (
        ('train_loss', 'Training Loss'),
        ('val_loss', 'Validation Loss'),
    ):
        ax.plot(history[history_key], label=legend_label)

    ax.set_title('Training and Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)
    plt.show()

In [21]:
# Evaluate sequence accuracy
def evaluate_sequence_accuracy(model, data_loader, tokenizer=None, max_length=512):
    """Measure exact-match accuracy of generated sequences against the targets.

    For every batch the model generates output ids, predictions and labels
    are decoded to text with special tokens skipped, and a prediction counts
    as correct only if its decoded string equals the decoded target.

    Args:
        model: Module exposing ``generate(input_ids=..., attention_mask=...,
            max_length=...)``.
        data_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        tokenizer: Object with ``decode(ids, skip_special_tokens=True)``.
            When omitted, the module-level ``tokenizer`` is used, which
            matches the earlier behaviour of this function.
        max_length: Maximum length passed to ``generate``.

    Returns:
        Tuple ``(sequence_accuracy, predictions[:5], targets[:5])``.
        The accuracy is 0.0 when ``data_loader`` yields no examples.

    Raises:
        NameError: If no tokenizer is passed and no module-level
            ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Fall back to the notebook-level tokenizer for existing callers.
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... explicitly")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    pad_token_id = getattr(tokenizer, 'pad_token_id', None)

    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels masked with -100 (the ignore value for the loss) are not
            # valid token ids; map them back to padding before decoding.
            if pad_token_id is not None:
                labels = labels.masked_fill(labels == -100, pad_token_id)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length
            )

            # Decode predictions and targets
            predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in outputs]
            targets = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

            all_predictions.extend(predictions)
            all_targets.extend(targets)

            # Print progress
            if (batch_idx + 1) % 5 == 0:
                print(f"Evaluated {batch_idx+1}/{len(data_loader)} batches")

    # Calculate sequence accuracy (guarding against an empty loader)
    exact_matches = sum(1 for pred, target in zip(all_predictions, all_targets) if pred == target)
    sequence_accuracy = exact_matches / len(all_targets) if all_targets else 0.0

    # Return accuracy and some examples for inspection
    return sequence_accuracy, all_predictions[:5], all_targets[:5]


In [22]:
# Train the model
# Set epochs to a small number for initial testing, increase for better results
training_options = {'epochs': 2, 'lr': 5e-5}
trained_model, history = train_model(model, train_loader, val_loader, **training_options)

# Plot training history
plot_training_history(history)

Using device: cpu


KeyboardInterrupt: 

In [None]:
# Evaluate on test set
def _shorten(label, text, limit=100):
    """Format "label: text", cutting text to ``limit`` characters plus "..." when longer."""
    if len(text) > limit:
        return f"{label}: {text[:limit]}..."
    return f"{label}: {text}"


test_accuracy, sample_predictions, sample_targets = evaluate_sequence_accuracy(trained_model, test_loader)
print(f"Test sequence accuracy: {test_accuracy:.4f}")

# Display some example predictions
print("\nSample predictions vs targets:")
for example_no, (pred, target) in enumerate(zip(sample_predictions, sample_targets), start=1):
    print(f"\nExample {example_no}:")
    print(_shorten("Prediction", pred))
    print(_shorten("Target", target))
    print(f"Correct: {pred == target}")


In [None]:
# Save the model weights (state dict only) and the tokenizer files
model_weights = trained_model.state_dict()
torch.save(model_weights, 'transformer_model.pt')
tokenizer.save_pretrained('amplitude_tokenizer')

print("Model and tokenizer saved successfully!")