# Part 1: Preprocessing

In [1]:
# Import necessary libraries
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter


In [3]:
# Function to load and parse data from all files
def load_data(file_paths, encoding="utf-8"):
    """Parse amplitude records from text files into a single DataFrame.

    Every line is stripped and split on the literal separator ' : '. A line
    that yields exactly four fields is stored as one record; any other line
    is skipped without raising.

    Parameters
    ----------
    file_paths : iterable of str or path-like
        Files to read, in order.
    encoding : str, default "utf-8"
        Text encoding passed to ``open`` so that parsing does not depend on
        the locale default of the machine running the notebook.

    Returns
    -------
    pandas.DataFrame
        Columns ``event_type``, ``diagram``, ``amplitude`` and
        ``squared_amplitude`` (all strings). The columns are present even
        when no record was parsed, so downstream column access still works.

    Raises
    ------
    OSError
        Propagated from ``open`` when a file cannot be read.
    """
    columns = ['event_type', 'diagram', 'amplitude', 'squared_amplitude']
    data = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding=encoding) as f:
            for line in f:
                parts = line.strip().split(' : ')
                if len(parts) == len(columns):
                    event_type, diagram, amplitude, squared_amplitude = parts
                    data.append({
                        'event_type': event_type,
                        'diagram': diagram,
                        # The separator split can leave padding around the
                        # two expression fields; remove it.
                        'amplitude': amplitude.strip(),
                        'squared_amplitude': squared_amplitude.strip()
                    })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)


In [4]:
def normalize_indices(expr):
    """Renumber the numeric suffixes of %-prefixed names in an expression.

    A name is anything matched as '%<chars without underscore>_<digits>'.
    For each distinct name, the numeric suffixes found in ``expr`` are ranked
    in ascending numeric order and replaced by 1, 2, 3, ... Names that do not
    start with '%' (for example ``i_3`` or ``p_1``) are left unchanged.

    Parameters
    ----------
    expr : str
        Expression text to normalize.

    Returns
    -------
    str
        ``expr`` with every matched suffix replaced by its rank.
    """
    index_re = re.compile(r'(%[^_]+)_(\d+)')

    # Collect the distinct numeric suffixes observed for each name.
    suffixes_by_name = {}
    for found in index_re.finditer(expr):
        name, number = found.group(1), found.group(2)
        suffixes_by_name.setdefault(name, set()).add(number)

    # Per name: original suffix -> 1-based rank in numeric order.
    renumbering = {
        name: {
            original: str(rank)
            for rank, original in enumerate(sorted(numbers, key=int), start=1)
        }
        for name, numbers in suffixes_by_name.items()
    }

    return index_re.sub(
        lambda found: f"{found.group(1)}_{renumbering[found.group(1)][found.group(2)]}",
        expr,
    )





In [5]:
# Tokenization function for mathematical expressions
def tokenize_expression(expr):
    """Split an expression string into a flat list of token strings.

    The text is scanned left to right. At each position the alternatives are
    tried in this order: numbers (fractions, decimals, integers), names with
    an optional '_suffix' and optional '^(*)' marker, single-character
    operators/brackets, and finally '%name_digits' or '_{...}' index groups.
    Characters that match none of the alternatives (spaces, commas, a bare
    underscore, ...) are skipped and produce no token.

    Parameters
    ----------
    expr : str
        Expression text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance.
    """
    number_re = r'(\d+/\d+|\d+\.\d+|\d+)'
    name_re = r'([a-zA-Z]+(?:_[a-zA-Z0-9]+)?(?:\^\([*]\))?)'
    operator_re = r'([\+\-\*/\^\(\)\[\]\{\}])'
    special_re = r'(%[a-zA-Z]+_\d+|_{[^}]+})'

    token_re = re.compile('|'.join((number_re, name_re, operator_re, special_re)))

    # Each alternative is one capturing group spanning its whole match and
    # none can match the empty string, so the full match is the token.
    return [hit.group(0) for hit in token_re.finditer(expr)]


In [None]:
# Example of how the tokenization works
# The expression contains a literal backslash (in "%\sigma"), so it is written
# as a raw string: in a normal string "\s" is an invalid escape sequence, which
# newer Python versions report with a warning. The runtime value is unchanged.
example_expr = r"-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)"
tokens = tokenize_expression(example_expr)
print("Original expression:", example_expr)
print("Tokenized expression:", tokens)


Original expression: -1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)
Tokenized expression: ['-', '1/2', '*', 'i', '*', 'e', '^', '2', '*', 'gamma', '_{+%\\sigma_165,%gam_145,%gam_146}', '*', 'gamma', '_{%\\sigma_165,%gam_147,%del_137}', '*', 'e', '_{i_3,%gam_146}', '(', 'p_1', ')', 'u', '*', 'e', '_{k_3,%del_137}', '(', 'p_2', ')', 'u', '*', 'e', '_{l_3,%gam_145}', '(', 'p_3', ')', 'u^(*)', '*', 'e', '_{i_5,%gam_147}', '(', 'p_4', ')', 'u^(*)', '/', '(', 'm_e', '^', '2', '+', '-', 's_13', '+', '1/2', '*', 'reg_prop', ')']


In [7]:
# Example of how index normalization works on an expression with large indices.
# Raw string: the expression contains a literal backslash (in "%\sigma"), and
# "\s" is an invalid escape sequence in a normal string literal. The runtime
# value is unchanged.
example_expr = r"-i*e^2*gamma_{+%\sigma_157721,%gam_166722,%eps_44575}*gamma_{%\sigma_157721,%gam_166723,%del_106099}*e_{i_36289,%del_106099}(p_3)_v*e_{k_36277,%gam_166723}(p_1)_v^(*)*mu_{l_36277,%gam_166722}(p_2)_v^(*)*mu_{j_36269,%eps_44575}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)"
normalized = normalize_indices(example_expr)
print("Original expression:", example_expr)
print("Normalized expression:", normalized)

Original expression: -i*e^2*gamma_{+%\sigma_157721,%gam_166722,%eps_44575}*gamma_{%\sigma_157721,%gam_166723,%del_106099}*e_{i_36289,%del_106099}(p_3)_v*e_{k_36277,%gam_166723}(p_1)_v^(*)*mu_{l_36277,%gam_166722}(p_2)_v^(*)*mu_{j_36269,%eps_44575}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)
Normalized expression: -i*e^2*gamma_{+%\sigma_1,%gam_1,%eps_1}*gamma_{%\sigma_1,%gam_2,%del_1}*e_{i_36289,%del_1}(p_3)_v*e_{k_36277,%gam_2}(p_1)_v^(*)*mu_{l_36277,%gam_1}(p_2)_v^(*)*mu_{j_36269,%eps_1}(p_4)_v/(m_e^2 + (-2)*s_13 + s_33 + reg_prop)


In [8]:
# Build the ten input paths (files numbered 0-9) with os.path.join so the
# separator matches the host OS. A hard-coded backslash only resolves on
# Windows, and "\Q" inside a normal string literal is an invalid escape.
data_dir = "SYMBA - Test Data"
file_paths = [
    os.path.join(data_dir, f"QED-2-to-2-diag-TreeLevel-{i}.txt")
    for i in range(0, 10)
]

# Load the sample data
df = load_data(file_paths)

# Display the first few rows
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (15552, 4)


Unnamed: 0,event_type,diagram,amplitude,squared_amplitude
0,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), e(X_4), OffShell A(V_1), V...","-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_1...",2*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...
1,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_0:e(X_2), e(X_3), OffShell A(V_0), V...","1/2*i*e^2*gamma_{+%\sigma_172,%gam_162,%del_14...",2*e^4*(m_e^4 + -1/2*m_e^2*s_14 + -1/2*m_e^2*s_...
2,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), OffShell e(X_4), OffShell...","-1/2*i*e^2*gamma_{+%\sigma_293,%gam_358,%gam_3...",2*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...
3,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_0:e(X_2), e(X_3), OffShell A(V_0), V...","1/2*i*e^2*gamma_{+%\sigma_301,%gam_377,%del_27...",2*e^4*(m_e^4 + -1/2*m_e^2*s_14 + -1/2*m_e^2*s_...
4,Interaction: e_gam_239(X)^(*) e_del_219(X)^(...,"Vertex V_1:e(X_2), e(X_4), OffShell A(V_1), V...","-i*e^2*gamma_{+%\sigma_435,%gam_574,%gam_575}*...",8*e^4*(m_e^4 + -1/2*m_e^2*s_13 + 1/2*s_14*s_23...


In [9]:
# Add a normalized copy of each expression column (target column -> source column)
normalization_targets = {
    'normalized_amplitude': 'amplitude',
    'normalized_squared_amplitude': 'squared_amplitude',
}
for target_col, source_col in normalization_targets.items():
    df[target_col] = df[source_col].apply(normalize_indices)

# Show the first amplitude before and after normalization
print("Original amplitude:", df['amplitude'].iloc[0], sep="\n")
print("\nNormalized amplitude:", df['normalized_amplitude'].iloc[0], sep="\n")

Original amplitude:
-1/2*i*e^2*gamma_{+%\sigma_165,%gam_145,%gam_146}*gamma_{%\sigma_165,%gam_147,%del_137}*e_{i_3,%gam_146}(p_1)_u*e_{k_3,%del_137}(p_2)_u*e_{l_3,%gam_145}(p_3)_u^(*)*e_{i_5,%gam_147}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)

Normalized amplitude:
-1/2*i*e^2*gamma_{+%\sigma_1,%gam_1,%gam_2}*gamma_{%\sigma_1,%gam_3,%del_1}*e_{i_3,%gam_2}(p_1)_u*e_{k_3,%del_1}(p_2)_u*e_{l_3,%gam_1}(p_3)_u^(*)*e_{i_5,%gam_3}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)


In [10]:
# Add a token-list column for each normalized column (target column -> source column)
tokenization_targets = {
    'tokenized_amplitude': 'normalized_amplitude',
    'tokenized_squared_amplitude': 'normalized_squared_amplitude',
}
for target_col, source_col in tokenization_targets.items():
    df[target_col] = df[source_col].apply(tokenize_expression)

# Show the first normalized amplitude next to the start of its token list
print("Normalized amplitude:", df['normalized_amplitude'].iloc[0], sep="\n")
print(
    "\nTokenized amplitude (first 20 tokens):",
    df['tokenized_amplitude'].iloc[0][:20],
    sep="\n",
)

Normalized amplitude:
-1/2*i*e^2*gamma_{+%\sigma_1,%gam_1,%gam_2}*gamma_{%\sigma_1,%gam_3,%del_1}*e_{i_3,%gam_2}(p_1)_u*e_{k_3,%del_1}(p_2)_u*e_{l_3,%gam_1}(p_3)_u^(*)*e_{i_5,%gam_3}(p_4)_u^(*)/(m_e^2 + -s_13 + 1/2*reg_prop)

Tokenized amplitude (first 20 tokens):
['-', '1/2', '*', 'i', '*', 'e', '^', '2', '*', 'gamma', '_{+%\\sigma_1,%gam_1,%gam_2}', '*', 'gamma', '_{%\\sigma_1,%gam_3,%del_1}', '*', 'e', '_{i_3,%gam_2}', '(', 'p_1', ')']


In [11]:
# 80-10-10 split: hold out 20% of the rows, then halve the held-out part into
# validation and test. Both calls use the same fixed seed for repeatability.
train_df, temp_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)
val_df, test_df = train_test_split(
    temp_df,
    random_state=42,
    test_size=0.5,
)

# Report the size of each split, one per line
print(
    f"Train set size: {len(train_df)}",
    f"Validation set size: {len(val_df)}",
    f"Test set size: {len(test_df)}",
    sep="\n",
)

Train set size: 12441
Validation set size: 1555
Test set size: 1556


In [12]:
# Analyze token distribution over the whole dataset.
# Tokens are gathered row by row: amplitude tokens first, then the squared
# amplitude tokens of the same row.
all_tokens = []
paired_columns = zip(df['tokenized_amplitude'], df['tokenized_squared_amplitude'])
for amplitude_tokens, squared_tokens in paired_columns:
    all_tokens.extend(amplitude_tokens)
    all_tokens.extend(squared_tokens)

token_counts = Counter(all_tokens)
print(
    f"Total unique tokens: {len(token_counts)}",
    f"Most common tokens: {token_counts.most_common(10)}",
    sep="\n",
)


Total unique tokens: 44416
Most common tokens: [('*', 575442), ('(', 285768), (')', 285768), ('+', 272448), ('2', 215021), ('^', 198555), ('-', 168857), ('e', 70464), ('reg_prop', 62784), ('gamma', 58752)]


In [13]:
# Write each split to its own pickle file inside the output directory.
# NOTE(review): the variable name is spelled "outout_dir"; it is kept as-is
# because cells outside this view may reference it.
outout_dir = "data"
os.makedirs(outout_dir, exist_ok=True)
for split_file, split_df in (
    ("train.pkl", train_df),
    ("val.pkl", val_df),
    ("test.pkl", test_df),
):
    split_df.to_pickle(os.path.join(outout_dir, split_file))
