# Experiment 004: Character-Level WaveNet Model

**Objective**: Implement character-level WaveNet refinement to improve span boundary detection

**Expected gain**: +0.015 to +0.025 in Jaccard score (the evaluation metric defined below)
**Priority**: HIGHEST (addresses core weakness in boundary detection)

**Approach**:
1. Modify RoBERTa inference to save token start/end probability distributions
2. Convert token probabilities to character-level probabilities using offset_mapping
3. Build WaveNet with dilated convolutions for smoothing
4. Train on 5-fold CV using character-level probabilities as features
5. Generate refined predictions with better boundary detection

**Reference**: Winning solution by Theo Viel (dark-of-the-moon) - the character-level WaveNet was a key innovation

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import json
import sys
import re

# Report the library and hardware setup this notebook is running on.
print("Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Device 0 is the only GPU inspected here.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1e9:.1f} GB")

# Define Jaccard similarity metric
def jaccard_similarity(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both inputs are converted with ``str``, lower-cased and split on
    whitespace; the score is ``|A & B| / |A | B|`` over the resulting
    word sets.

    Args:
        str1: First text (any value accepted by ``str``).
        str2: Second text (any value accepted by ``str``).

    Returns:
        0.0 if either input is missing according to ``pd.isna``,
        1.0 if both inputs contain no words, otherwise the Jaccard
        ratio as a float.
    """
    # Missing values never match anything.
    if pd.isna(str1) or pd.isna(str2):
        return 0.0

    words_a = set(str(str1).lower().split())
    words_b = set(str(str2).lower().split())

    # Two word-less strings are treated as a perfect match; this also
    # keeps the denominator below from being zero.
    if not words_a and not words_b:
        return 1.0

    shared = len(words_a & words_b)
    union_size = len(words_a) + len(words_b) - shared
    return float(shared) / union_size

In [None]:
# Read the training CSV into a DataFrame.
train_path = Path('/home/data/train.csv')
train_df = pd.read_csv(train_path)

# Basic shape/schema summary.
n_train_rows = len(train_df)
train_columns = list(train_df.columns)
print(f"Loaded {n_train_rows} training samples")
print(f"Columns: {train_columns}")

# Label balance of the sentiment column.
sentiment_counts = train_df['sentiment'].value_counts()
print("\nSentiment distribution:")
print(sentiment_counts)

# First row, as a quick look at the raw fields.
print("\nSample row:")
print(train_df.iloc[0])

In [None]:
# Load the fast RoBERTa tokenizer (the fast variant is the one that can
# return character offsets for each token).
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
print(f"Tokenizer loaded: {tokenizer.name_or_path}")

# Use the first training row as a smoke-test example.
first_row = train_df.iloc[0]
text = first_row['text']
selected_text = first_row['selected_text']

print(f"\nOriginal text: '{text}'")
print(f"Selected text: '{selected_text}'")

# Encode the text and ask for the per-token character offsets.
tokenizer_options = dict(
    return_offsets_mapping=True,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128,
)
encoding = tokenizer(text, **tokenizer_options)

first_ids = encoding['input_ids'][0]
first_offsets = encoding['offset_mapping'][0]

print(f"\nToken IDs shape: {encoding['input_ids'].shape}")
print(f"Offset mapping shape: {encoding['offset_mapping'].shape}")
print("\nFirst 10 tokens and their offsets:")
# Slicing to 10 caps the preview; shorter sequences are shown in full.
for i, (id_tensor, offsets) in enumerate(zip(first_ids[:10], first_offsets[:10])):
    token_id = id_tensor.item()
    token = tokenizer.decode([token_id])
    print(f"  {i}: '{token}' -> offsets: {offsets}")

In [None]:
# Function to extract token start/end probabilities from RoBERTa
@torch.no_grad()
def extract_token_probabilities(model, text, sentiment, tokenizer, device=None):
    """
    Extract token-level start and end probability distributions from RoBERTa.

    The model is fed ``"{sentiment} {text}"``. The tokenizer reports character
    offsets relative to that combined string, so the offsets are re-based onto
    ``text`` before being returned; this lets them be used directly with
    ``len(text)`` when spreading token probabilities over characters.

    Args:
        model: Question-answering model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object exposing
            ``start_logits`` and ``end_logits``.
        text: Text whose span is being predicted.
        sentiment: Sentiment label prepended to the text as a prompt.
        tokenizer: Callable tokenizer that supports
            ``return_offsets_mapping=True`` and ``return_tensors='pt'``.
        device: Device the inputs are moved to. ``None`` (default) uses the
            device of the model's first parameter, so the inputs always land
            next to the weights (a fixed ``'cuda'`` default fails on CPU-only
            hosts).

    Returns:
        dict with:
            ``start_probs``: [seq_len] softmax of the start logits.
            ``end_probs``: [seq_len] softmax of the end logits.
            ``offset_mapping``: [seq_len, 2] (start_char, end_char) offsets
                into ``text``; special tokens and tokens belonging to the
                sentiment prompt are reported as (0, 0).
            ``input_ids``: token ids, batch dimension kept.
            ``attention_mask``: attention mask, batch dimension kept.
    """
    if device is None:
        device = next(model.parameters()).device

    # Prepare input with sentiment token
    prefix = f"{sentiment} "
    prefix_len = len(prefix)
    input_text = f"{prefix}{text}"
    encoding = tokenizer(
        input_text,
        return_offsets_mapping=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Re-base offsets from `input_text` onto `text`. A token is part of the
    # text only if it ends after the prompt; everything else (special tokens,
    # prompt tokens) is zeroed so downstream code skips it like a special token.
    raw_offsets = encoding['offset_mapping'][0]  # Remove batch dim
    in_text = raw_offsets[:, 1] > prefix_len
    shifted = (raw_offsets - prefix_len).clamp(min=0)
    offset_mapping = torch.where(
        in_text.unsqueeze(1), shifted, torch.zeros_like(shifted)
    )

    # Get model outputs
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    start_logits = outputs.start_logits[0]  # [seq_len]
    end_logits = outputs.end_logits[0]      # [seq_len]

    # Convert to probabilities (normalised over token positions)
    start_probs = F.softmax(start_logits, dim=0).cpu().numpy()
    end_probs = F.softmax(end_logits, dim=0).cpu().numpy()

    return {
        'start_probs': start_probs,  # [seq_len]
        'end_probs': end_probs,      # [seq_len]
        'offset_mapping': offset_mapping.cpu().numpy(),  # [seq_len, 2]
        'input_ids': input_ids.cpu().numpy(),
        'attention_mask': attention_mask.cpu().numpy()
    }

# Test the function
print("Testing token probability extraction...")

# Resolve the device once so the model and the inputs built inside
# extract_token_probabilities always end up on the same device
# (relying on the function's own default breaks on CPU-only hosts).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load a trained RoBERTa model (from exp_002)
model_path = Path('/home/code/experiments/002_roberta_span/fold_0_model.pt')
if model_path.exists():
    # NOTE(review): torch.load unpickles arbitrary Python objects; only load
    # checkpoints from a trusted source. This also assumes the file holds a
    # full pickled model (not a state_dict) - confirm against exp_002.
    model = torch.load(model_path, map_location='cpu')
    model.eval()
    print(f"Loaded model from {model_path}")
else:
    # If no saved model, load pretrained
    print("No saved model found, loading pretrained RoBERTa...")
    model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

model = model.to(device)

# Test on a sample
test_text = train_df.iloc[0]['text']
test_sentiment = train_df.iloc[0]['sentiment']

probs = extract_token_probabilities(
    model, test_text, test_sentiment, tokenizer, device=device
)
print(f"\nStart probabilities shape: {probs['start_probs'].shape}")
print(f"End probabilities shape: {probs['end_probs'].shape}")
print(f"Offset mapping shape: {probs['offset_mapping'].shape}")

# Show top predictions
start_token = np.argmax(probs['start_probs'])
end_token = np.argmax(probs['end_probs'])
print(f"\nPredicted start token: {start_token} (prob: {probs['start_probs'][start_token]:.4f})")
print(f"Predicted end token: {end_token} (prob: {probs['end_probs'][end_token]:.4f})")

In [None]:
# Function to convert token probabilities to character probabilities
def token_to_char_probabilities(token_probs, offset_mapping, text_length):
    """
    Convert token-level probabilities to character-level probabilities.

    Each token's probability is split evenly over the characters it covers
    (after clipping its span to the text); where several tokens cover the
    same character the contributions are averaged.

    Args:
        token_probs: [seq_len] array of token probabilities
        offset_mapping: [seq_len, 2] array of (start_char, end_char) offsets
        text_length: Length of original text

    Returns:
        char_probs: [text_length] array of character-level probabilities.
        Characters not covered by any token stay at 0.
    """
    char_probs = np.zeros(text_length)
    char_counts = np.zeros(text_length)

    # Aggregate probabilities from tokens to characters
    for token_idx, (char_start, char_end) in enumerate(offset_mapping):
        # Skip special tokens (offset = (0, 0))
        if char_start == 0 and char_end == 0:
            continue

        # A token that starts at or beyond the end of the text covers none of
        # its characters. Drop it rather than clamping it onto the last
        # character, which would pile unrelated probability mass there.
        if char_start >= text_length:
            continue

        # Clip partially overlapping spans to the text bounds
        char_start = max(0, char_start)
        char_end = min(char_end, text_length)
        if char_end <= char_start:
            continue

        # Distribute token probability to characters
        prob_per_char = token_probs[token_idx] / (char_end - char_start)
        char_probs[char_start:char_end] += prob_per_char
        char_counts[char_start:char_end] += 1

    # Average probabilities where multiple tokens overlap
    covered = char_counts > 0
    char_probs[covered] /= char_counts[covered]

    return char_probs

# Smoke-test the token -> character conversion on the sample extracted above.
print("Testing token to character probability conversion...")

text_length = len(test_text)
token_offsets = probs['offset_mapping']

# Same conversion for both boundaries; only the probability vector differs.
char_start_probs, char_end_probs = (
    token_to_char_probabilities(probs[prob_key], token_offsets, text_length)
    for prob_key in ('start_probs', 'end_probs')
)

print(f"Character-level start probabilities shape: {char_start_probs.shape}")
print(f"Character-level end probabilities shape: {char_end_probs.shape}")

# Preview the text and the start probability of its leading characters.
print(f"\nFirst 50 characters of text: '{test_text[:50]}'")
print("Character start probabilities (first 20):")
# Slicing the text caps the preview at 20 characters (fewer for short texts).
for i, (char, char_prob) in enumerate(zip(test_text[:20], char_start_probs)):
    print(f"  Char {i} ('{char}'): {char_prob:.4f}")

In [None]:
# WaveNet architecture for character-level probability refinement
class CharacterWaveNet(nn.Module):
    """WaveNet-style stack of gated, dilated 1-D convolutions.

    Takes per-character input channels and produces per-character class
    scores of the same sequence length. Every dilated convolution uses
    kernel size 3 with ``padding == dilation``, so the sequence length is
    preserved throughout the network.
    """

    def __init__(self, input_channels=2, num_classes=2, num_blocks=4, num_layers=6,
                 residual_channels=32, gate_channels=32, skip_channels=32):
        """
        WaveNet for character-level span prediction refinement.

        Args:
            input_channels: Number of input channels (start_prob, end_prob)
            num_classes: Number of output classes (refined_start, refined_end)
            num_blocks: Number of residual blocks (must be >= 1)
            num_layers: Number of layers per block (dilated convolutions,
                must be >= 1); dilation restarts at 1 in every block
            residual_channels: Channels in residual connections
            gate_channels: Output channels of each dilated convolution. The
                output is split in half (tanh filter / sigmoid gate), so the
                gated activation has ``gate_channels // 2`` channels; must be
                a positive even number
            skip_channels: Channels in skip connections

        Raises:
            ValueError: If ``num_blocks`` or ``num_layers`` is below 1, or if
                ``gate_channels`` is not a positive even number.
        """
        super().__init__()

        # Fail at construction time instead of with an opaque shape error
        # (odd gate_channels) or a type error (no layers) inside forward().
        if num_blocks < 1 or num_layers < 1:
            raise ValueError(
                f"num_blocks and num_layers must be >= 1, "
                f"got num_blocks={num_blocks}, num_layers={num_layers}"
            )
        if gate_channels < 2 or gate_channels % 2 != 0:
            raise ValueError(
                f"gate_channels must be a positive even number, got {gate_channels}"
            )

        self.num_blocks = num_blocks
        self.num_layers = num_layers

        # Width of the tanh*sigmoid product fed to the 1x1 convolutions
        gated_channels = gate_channels // 2

        # Input projection
        self.input_projection = nn.Conv1d(input_channels, residual_channels, 1)

        # Dilated convolution layers
        self.dilated_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()

        # Build dilated convolution blocks
        for _ in range(num_blocks):
            for layer in range(num_layers):
                # Dilation doubles each layer: 1, 2, 4, 8, 16, 32, ...
                dilation = 2 ** layer

                # Gated activation unit (dilated convolution)
                self.dilated_convs.append(
                    nn.Conv1d(residual_channels, gate_channels, kernel_size=3,
                              padding=dilation, dilation=dilation)
                )

                # 1x1 conv for residual connection
                self.residual_convs.append(
                    nn.Conv1d(gated_channels, residual_channels, 1)
                )

                # 1x1 conv for skip connection
                self.skip_convs.append(
                    nn.Conv1d(gated_channels, skip_channels, 1)
                )

        # Output layers
        self.output_conv1 = nn.Conv1d(skip_channels, skip_channels, 1)
        self.output_conv2 = nn.Conv1d(skip_channels, num_classes, 1)

        # Activation functions
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: Input tensor [batch, input_channels, seq_len]

        Returns:
            output: Output tensor [batch, num_classes, seq_len], softmax-
            normalised across the class dimension at every position.
        """
        # Input projection
        x = self.input_projection(x)  # [batch, residual_channels, seq_len]

        # Skip connections accumulator
        skip_connections = []

        # Apply dilated convolutions; the three lists are built in lockstep,
        # one entry per (block, layer) pair.
        layers = zip(self.dilated_convs, self.residual_convs, self.skip_convs)
        for dilated_conv, residual_conv, skip_conv in layers:
            # Gated activation: tanh(first half) * sigmoid(second half)
            filter_half, gate_half = dilated_conv(x).chunk(2, dim=1)
            gated_out = self.tanh(filter_half) * self.sigmoid(gate_half)

            # Residual connection
            x = x + residual_conv(gated_out)

            # Skip connection
            skip_connections.append(skip_conv(gated_out))

        # Sum all skip connections
        skip_sum = sum(skip_connections)  # [batch, skip_channels, seq_len]

        # Output layers
        output = self.relu(skip_sum)
        output = self.output_conv1(output)
        output = self.relu(output)
        output = self.output_conv2(output)

        # Apply softmax to get probabilities
        # NOTE(review): dim=1 normalises across classes, so with the default
        # two classes the "start" and "end" outputs sum to 1 at each character.
        # If independent start/end distributions over character positions are
        # intended, this should be dim=2 (or raw logits) - confirm against the
        # training loss before changing.
        output = F.softmax(output, dim=1)  # [batch, num_classes, seq_len]

        return output

# Smoke-test the WaveNet: one forward pass on random input.
print("Testing WaveNet architecture...")

batch_size, input_channels, seq_len = 2, 2, 100

# Random stand-in for character-level start/end probabilities.
# (Drawn before the model is built, as model init also consumes RNG state.)
dummy_input = torch.randn(batch_size, input_channels, seq_len)

# Deliberately small configuration to keep the check fast.
wavenet_test_config = dict(
    input_channels=input_channels,
    num_classes=2,
    num_blocks=2,   # Reduced for testing
    num_layers=4,   # Reduced for testing
    residual_channels=16,
    gate_channels=16,
    skip_channels=16,
)
model = CharacterWaveNet(**wavenet_test_config)

# No gradients are needed for a shape/normalisation check.
with torch.no_grad():
    output = model(dummy_input)

# Class probabilities of the first sample at the first position.
first_position_total = output[0, :, 0].sum().item()

print(f"Input shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Output sum (should be 1.0 per position): {first_position_total:.4f}")