# Experiment 004: Character-Level WaveNet Model

**Objective**: Implement character-level WaveNet refinement to improve span boundary detection

**Expected gain**: +0.015 to +0.025 points
**Priority**: HIGHEST (addresses core weakness in boundary detection)

**Approach**:
1. Modify RoBERTa inference to save token start/end probability distributions
2. Convert token probabilities to character-level probabilities using offset_mapping
3. Build WaveNet with dilated convolutions for smoothing
4. Train on 5-fold CV using character-level probabilities as features
5. Generate refined predictions with better boundary detection

**Reference**: Winning solution by Theo Viel (dark-of-the-moon) - Character-level WaveNet was key innovation

In [41]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import json
import sys
import re

print("Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Define Jaccard similarity metric
def jaccard_similarity(str1, str2):
    """Calculate Jaccard similarity between two strings."""
    if pd.isna(str1) or pd.isna(str2):
        return 0.0
    
    str1, str2 = str(str1), str(str2)
    
    # Tokenize by splitting on whitespace
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    
    # Handle empty sets
    if len(a) == 0 and len(b) == 0:
        return 1.0
    
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

Libraries imported successfully
PyTorch version: 2.2.0+cu118
CUDA available: False


In [42]:
# Load training data
train_path = Path('/home/data/train.csv')
train_df = pd.read_csv(train_path)

print(f"Loaded {len(train_df)} training samples")
print(f"Columns: {list(train_df.columns)}")
print(f"\nSentiment distribution:")
print(train_df['sentiment'].value_counts())

# Show sample
print(f"\nSample row:")
print(train_df.iloc[0])

Loaded 27481 training samples
Columns: ['textID', 'text', 'selected_text', 'sentiment']

Sentiment distribution:
sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

Sample row:
textID                                     cb774db0d1
text              I`d have responded, if I were going
selected_text     I`d have responded, if I were going
sentiment                                     neutral
Name: 0, dtype: object


In [43]:
# Initialize tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
print(f"Tokenizer loaded: {tokenizer.name_or_path}")

# Test tokenization with offset mapping
text = train_df.iloc[0]['text']
selected_text = train_df.iloc[0]['selected_text']

print(f"\nOriginal text: '{text}'")
print(f"Selected text: '{selected_text}'")

# Tokenize with offset mapping
encoding = tokenizer(
    text,
    return_offsets_mapping=True,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128
)

print(f"\nToken IDs shape: {encoding['input_ids'].shape}")
print(f"Offset mapping shape: {encoding['offset_mapping'].shape}")
print(f"\nFirst 10 tokens and their offsets:")
for i in range(min(10, len(encoding['input_ids'][0]))):
    token_id = encoding['input_ids'][0][i].item()
    token = tokenizer.decode([token_id])
    offsets = encoding['offset_mapping'][0][i]
    print(f"  {i}: '{token}' -> offsets: {offsets}")

Tokenizer loaded: roberta-base

Original text: ' I`d have responded, if I were going'
Selected text: 'I`d have responded, if I were going'

Token IDs shape: torch.Size([1, 12])
Offset mapping shape: torch.Size([1, 12, 2])

First 10 tokens and their offsets:
  0: '<s>' -> offsets: tensor([0, 0])
  1: ' I' -> offsets: tensor([1, 2])
  2: '`' -> offsets: tensor([2, 3])
  3: 'd' -> offsets: tensor([3, 4])
  4: ' have' -> offsets: tensor([5, 9])
  5: ' responded' -> offsets: tensor([10, 19])
  6: ',' -> offsets: tensor([19, 20])
  7: ' if' -> offsets: tensor([21, 23])
  8: ' I' -> offsets: tensor([24, 25])
  9: ' were' -> offsets: tensor([26, 30])


In [44]:
# Function to extract token start/end probabilities from RoBERTa
@torch.no_grad()
def extract_token_probabilities(model, text, sentiment, tokenizer, device='cpu'):
    """
    Extract token-level start and end probability distributions from RoBERTa.
    Returns probabilities for each token position.
    """
    # Prepare input with sentiment token
    input_text = f"{sentiment} {text}"
    encoding = tokenizer(
        input_text,
        return_offsets_mapping=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    offset_mapping = encoding['offset_mapping'][0]  # Remove batch dim
    
    # Get model outputs
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    start_logits = outputs.start_logits[0]  # [seq_len]
    end_logits = outputs.end_logits[0]      # [seq_len]
    
    # Convert to probabilities
    start_probs = F.softmax(start_logits, dim=-1)
    end_probs = F.softmax(end_logits, dim=-1)
    
    return start_probs, end_probs, offset_mapping

# Test the function
print("Testing token probability extraction...")
test_text = train_df.iloc[0]['text']
test_sentiment = train_df.iloc[0]['sentiment']

# Load a simple model for testing
try:
    roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
    print("Loaded pretrained RoBERTa for testing")
except Exception as e:
    print(f"Error loading model: {e}")
    roberta_model = None

if roberta_model is not None:
    start_probs, end_probs, offset_mapping = extract_token_probabilities(
        roberta_model, test_text, test_sentiment, tokenizer, device='cpu'
    )
    print(f"Start probs shape: {start_probs.shape}")
    print(f"End probs shape: {end_probs.shape}")
    print(f"Offset mapping shape: {offset_mapping.shape}")
    print(f"First 5 start probabilities: {start_probs[:5]}")
    print("Token probability extraction test completed successfully")

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing token probability extraction...
Loaded pretrained RoBERTa for testing


Start probs shape: torch.Size([14])
End probs shape: torch.Size([14])
Offset mapping shape: torch.Size([14, 2])
First 5 start probabilities: tensor([0.0778, 0.0857, 0.0822, 0.0617, 0.0889])
Token probability extraction test completed successfully


In [45]:
# Function to convert token probabilities to character probabilities
def token_to_char_probabilities(token_probs, offset_mapping, text_length):
    """
    Convert token-level probabilities to character-level probabilities.
    
    Args:
        token_probs: [seq_len] array of token probabilities
        offset_mapping: [seq_len, 2] array of (start_char, end_char) offsets
        text_length: Length of original text
        
    Returns:
        char_probs: [text_length] array of character-level probabilities
    """
    char_probs = np.zeros(text_length)
    char_counts = np.zeros(text_length)
    
    # Convert token_probs to numpy if it's a tensor
    if isinstance(token_probs, torch.Tensor):
        token_probs = token_probs.cpu().numpy()
    
    # Convert offset_mapping to numpy if it's a tensor
    if isinstance(offset_mapping, torch.Tensor):
        offset_mapping = offset_mapping.cpu().numpy()
    
    # Aggregate probabilities from tokens to characters
    for token_idx, (char_start, char_end) in enumerate(offset_mapping):
        # Skip special tokens (offset = (0, 0))
        if char_start == 0 and char_end == 0:
            continue
            
        # Ensure offsets are within bounds
        char_start = max(0, min(char_start, text_length - 1))
        char_end = max(0, min(char_end, text_length - 1))
        
        if char_start < char_end:
            # Distribute token probability across characters
            num_chars = char_end - char_start
            char_probs[char_start:char_end] += token_probs[token_idx] / num_chars
            char_counts[char_start:char_end] += 1
    
    # Avoid division by zero
    char_counts = np.maximum(char_counts, 1)
    char_probs = char_probs / char_counts
    
    return char_probs

# Simple test without complex printing
print("Testing token to character probability conversion...")
test_start_probs, test_end_probs, test_offset_mapping = extract_token_probabilities(
    roberta_model, test_text, test_sentiment, tokenizer, device='cpu'
)

print(f"Token probs type: {type(test_start_probs)}")
print(f"Offset mapping type: {type(test_offset_mapping)}")

char_start_probs = token_to_char_probabilities(
    test_start_probs, test_offset_mapping, len(test_text)
)

print(f"Character-level probabilities shape: {char_start_probs.shape}")
print("Conversion test completed successfully")

Testing token to character probability conversion...
Token probs type: <class 'torch.Tensor'>
Offset mapping type: <class 'torch.Tensor'>
Character-level probabilities shape: (36,)
Conversion test completed successfully


In [46]:
# WaveNet architecture for character-level probability refinement
class CharacterWaveNet(nn.Module):
    def __init__(self, input_channels=2, num_classes=2, num_blocks=4, num_layers=6, 
                 residual_channels=32, gate_channels=32, skip_channels=32):
        """
        WaveNet for character-level span prediction refinement.
        
        Args:
            input_channels: Number of input channels (start_prob, end_prob)
            num_classes: Number of output classes (refined_start, refined_end)
            num_blocks: Number of residual blocks
            num_layers: Number of layers per block (dilated convolutions)
            residual_channels: Channels in residual connections
            gate_channels: Channels in gated activation units
            skip_channels: Channels in skip connections
        """
        super(CharacterWaveNet, self).__init__()
        
        self.num_blocks = num_blocks
        self.num_layers = num_layers
        
        # Input projection
        self.input_projection = nn.Conv1d(input_channels, residual_channels, 1)
        
        # Dilated convolution layers
        self.dilated_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()
        
        # Build dilated convolution blocks
        for b in range(num_blocks):
            for l in range(num_layers):
                # Dilation doubles each layer: 1, 2, 4, 8, 16, 32, ...
                dilation = 2 ** l
                
                # Gated activation unit (dilated convolution)
                self.dilated_convs.append(
                    nn.Conv1d(residual_channels, gate_channels, kernel_size=3, 
                             padding=dilation, dilation=dilation)
                )
                
                # 1x1 conv for residual connection
                self.residual_convs.append(
                    nn.Conv1d(gate_channels // 2, residual_channels, 1)
                )
                
                # 1x1 conv for skip connection
                self.skip_convs.append(
                    nn.Conv1d(gate_channels // 2, skip_channels, 1)
                )
        
        # Output layers
        self.output_conv1 = nn.Conv1d(skip_channels, skip_channels, 1)
        self.output_conv2 = nn.Conv1d(skip_channels, num_classes, 1)
        
        # Activation functions
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        
    def forward(self, x):
        """
        Forward pass.
        
        Args:
            x: Input tensor [batch, input_channels, seq_len]
            
        Returns:
            output: Output tensor [batch, num_classes, seq_len]
        """
        # Input projection
        x = self.input_projection(x)  # [batch, residual_channels, seq_len]
        
        # Skip connections accumulator
        skip_connections = []
        
        # Apply dilated convolutions
        layer_idx = 0
        for b in range(self.num_blocks):
            for l in range(self.num_layers):
                # Gated activation: tanh(x) * sigmoid(x)
                dilated_out = self.dilated_convs[layer_idx](x)
                
                # Split into two parts for gating
                tanh_out = self.tanh(dilated_out[:, :dilated_out.shape[1]//2, :])
                sigmoid_out = self.sigmoid(dilated_out[:, dilated_out.shape[1]//2:, :])
                
                gated_out = tanh_out * sigmoid_out
                
                # Residual connection
                residual_out = self.residual_convs[layer_idx](gated_out)
                x = x + residual_out
                
                # Skip connection
                skip_out = self.skip_convs[layer_idx](gated_out)
                skip_connections.append(skip_out)
                
                layer_idx += 1
        
        # Sum all skip connections
        skip_sum = sum(skip_connections)  # [batch, skip_channels, seq_len]
        
        # Output layers
        output = self.relu(skip_sum)
        output = self.output_conv1(output)
        output = self.relu(output)
        output = self.output_conv2(output)
        
        # Apply softmax to get probabilities
        output = F.softmax(output, dim=1)  # [batch, num_classes, seq_len]
        
        return output

# Test the WaveNet architecture
print("Testing WaveNet architecture...")

batch_size = 2
seq_len = 100
input_channels = 2

# Create dummy input (character-level start/end probabilities)
dummy_input = torch.randn(batch_size, input_channels, seq_len)

# Initialize WaveNet model (use different variable name to avoid confusion)
wavenet_model = CharacterWaveNet(
    input_channels=input_channels,
    num_classes=2,
    num_blocks=2,  # Reduced for testing
    num_layers=4,   # Reduced for testing
    residual_channels=16,
    gate_channels=16,
    skip_channels=16
)

# Forward pass
with torch.no_grad():
    output = wavenet_model(dummy_input)

print(f"Input shape: {dummy_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Output sum (should be 1.0 per position): {output[0, :, 0].sum().item():.4f}")

Testing WaveNet architecture...
Input shape: torch.Size([2, 2, 100])
Output shape: torch.Size([2, 2, 100])
Output sum (should be 1.0 per position): 1.0000


In [47]:
# Dataset for character-level training
class CharacterLevelDataset(Dataset):
    def __init__(self, texts, sentiments, selected_texts, roberta_model, tokenizer, device='cpu'):
        """
        Dataset for character-level WaveNet training.
        
        Args:
            texts: List of tweet texts
            sentiments: List of sentiment labels
            selected_texts: List of selected_text spans (targets)
            roberta_model: Trained RoBERTa model for generating token probabilities
            tokenizer: Tokenizer for text processing
        """
        self.texts = texts
        self.sentiments = sentiments
        self.selected_texts = selected_texts
        self.roberta_model = roberta_model
        self.tokenizer = tokenizer
        self.device = device
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        sentiment = self.sentiments[idx]
        selected_text = self.selected_texts[idx]
        
        # Extract token probabilities from RoBERTa
        start_probs, end_probs, offset_mapping = extract_token_probabilities(
            self.roberta_model, text, sentiment, self.tokenizer, self.device
        )
        
        # Convert to character probabilities
        char_start_probs = token_to_char_probabilities(
            start_probs.cpu().numpy(), offset_mapping, len(text)
        )
        char_end_probs = token_to_char_probabilities(
            end_probs.cpu().numpy(), offset_mapping, len(text)
        )
        
        # Create input features [2, text_length]
        char_features = np.stack([char_start_probs, char_end_probs], axis=0)
        
        # Create target labels (binary masks for start/end positions)
        start_target = np.zeros(len(text))
        end_target = np.zeros(len(text))
        
        if sentiment != 'neutral':
            # Find selected text boundaries
            try:
                start_idx = text.index(selected_text)
                end_idx = start_idx + len(selected_text) - 1
                start_target[start_idx] = 1.0
                end_target[end_idx] = 1.0
            except:
                # Handle cases where selected_text is not found
                pass
        
        target = np.stack([start_target, end_target], axis=0)
        
        return {
            'features': torch.FloatTensor(char_features),
            'target': torch.FloatTensor(target),
            'text': text,
            'selected_text': selected_text
        }

# Test dataset creation
print("Testing dataset creation...")

# Load a trained RoBERTa model if available
try:
    # Try to load the model from experiment 002
    model_path = Path('/home/code/experiments/002_roberta_span/fold_0_roberta_model.pt')
    if model_path.exists():
        roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
        checkpoint = torch.load(model_path, map_location='cpu')
        roberta_model.load_state_dict(checkpoint['model_state_dict'])
        roberta_model.eval()
        print(f"Loaded trained RoBERTa model from {model_path}")
    else:
        print("No trained model found, using pretrained RoBERTa")
        roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
except Exception as e:
    print(f"Error loading model: {e}")
    roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

# Create small test dataset
test_texts = train_df['text'].iloc[:100].tolist()
test_sentiments = train_df['sentiment'].iloc[:100].tolist()
test_selected = train_df['selected_text'].iloc[:100].tolist()

test_dataset = CharacterLevelDataset(
    test_texts, test_sentiments, test_selected, roberta_model, tokenizer, device='cpu'
)

print(f"Dataset created with {len(test_dataset)} samples")
sample = test_dataset[0]
print(f"Sample features shape: {sample['features'].shape}")
print(f"Sample target shape: {sample['target'].shape}")
print(f"Sample text: {sample['text'][:50]}...")
print(f"Sample selected_text: {sample['selected_text']}")

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing dataset creation...
No trained model found, using pretrained RoBERTa
Dataset created with 100 samples


Sample features shape: torch.Size([2, 36])
Sample target shape: torch.Size([2, 36])
Sample text:  I`d have responded, if I were going...
Sample selected_text: I`d have responded, if I were going


In [48]:
# Training function for WaveNet
def train_wavenet_model(model, train_loader, val_loader, device, epochs=10, lr=0.001):
    """Train WaveNet model on character-level features."""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()
    
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            features = batch['features'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        
        # Validation phase
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                features = batch['features'].to(device)
                targets = batch['target'].to(device)
                
                outputs = model(features)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_wavenet_model.pt')
            print(f"  Saved best model with val_loss = {val_loss:.4f}")
    
    return model

# Function to generate predictions with WaveNet refinement
def generate_wavenet_predictions(model, texts, sentiments, roberta_model, tokenizer, device='cpu'):
    """Generate refined predictions using WaveNet."""
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for i in tqdm(range(len(texts)), desc="Generating predictions"):
            text = texts[i]
            sentiment = sentiments[i]
            
            if sentiment == 'neutral':
                # For neutral sentiment, predict the entire text
                predictions.append(text)
                continue
            
            # Extract token probabilities from RoBERTa
            start_probs, end_probs, offset_mapping = extract_token_probabilities(
                roberta_model, text, sentiment, tokenizer, device
            )
            
            # Convert to character probabilities
            char_start_probs = token_to_char_probabilities(
                start_probs.cpu().numpy(), offset_mapping, len(text)
            )
            char_end_probs = token_to_char_probabilities(
                end_probs.cpu().numpy(), offset_mapping, len(text)
            )
            
            # Create input features
            char_features = torch.FloatTensor(
                np.stack([char_start_probs, char_end_probs], axis=0)
            ).unsqueeze(0).to(device)  # Add batch dimension
            
            # Apply WaveNet refinement
            refined_probs = torch.sigmoid(model(char_features)).squeeze(0).cpu().numpy()
            
            # Extract refined start and end positions
            refined_start_probs = refined_probs[0]
            refined_end_probs = refined_probs[1]
            
            # Find best span using refined probabilities
            start_idx = np.argmax(refined_start_probs)
            end_idx = np.argmax(refined_end_probs)
            
            # Ensure valid span
            if start_idx > end_idx:
                # Swap if needed
                start_idx, end_idx = end_idx, start_idx
            
            # Extract prediction
            prediction = text[start_idx:end_idx+1]
            
            # Handle edge cases
            if not prediction.strip():
                prediction = text
            
            predictions.append(prediction)
    
    return predictions

print("Training and prediction functions defined successfully")

Training and prediction functions defined successfully


In [50]:
# Simplified main execution: Single fold training for proof of concept
import warnings
warnings.filterwarnings('ignore')

def run_character_level_experiment():
    """Run simplified character-level WaveNet experiment for proof of concept."""
    print("="*60)
    print("Character-Level WaveNet Experiment (Proof of Concept)")
    print("="*60)
    
    # Configuration
    device = 'cpu'
    batch_size = 8  # Very small for CPU
    epochs = 2      # Very few epochs for speed
    lr = 0.001
    
    print(f"Device: {device}")
    print(f"Batch size: {batch_size}")
    print(f"Epochs: {epochs}")
    print(f"Learning rate: {lr}")
    print()
    
    try:
        # Load RoBERTa model (use pretrained since no trained model exists)
        print("Loading RoBERTa model...")
        roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
        roberta_model.eval()
        print("RoBERTa model loaded successfully\n")
        
        # Create small subset for quick testing
        subset_size = 100
        subset_idx = np.random.choice(len(train_df), subset_size, replace=False)
        texts = train_df['text'].values[subset_idx]
        sentiments = train_df['sentiment'].values[subset_idx]  # FIX: Use .values, not .value_counts().values
        selected_texts = train_df['selected_text'].values[subset_idx]
        
        print(f"Created subset of {subset_size} samples for testing\n")
        
        # Create datasets
        print("Creating character-level datasets...")
        train_dataset = CharacterLevelDataset(
            texts, sentiments, selected_texts, 
            roberta_model, tokenizer, device
        )
        
        # Use same data for validation (small experiment)
        val_dataset = train_dataset
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        print(f"Datasets created: {len(train_dataset)} train, {len(val_dataset)} val\n")
        
        # Initialize WaveNet model
        print("Initializing WaveNet model...")
        # Get input shape from first sample
        sample_features, _, sample_text = train_dataset[0]
        input_channels = sample_features.shape[0]
        
        wavenet_model = CharacterWaveNet(
            input_channels=input_channels,
            num_classes=2,
            num_blocks=2,  # Reduced for speed
            num_layers=4,  # Reduced for speed
            residual_channels=16,  # Reduced for speed
            gate_channels=16,
            skip_channels=16
        )
        print(f"WaveNet initialized with input_channels={input_channels}\n")
        
        # Train model
        print("Training WaveNet...")
        train_losses, val_losses = train_wavenet_model(
            wavenet_model, train_loader, val_loader, device, epochs, lr
        )
        print("\nTraining completed\n")
        
        # Evaluate on validation set
        print("Evaluating on validation set...")
        val_score, val_std, predictions, targets = evaluate_wavenet_model(
            wavenet_model, val_loader, device
        )
        print(f"Validation Jaccard Score: {val_score:.4f} ± {val_std:.4f}\n")
        
        # Show sample predictions
        print("Sample predictions:")
        for i in range(min(3, len(predictions))):
            orig_text = texts[i]
            true_span = selected_texts[i]
            pred_span = predictions[i]
            print(f"\n{i+1}. Original: '{orig_text[:80]}...'")
            print(f"   True: '{true_span}'")
            print(f"   Pred: '{pred_span}'")
        
        return val_score, val_std, predictions, targets
        
    except Exception as e:
        print(f"Error during experiment: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None, None

# Run the experiment
if __name__ == "__main__":
    val_score, val_std, predictions, targets = run_character_level_experiment()
    
    if val_score is not None:
        print("\n" + "="*60)
        print("EXPERIMENT SUMMARY")
        print("="*60)
        print(f"Final Validation Score: {val_score:.4f} ± {val_std:.4f}")
        print("\nNote: This is a proof-of-concept with limited training.")
        print("Full training would require GPU and more epochs.")
    else:
        print("\nExperiment failed. Check error messages above.")

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Character-Level WaveNet Experiment (Proof of Concept)
Device: cpu
Batch size: 8
Epochs: 2
Learning rate: 0.001

Loading RoBERTa model...
RoBERTa model loaded successfully

Created subset of 100 samples for testing

Creating character-level datasets...
Datasets created: 100 train, 100 val

Initializing WaveNet model...


Error during experiment: too many values to unpack (expected 3)

Experiment failed. Check error messages above.


Traceback (most recent call last):
  File "/tmp/ipykernel_283214/462461461.py", line 56, in run_character_level_experiment
    sample_features, _, sample_text = train_dataset[0]
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 3)
