In [None]:
# Training function for WaveNet
def _run_wavenet_epoch(model, loader, criterion, device, desc, optimizer, training):
    """
    Run a single pass over ``loader`` and return the mean per-batch loss.

    Args:
        model: Model to run; switched to train or eval mode by this helper
        loader: Iterable of dict batches with 'features' and 'targets' tensors
        criterion: Callable taking (outputs, targets) and returning a scalar loss
        device: Device the batch tensors are moved to
        desc: Progress-bar description
        optimizer: Optimizer stepped once per batch (only used when training)
        training: True for a training pass, False for an evaluation pass

    Returns:
        Mean of the per-batch losses, or NaN if the loader yielded no batches
    """
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    num_batches = 0

    # Gradients are only tracked during the training pass.
    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            # Original author's notes give both tensors as [batch, 2, seq_len].
            features = batch['features'].to(device)
            targets = batch['targets'].to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(features)
            # The loss is whatever `criterion` implements; nothing here assumes
            # a particular loss function.
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            num_batches += 1

    # An empty loader would otherwise raise ZeroDivisionError; report NaN so the
    # caller can still see that no batches were processed.
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches


def train_wavenet(model, train_loader, val_loader, criterion, optimizer, device, epochs=10):
    """
    Train WaveNet model.

    Each epoch runs one training pass over ``train_loader`` followed by one
    evaluation pass over ``val_loader``, and prints both mean losses.

    Args:
        model: WaveNet model
        train_loader: Training data loader yielding dicts with 'features' and 'targets'
        val_loader: Validation data loader yielding dicts with 'features' and 'targets'
        criterion: Loss function called as criterion(outputs, targets)
        optimizer: Optimizer
        device: Device to train on
        epochs: Number of epochs

    Returns:
        train_losses: List of mean training losses, one per epoch
        val_losses: List of mean validation losses, one per epoch
            (an entry is NaN when the corresponding loader was empty)
    """
    model = model.to(device)
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        avg_train_loss = _run_wavenet_epoch(
            model, train_loader, criterion, device,
            desc=f"Epoch {epoch+1}/{epochs} - Training",
            optimizer=optimizer, training=True,
        )
        train_losses.append(avg_train_loss)

        avg_val_loss = _run_wavenet_epoch(
            model, val_loader, criterion, device,
            desc=f"Epoch {epoch+1}/{epochs} - Validation",
            optimizer=optimizer, training=False,
        )
        val_losses.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {avg_train_loss:.6f}, Val Loss = {avg_val_loss:.6f}")

    return train_losses, val_losses

# Custom loss function for character-level predictions
class CharacterLevelLoss(nn.Module):
    """Binary cross-entropy over character-level start/end predictions."""

    def __init__(self):
        super().__init__()
        self.bce_loss = nn.BCELoss()

    def forward(self, predictions, targets):
        """
        Calculate loss for character-level predictions.

        Args:
            predictions: Model predictions [batch, 2, seq_len]. They must
                already be probabilities in [0, 1]: nn.BCELoss applies no
                activation of its own. (The original author's note says the
                model applies its final activation itself — confirm against
                the model definition.)
            targets: Ground truth [batch, 2, seq_len], same shape as
                predictions. Any numeric dtype is accepted.

        Returns:
            loss: Scalar BCE loss averaged over all start and end positions
        """
        # nn.BCELoss raises if the target dtype differs from the input dtype
        # (e.g. integer 0/1 labels), so align it explicitly. This is a no-op
        # when the dtypes already match.
        targets = targets.to(dtype=predictions.dtype)
        return self.bce_loss(predictions, targets)

# Smoke test: run the whole training pipeline on a tiny sample
print("Testing WaveNet training...")

# Draw a small reproducible sample and pull out the three columns we need
train_subset = train_df.sample(n=50, random_state=42)
train_texts, train_sentiments, train_selected_texts = (
    train_subset[column].tolist()
    for column in ('text', 'sentiment', 'selected_text')
)

# Positional 80/20 split of the sample into train and validation parts
train_size = int(0.8 * len(train_texts))
train_texts_split, val_texts_split = train_texts[:train_size], train_texts[train_size:]
train_sentiments_split, val_sentiments_split = (
    train_sentiments[:train_size],
    train_sentiments[train_size:],
)
train_selected_texts_split, val_selected_texts_split = (
    train_selected_texts[:train_size],
    train_selected_texts[train_size:],
)

print(f"Train samples: {len(train_texts_split)}")
print(f"Val samples: {len(val_texts_split)}")

# Both datasets share the same feature extractor, tokenizer and device
_dataset_kwargs = {
    'roberta_model': model,
    'tokenizer': tokenizer,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}

train_dataset = CharacterLevelDataset(
    texts=train_texts_split,
    sentiments=train_sentiments_split,
    selected_texts=train_selected_texts_split,
    **_dataset_kwargs,
)
val_dataset = CharacterLevelDataset(
    texts=val_texts_split,
    sentiments=val_sentiments_split,
    selected_texts=val_selected_texts_split,
    **_dataset_kwargs,
)

# Only the training loader is shuffled
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

# Deliberately small network: fewer blocks/layers and narrow channels keep the
# smoke test quick
wavenet_model = CharacterWaveNet(
    input_channels=2,
    num_classes=2,
    num_blocks=2,
    num_layers=4,
    residual_channels=16,
    gate_channels=16,
    skip_channels=16,
)

criterion = CharacterLevelLoss()
optimizer = torch.optim.Adam(wavenet_model.parameters(), lr=0.001)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Training on device: {device}")

# Two epochs are enough to confirm the loop runs end to end
train_losses, val_losses = train_wavenet(
    wavenet_model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    device,
    epochs=2,
)

print(f"\nFinal train loss: {train_losses[-1]:.6f}")
print(f"Final val loss: {val_losses[-1]:.6f}")

In [None]:
# Full 5-fold CV training for Character-Level WaveNet
from sklearn.model_selection import KFold
import gc

def extract_predictions_from_wavenet(model, features, text, device='cuda'):
    """
    Extract refined character-level predictions from trained WaveNet.

    The start and end characters are the argmax of the start and end
    probability rows, searched only over positions that lie inside ``text``
    so that padded positions beyond the text can never be selected.

    Args:
        model: Trained WaveNet model (put into eval mode by this function)
        features: Character-level features [2, seq_len]
        text: Original text
        device: Device the features are moved to for inference; the model is
            expected to already live on this device

    Returns:
        refined_span: Refined selected_text prediction (empty string when
            there are no valid positions)
        refined_probs: NumPy array [2, seq_len] of the model output, row 0 =
            start probabilities, row 1 = end probabilities
    """
    model.eval()

    with torch.no_grad():
        # Add batch dimension
        features_tensor = features.unsqueeze(0).to(device)  # [1, 2, seq_len]

        # Get refined probabilities and drop the batch dimension again
        refined_probs = model(features_tensor)  # [1, 2, seq_len]
        refined_probs = refined_probs.squeeze(0).cpu().numpy()  # [2, seq_len]

    # Positions at or beyond len(text) are padding, and the model output may
    # also be shorter than the text; only the overlap is searchable.
    valid_len = min(len(text), refined_probs.shape[-1])
    if valid_len == 0:
        # np.argmax raises on an empty slice; there is nothing to select.
        return "", refined_probs

    start_char = int(np.argmax(refined_probs[0, :valid_len]))
    end_char = int(np.argmax(refined_probs[1, :valid_len]))

    # An end before the start collapses to a single-character span
    end_char = max(start_char, end_char)

    # Extract span (end index is inclusive)
    refined_span = text[start_char:end_char + 1]

    return refined_span, refined_probs

# Set up K-fold CV (shuffled with a fixed seed so folds are reproducible)
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Prepare data
texts = train_df['text'].tolist()
sentiments = train_df['sentiment'].tolist()
selected_texts = train_df['selected_text'].tolist()

# Store results
fold_scores = []
all_predictions = []
all_targets = []

print(f"Starting {n_splits}-fold CV for Character-Level WaveNet...")
print(f"Total samples: {len(train_df)}")

# Create experiment directory
exp_dir = Path('/home/code/experiments/004_character_level_wavenet')
exp_dir.mkdir(parents=True, exist_ok=True)

# The device does not change between folds, so resolve it once
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    print(f"\n{'='*60}")
    print(f"Fold {fold + 1}/{n_splits}")
    print(f"{'='*60}")

    # Split data
    train_texts_fold = [texts[i] for i in train_idx]
    train_sentiments_fold = [sentiments[i] for i in train_idx]
    train_selected_texts_fold = [selected_texts[i] for i in train_idx]

    val_texts_fold = [texts[i] for i in val_idx]
    val_sentiments_fold = [sentiments[i] for i in val_idx]
    val_selected_texts_fold = [selected_texts[i] for i in val_idx]

    print(f"Train samples: {len(train_texts_fold)}")
    print(f"Val samples: {len(val_texts_fold)}")

    # Load trained RoBERTa model for this fold
    roberta_model_path = f'/home/code/experiments/002_roberta_span/fold_{fold}_model.pt'
    if Path(roberta_model_path).exists():
        # NOTE(review): this unpickles a whole module, so only load checkpoints
        # you trust. Recent PyTorch releases default torch.load to
        # weights_only=True, which rejects pickled modules — confirm against
        # the installed version.
        roberta_model = torch.load(roberta_model_path, map_location='cpu')
        roberta_model.eval()
        print(f"Loaded RoBERTa model from {roberta_model_path}")
    else:
        print(f"Warning: RoBERTa model not found at {roberta_model_path}")
        print("Using pretrained RoBERTa (performance will be suboptimal)")
        roberta_model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

    roberta_model = roberta_model.to(device)

    # Create datasets
    train_dataset = CharacterLevelDataset(
        texts=train_texts_fold,
        sentiments=train_sentiments_fold,
        selected_texts=train_selected_texts_fold,
        roberta_model=roberta_model,
        tokenizer=tokenizer,
        device=device
    )

    val_dataset = CharacterLevelDataset(
        texts=val_texts_fold,
        sentiments=val_sentiments_fold,
        selected_texts=val_selected_texts_fold,
        roberta_model=roberta_model,
        tokenizer=tokenizer,
        device=device
    )

    # Create data loaders
    # Use smaller batch size due to memory constraints with character-level data
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
    # shuffle=False matters here: scoring below pairs predictions with
    # val_selected_texts_fold by position.
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

    print(f"Train batches: {len(train_loader)}")
    print(f"Val batches: {len(val_loader)}")

    # Initialize WaveNet model
    # Use smaller architecture to fit in GPU memory
    wavenet_model = CharacterWaveNet(
        input_channels=2,
        num_classes=2,
        num_blocks=3,      # Reduced from 4
        num_layers=5,      # Reduced from 6
        residual_channels=64,
        gate_channels=64,
        skip_channels=64
    )

    # Loss and optimizer
    criterion = CharacterLevelLoss()
    optimizer = torch.optim.Adam(wavenet_model.parameters(), lr=0.001)

    # Train the model
    print(f"\nTraining WaveNet for fold {fold + 1}...")
    train_losses, val_losses = train_wavenet(
        wavenet_model, train_loader, val_loader, criterion, optimizer, device, epochs=10
    )

    # Save the trained model (whole module, not just the state dict)
    model_path = exp_dir / f'fold_{fold}_wavenet_model.pt'
    torch.save(wavenet_model, model_path)
    print(f"Saved WaveNet model to {model_path}")

    # Generate predictions on validation set
    print(f"\nGenerating predictions for fold {fold + 1}...")
    fold_predictions = []
    fold_targets = []

    wavenet_model.eval()
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Predicting fold {fold + 1}"):
            features = batch['features'].to(device)
            texts_batch = batch['text']
            selected_texts_batch = batch['selected_text']

            # Get refined predictions
            refined_probs = wavenet_model(features)  # [batch, 2, seq_len]
            refined_probs = refined_probs.cpu().numpy()

            # Extract spans for each sample in batch
            for i, (text, selected_text) in enumerate(zip(texts_batch, selected_texts_batch)):
                # Only positions inside the text are valid; the rest of the
                # sequence axis is padding.
                valid_len = min(len(text), refined_probs.shape[-1])

                if valid_len == 0:
                    # np.argmax raises ValueError on an empty slice, so an
                    # empty text yields an empty prediction instead.
                    refined_span = ""
                else:
                    # Get start and end positions from refined probabilities
                    start_char = int(np.argmax(refined_probs[i, 0, :valid_len]))
                    end_char = int(np.argmax(refined_probs[i, 1, :valid_len]))

                    # An end before the start collapses to a single character
                    end_char = max(start_char, end_char)

                    # Extract span (end index is inclusive)
                    refined_span = text[start_char:end_char + 1]

                fold_predictions.append(refined_span)
                fold_targets.append(selected_text)

    # Calculate Jaccard score for this fold against the raw ground-truth spans.
    # Stored as a plain float so the printed list and the JSON stay free of
    # NumPy scalar wrappers.
    fold_score = float(np.mean([
        jaccard_similarity(pred, target)
        for pred, target in zip(fold_predictions, val_selected_texts_fold)
    ]))

    print(f"Fold {fold + 1} Jaccard Score: {fold_score:.6f}")
    fold_scores.append(fold_score)

    # Store predictions
    all_predictions.extend(fold_predictions)
    all_targets.extend(fold_targets)

    # Clean up: drop the per-fold models and loaders before the next fold
    # allocates its own
    del roberta_model, wavenet_model, train_dataset, val_dataset
    del train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

# Calculate overall CV score
overall_score = float(np.mean(fold_scores))
score_std = float(np.std(fold_scores))
print(f"\n{'='*60}")
print(f"Overall {n_splits}-Fold CV Score: {overall_score:.6f}")
print(f"Fold scores: {fold_scores}")
print(f"Std dev: {score_std:.6f}")
print(f"{'='*60}")

# Save CV results
cv_results = {
    'overall_score': overall_score,
    'fold_scores': fold_scores,
    'std_dev': score_std,
    'num_folds': n_splits
}

with open(exp_dir / 'cv_results.json', 'w') as f:
    json.dump(cv_results, f, indent=2)

print(f"Saved CV results to {exp_dir / 'cv_results.json'}")