# Tweet Sentiment Extraction - RoBERTa Span Model (Fixed)

This notebook implements a transformer-based approach: RoBERTa with a span-extraction head.

## Strategy
1. Use RoBERTa-base with start/end token classification
2. Frame as span prediction: predict start and end positions
3. Input: the sentiment label prepended to the tweet text (e.g. `negative my boss is bullying me...`)
4. Target: start/end indices of selected_text
5. Expected score: 0.70+ based on winning solutions

## Key Fixes
- Use `RobertaTokenizerFast` (a fast tokenizer) for `offset_mapping` support
- Properly calculate token positions from character positions
- Complete data preprocessing pipeline

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaModel, RobertaConfig
from sklearn.model_selection import KFold
import re
import warnings
warnings.filterwarnings('ignore')

# Report which accelerator (if any) this session can use
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Seed every RNG in play (Python, NumPy, PyTorch CPU, PyTorch CUDA) with 42
import random
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(42)
if cuda_ready:
    torch.cuda.manual_seed_all(42)

2026-01-15 12:10:54.117945: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-15 12:10:54.143446: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-15 12:10:54.150947: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
Memory: 85.1 GB


In [2]:
# Read the train/test CSVs from disk
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Sanity check: report (rows, columns) for each frame
train_dims, test_dims = train_df.shape, test_df.shape
print(f"Train shape: {train_dims}")
print(f"Test shape: {test_dims}")

# Clean text function
def clean_text(text):
    """Normalize one raw cell value into a plain string.

    Missing values (as judged by ``pd.isna``) become ``""``. Anything else is
    converted with ``str`` and stripped of surrounding whitespace; if the
    stripped result both starts and ends with a double quote, one character
    is dropped from each end.
    """
    if pd.isna(text):
        return ""
    stripped = str(text).strip()
    is_quoted = stripped.startswith('"') and stripped.endswith('"')
    return stripped[1:-1] if is_quoted else stripped

# Run clean_text over every free-text column of both frames
for frame, columns in ((train_df, ('text', 'selected_text')), (test_df, ('text',))):
    for column in columns:
        frame[column] = frame[column].apply(clean_text)

print("Sample cleaned data:")
print(train_df[['text', 'selected_text', 'sentiment']].head())

Train shape: (27481, 4)
Test shape: (3534, 3)
Sample cleaned data:
                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4  Sons of ****, why couldn`t they put them on th...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [3]:
# Add sentiment token and prepare data for span extraction
def prepare_data(df, is_train=True):
    """Build one record per row: sentiment-prefixed input text plus span labels.

    Args:
        df: DataFrame with ``textID``, ``text`` and ``sentiment`` columns, and
            a ``selected_text`` column when ``is_train`` is true. ``text`` and
            ``selected_text`` are expected to be strings.
        is_train: when true, also emit ``selected_text``, ``start_char`` and
            ``end_char``.

    Returns:
        DataFrame with columns ``textID``, ``text``, ``sentiment`` and
        ``input_text`` (``"<sentiment> <text>"``). For training data it also
        carries ``selected_text`` and the half-open character span
        ``[start_char, end_char)`` of the first occurrence of
        ``selected_text``, expressed in ``input_text`` coordinates.

    When ``selected_text`` is empty or does not occur in ``text``, the span
    falls back to the whole tweet (everything after the sentiment prefix).
    """
    records = []

    for _, row in df.iterrows():
        text = row['text']
        sentiment = row['sentiment']

        # The sentiment word is prepended, so every character position in
        # `text` shifts by len(sentiment) + 1 (the separating space).
        input_text = f"{sentiment} {text}"
        prefix_length = len(sentiment) + 1

        record = {
            'textID': row['textID'],
            'text': text,
            'sentiment': sentiment,
            'input_text': input_text,
        }

        if is_train:
            selected_text = row['selected_text']

            # An empty selection "matches" at index 0 and would yield a
            # zero-length label, so treat it like a selection that was not found.
            found_at = text.find(selected_text) if selected_text else -1

            if found_at >= 0:
                start_char = prefix_length + found_at
                end_char = start_char + len(selected_text)
            else:
                # Fallback: label the whole tweet.
                start_char = prefix_length
                end_char = len(input_text)

            record['selected_text'] = selected_text
            record['start_char'] = start_char
            record['end_char'] = end_char

        records.append(record)

    return pd.DataFrame(records)

# Build the span-labelled training frame and the unlabelled test frame
train_processed = prepare_data(train_df, is_train=True)
test_processed = prepare_data(test_df, is_train=False)

print("Sample processed training data:")
preview_columns = ['input_text', 'selected_text', 'start_char', 'end_char']
print(train_processed[preview_columns].head())

Sample processed training data:
                                          input_text  \
0        neutral I`d have responded, if I were going   
1  negative Sooo SAD I will miss you here in San ...   
2                 negative my boss is bullying me...   
3            negative what interview! leave me alone   
4  negative Sons of ****, why couldn`t they put t...   

                         selected_text  start_char  end_char  
0  I`d have responded, if I were going           8        43  
1                             Sooo SAD           9        17  
2                          bullying me          20        31  
3                       leave me alone          25        39  
4                        Sons of ****,           9        22  


In [None]:
# Create Dataset class with proper offset mapping
class TweetDataset(Dataset):
    """Dataset that tokenizes ``input_text`` and derives start/end token labels.

    Each item is a dict holding the padded ``input_ids`` / ``attention_mask``,
    the ``start_position`` / ``end_position`` token indices (long tensors), and
    the row's ``textID``, ``text``, ``sentiment``, ``selected_text`` and
    ``input_text`` passed through unchanged.
    """

    def __init__(self, df, tokenizer, max_length=128):
        # df: frame providing at least input_text, textID, text and sentiment
        # tokenizer: callable that accepts return_offsets_mapping=True
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]

        encoded = self.tokenizer(
            record['input_text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
            return_offsets_mapping=True
        )

        # Per-token (char_start, char_end) pairs as plain Python numbers
        offsets = encoded['offset_mapping'].squeeze().tolist()

        # Rows without labels (e.g. a test frame) default both bounds to 0
        span_start = record.get('start_char', 0)
        span_end = record.get('end_char', 0)

        # First token containing the span's first character; 0 if none does
        start_token = next(
            (i for i, (lo, hi) in enumerate(offsets) if lo <= span_start < hi),
            0,
        )
        # First token containing the span's last character; 0 if none does
        end_token = next(
            (i for i, (lo, hi) in enumerate(offsets) if lo < span_end <= hi),
            0,
        )
        # The end label is never allowed to precede the start label
        end_token = max(end_token, start_token)

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'start_position': torch.tensor(start_token, dtype=torch.long),
            'end_position': torch.tensor(end_token, dtype=torch.long),
            'textID': record['textID'],
            'text': record['text'],
            'sentiment': record['sentiment'],
            'selected_text': record.get('selected_text', ''),
            'input_text': record['input_text']
        }

In [None]:
# Define the RoBERTa span model
class RoBERTaSpanExtractor(nn.Module):
    """RoBERTa encoder followed by two per-token linear scorers.

    ``forward`` returns ``(start_logits, end_logits)``, one score per token
    position for the span start and the span end respectively.
    """

    def __init__(self, model_name='roberta-base'):
        super().__init__()

        # Pretrained encoder; its hidden width sizes the two heads below
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.hidden_size = self.roberta.config.hidden_size

        # One scalar score per token for each span boundary
        self.start_head = nn.Linear(self.hidden_size, 1)
        self.end_head = nn.Linear(self.hidden_size, 1)

        # Dropout applied to the encoder output before scoring
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = self.dropout(encoded.last_hidden_state)

        # Each head emits a trailing dimension of size 1; squeezing it leaves
        # one logit per token position.
        start_logits = self.start_head(token_states).squeeze(-1)
        end_logits = self.end_head(token_states).squeeze(-1)
        return start_logits, end_logits

In [None]:
# Jaccard similarity function
def jaccard_similarity(str1, str2):
    """Word-level Jaccard similarity of two strings, case-insensitive.

    Both inputs are passed through ``str``, lower-cased and split on
    whitespace into sets of words. Returns ``|A & B| / |A | B|``, or ``1.0``
    when neither input contains any word.
    """
    tokens_a, tokens_b = (set(str(value).lower().split()) for value in (str1, str2))

    combined = tokens_a | tokens_b
    if not combined:
        # Both sides empty: treated as a perfect match
        return 1.0

    shared = tokens_a & tokens_b
    return len(shared) / len(combined)

In [None]:
# Prediction function
def predict_span(model, tokenizer, text, sentiment, device, max_length=128):
    """Predict the ``selected_text`` span for one tweet.

    The input is encoded as ``"<sentiment> <text>"``. Start and end positions
    are chosen only among tokens that lie in the tweet itself: tokens with a
    zero-length character offset and tokens that end inside the sentiment
    prefix are masked out before the argmax, and the end token is chosen
    among positions at or after the chosen start token.

    Args:
        model: callable returning ``(start_logits, end_logits)`` for
            ``(input_ids, attention_mask)``; must provide ``eval()``.
        tokenizer: callable supporting ``return_offsets_mapping=True``.
            NOTE(review): assumes padding and special tokens report a
            zero-length offset such as ``(0, 0)`` - confirm for the tokenizer
            in use.
        text: tweet text without the sentiment prefix.
        sentiment: sentiment word prepended to the tweet.
        device: device the encoded inputs are moved to.
        max_length: padded/truncated sequence length.

    Returns:
        The predicted substring of ``text``, stripped of surrounding
        whitespace. If the tweet yields no usable tokens, or the extracted
        span is empty, ``text`` itself is returned.
    """
    model.eval()

    input_text = f"{sentiment} {text}"
    # Characters before this index belong to the sentiment word and its space
    prefix_length = len(sentiment) + 1

    encoding = tokenizer(
        input_text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
        return_offsets_mapping=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    offsets = encoding['offset_mapping'].reshape(-1, 2)

    # A token is a candidate when it covers at least one character and
    # reaches past the sentiment prefix.
    candidates = (offsets[:, 1] > offsets[:, 0]) & (offsets[:, 1] > prefix_length)
    if not bool(candidates.any()):
        # Nothing to select from (e.g. an empty tweet)
        return text

    with torch.no_grad():
        start_logits, end_logits = model(input_ids, attention_mask)

    # Score on CPU so the logits line up with the CPU-side offset mask
    start_scores = start_logits.detach().reshape(-1).float().cpu()
    end_scores = end_logits.detach().reshape(-1).float().cpu()

    start_scores = start_scores.masked_fill(~candidates, float('-inf'))
    start_pred = int(torch.argmax(start_scores))

    # The end may not precede the start; start_pred itself stays allowed, so
    # at least one position always survives the mask.
    end_allowed = candidates.clone()
    end_allowed[:start_pred] = False
    end_scores = end_scores.masked_fill(~end_allowed, float('-inf'))
    end_pred = int(torch.argmax(end_scores))

    # Clamp in case the first tweet token's offset includes the separator space
    start_char = max(int(offsets[start_pred, 0]), prefix_length)
    end_char = int(offsets[end_pred, 1])

    pred_text = input_text[start_char:end_char].strip()
    return pred_text if pred_text else text

In [None]:
# Training function
def train_model(model, train_loader, val_loader, device, epochs=3, lr=2e-5):
    """Fine-tune a span model and return it with its best-epoch weights.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``. The weights from the epoch with
    the lowest mean validation loss are snapshotted and restored into
    ``model`` before returning, so later epochs that overfit are discarded.

    Args:
        model: module mapping ``(input_ids, attention_mask)`` to
            ``(start_logits, end_logits)``.
        train_loader: iterable of batches (dicts) with ``input_ids``,
            ``attention_mask``, ``start_position`` and ``end_position``.
        val_loader: iterable of batches with the same keys.
        device: device the model and batches are moved to.
        epochs: number of passes over ``train_loader``.
        lr: AdamW learning rate.

    Returns:
        The same ``model`` object, in eval mode after any completed epoch. If
        no epoch produced a usable validation loss (empty ``val_loader``, or
        a loss that never compares below infinity), the last-epoch weights
        are kept.
    """
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fct = nn.CrossEntropyLoss()

    def span_loss(start_logits, end_logits, start_positions, end_positions):
        # Mean of the start and end cross-entropy terms
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        return (start_loss + end_loss) / 2

    def batch_loss(batch):
        # Move one batch to the device and compute its span loss
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_position'].to(device)
        end_positions = batch['end_position'].to(device)
        start_logits, end_logits = model(input_ids, attention_mask)
        return span_loss(start_logits, end_logits, start_positions, end_positions)

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        train_steps = 0

        for batch in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1

        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0

        with torch.no_grad():
            for batch in val_loader:
                val_loss += batch_loss(batch).item()
                val_steps += 1

        avg_train_loss = train_loss / train_steps if train_steps > 0 else 0
        avg_val_loss = val_loss / val_steps if val_steps > 0 else 0

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Snapshot the best weights. With an empty val_loader the reported loss
        # is a placeholder 0, which must not be mistaken for a real best score.
        if val_steps > 0 and avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Copies are held on CPU so the snapshot does not take accelerator memory
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)

    return model

In [None]:
# Cross-validation training
print("Starting 5-fold cross-validation...")

# The fast tokenizer is needed because the datasets request offset mappings
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Shuffled 5-way split with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
fold_predictions = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_processed)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/5")
    print(f"{'='*50}")

    # Rows for this fold
    train_fold = train_processed.iloc[train_idx]
    val_fold = train_processed.iloc[val_idx]

    # Datasets and loaders; only the training loader is shuffled
    train_dataset = TweetDataset(train_fold, tokenizer)
    val_dataset = TweetDataset(val_fold, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # A fresh model is trained for every fold
    model = train_model(RoBERTaSpanExtractor(), train_loader, val_loader, device, epochs=3, lr=2e-5)

    # Score each held-out tweet by the Jaccard overlap of predicted vs. true span
    model.eval()
    fold_scores = []

    with torch.no_grad():
        for batch in val_loader:
            batch_rows = zip(
                batch['textID'], batch['text'], batch['sentiment'], batch['selected_text']
            )
            for text_id, text, sentiment, true_text in batch_rows:
                pred_text = predict_span(model, tokenizer, text, sentiment, device)
                score = jaccard_similarity(true_text, pred_text)
                fold_scores.append(score)

                # Kept for the per-example analysis further down
                fold_predictions.append({
                    'fold': fold + 1,
                    'textID': text_id,
                    'text': text,
                    'sentiment': sentiment,
                    'true_selected': true_text,
                    'pred_selected': pred_text,
                    'jaccard': score
                })

    fold_score = np.mean(fold_scores)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} Score: {fold_score:.4f}")

# Aggregate across folds
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print(f"\n{'='*50}")
print(f"CV Score: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")
print(f"{'='*50}")

In [None]:
# Persist the out-of-fold predictions so they can be inspected later
import os
os.makedirs('/home/code/experiments/002_roberta_span', exist_ok=True)

fold_preds_df = pd.DataFrame(fold_predictions)
fold_preds_df.to_csv('/home/code/experiments/002_roberta_span/fold_predictions.csv', index=False)

# Print a fixed random sample of five validation predictions
print("\nSample predictions from validation:")
sample_preds = fold_preds_df.sample(5, random_state=42)
for idx, row in sample_preds.iterrows():
    report_lines = (
        f"\nSentiment: {row['sentiment']}",
        f"Text: '{row['text']}'",
        f"True: '{row['true_selected']}'",
        f"Pred: '{row['pred_selected']}'",
        f"Score: {row['jaccard']:.3f}",
        "-" * 50,
    )
    for line in report_lines:
        print(line)

In [None]:
# Train final model on full training data
print("Training final model on full training data...")

# Every labelled row goes into a single shuffled loader
train_dataset_full = TweetDataset(train_processed, tokenizer)
train_loader_full = DataLoader(train_dataset_full, batch_size=16, shuffle=True)

# No held-out split remains, so the same loader is also passed as the
# validation loader; the reported "Val Loss" is therefore measured on training data.
final_model = train_model(
    RoBERTaSpanExtractor(),
    train_loader_full,
    train_loader_full,
    device,
    epochs=3,
)

# Persist the trained weights
torch.save(final_model.state_dict(), '/home/code/experiments/002_roberta_span/final_model.pt')
print("Final model saved!")

In [None]:
# Make predictions on test set
print("Making predictions on test set...")

test_predictions = []
total_rows = len(test_processed)

for idx, row in test_processed.iterrows():
    # One forward pass of the final model per tweet
    test_predictions.append({
        'textID': row['textID'],
        'selected_text': predict_span(final_model, tokenizer, row['text'], row['sentiment'], device)
    })

    # Progress line every 500 rows
    if idx % 500 == 0:
        print(f"Processed {idx}/{total_rows} samples")

# Assemble the submission frame (textID, selected_text)
test_preds_df = pd.DataFrame(test_predictions)

print("\nSample test predictions:")
print(test_preds_df.head())

# Write the submission file
test_preds_df.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Shape: {test_preds_df.shape}")

In [None]:
# Final results summary
# Reference score this run is compared against (single definition; it was
# previously repeated as a literal in two print statements).
BASELINE_SCORE = 0.5481

print(f"{'='*50}")
print(f"ROBERTA SPAN MODEL RESULTS")
print(f"{'='*50}")
print(f"CV Score: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")
print(f"Baseline score: {BASELINE_SCORE:.4f}")
# A signed format spec prints "+0.0123" or "-0.0123"; a hard-coded "+" in
# front of the number would render a regression as "+-0.0123".
print(f"Improvement: {cv_mean - BASELINE_SCORE:+.4f}")
print(f"Test predictions: {len(test_preds_df)} samples")
print(f"Submission saved: /home/submission/submission.csv")

# Mean Jaccard and row count per sentiment class, from the out-of-fold predictions
print(f"\nPerformance by sentiment (from validation):")
sentiment_perf = fold_preds_df.groupby('sentiment')['jaccard'].agg(['mean', 'count'])
print(sentiment_perf)