# Tweet Sentiment Extraction - RoBERTa Span Model

This notebook implements a transformer-based approach: RoBERTa with a span-extraction head.

## Strategy
1. Use RoBERTa-base with start/end token classification
2. Frame as span prediction: predict start and end positions
3. Input: a sentiment tag followed by the tweet text, formatted as `[SENTIMENT] text`
4. Target: start/end indices of selected_text
5. Expected score: 0.70+ based on winning solutions

In [1]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaModel, RobertaConfig
warnings.filterwarnings('ignore')

# Report which accelerator (if any) this kernel can see
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Seed every RNG the notebook touches (Python, NumPy, PyTorch CPU, then CUDA)
# with the same value so that reruns are repeatable
import random
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

2026-01-15 11:50:30.313353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-15 11:50:31.155555: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-15 11:50:31.278904: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
Memory: 85.1 GB


In [2]:
# Read the train and test CSVs (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick sanity check on the row/column counts
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Clean text function
def clean_text(text):
    """Normalise one raw text cell.

    Args:
        text: Cell value; may be a string or a missing value (None / NaN).

    Returns:
        str: "" for missing values; otherwise the value converted to ``str``,
        stripped of surrounding whitespace and, if the result is wrapped in a
        pair of double quotes, with that one outer pair removed.
    """
    if pd.isna(text):
        return ""
    text = str(text).strip()
    # Remove surrounding quotes if present. The length check matters: a lone '"'
    # both starts and ends with a quote, but it is not a quoted pair and must
    # not be reduced to an empty string.
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return text

# Run the cleaner over every free-text column of both splits
# (the test split has no selected_text column to clean)
for frame, text_columns in ((train_df, ('text', 'selected_text')), (test_df, ('text',))):
    for column in text_columns:
        frame[column] = frame[column].apply(clean_text)

print("Sample cleaned data:")
print(train_df[['text', 'selected_text', 'sentiment']].head())

Train shape: (27481, 4)
Test shape: (3534, 3)
Sample cleaned data:
                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4  Sons of ****, why couldn`t they put them on th...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [None]:
# Define Jaccard score function
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the shared word set divided by the size of the union.
    Returns 0.0 when both strings contain no words.
    """
    words_a, words_b = (set(s.lower().split()) for s in (str1, str2))
    shared = len(words_a.intersection(words_b))
    union_size = len(words_a) + len(words_b) - shared
    if union_size > 0:
        return float(shared) / union_size
    return 0.0

# Create a function to find start and end character positions
def find_start_end(text, selected_text):
    """Find start and end character positions of selected_text in text.

    Args:
        text: The full tweet text.
        selected_text: The labelled span; may be missing (None / NaN) or empty.

    Returns:
        tuple[int, int]: ``(start, end)`` such that ``text[start:end]`` is the
        span, with ``end`` exclusive and never larger than ``len(text)``.
        Resolution order:
          1. missing/empty selection -> the whole text ``(0, len(text))``;
          2. exact substring match -> its position;
          3. otherwise anchor on the first occurrence of the selection's first
             word and extend by ``len(selected_text)``, clamped to the text;
          4. nothing found -> the whole text.
    """
    if pd.isna(selected_text) or selected_text == "":
        return 0, len(text)

    # str.find returns -1 instead of raising, so no blanket except is needed
    start = text.find(selected_text)
    if start != -1:
        return start, start + len(selected_text)

    # Fallback: approximate match anchored on the first word of the selection
    words = selected_text.split()
    if words:
        start = text.find(words[0])
        if start != -1:
            # Clamp so the span never runs past the end of the text
            return start, min(len(text), start + len(selected_text))
    return 0, len(text)

# Smoke-test the span locator on a hand-made example
print("Testing start/end extraction:")
test_text, test_selected = "I love this product, it's amazing!", "love this product"
start, end = find_start_end(test_text, test_selected)

# Slicing the text with the returned positions should reproduce the selection
print(f"Text: '{test_text}'")
print(f"Selected: '{test_selected}'")
print(f"Start: {start}, End: {end}")
print(f"Extracted: '{test_text[start:end]}'")

In [None]:
# Prepare training data with start/end positions
print("Preparing training data with start/end positions...")

train_data = []
for text_id, text, selected_text, sentiment in zip(
    train_df['textID'], train_df['text'], train_df['selected_text'], train_df['sentiment']
):
    # Character span of the selection inside the raw tweet
    span_start, span_end = find_start_end(text, selected_text)

    # The model input is "[SENTIMENT] text"; the prefix is the sentiment word
    # plus two brackets and one space, so spans shift right by len(sentiment) + 3
    prefix_len = len(sentiment) + 3

    train_data.append({
        'textID': text_id,
        'text': text,
        'input_text': f"[{sentiment.upper()}] {text}",
        'selected_text': selected_text,
        'sentiment': sentiment,
        'start': span_start + prefix_len,
        'end': span_end + prefix_len
    })

train_processed = pd.DataFrame(train_data)
print(f"Processed {len(train_processed)} training samples")
print("\nSample processed data:")
print(train_processed[['input_text', 'selected_text', 'start', 'end']].head())

In [None]:
# Create Dataset class
class TweetDataset(Dataset):
    """Torch dataset that tokenizes one tweet per item and labels its span.

    Each row of ``df`` must provide ``input_text`` (the string fed to the
    tokenizer), ``start`` / ``end`` (character span inside ``input_text``,
    end exclusive), plus ``text``, ``selected_text`` and ``sentiment``.
    A ``textID`` column is passed through when present.

    The tokenizer must support ``return_offsets_mapping=True``.
    """

    def __init__(self, df, tokenizer, max_length=128):
        """Store the frame, the tokenizer and the fixed padded sequence length."""
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and map its character span to token indices.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_length``), integer ``start_position`` /
            ``end_position`` token indices, and the pass-through strings
            ``textID``, ``text``, ``selected_text`` and ``sentiment``.
        """
        row = self.df.iloc[idx]

        # Tokenize
        encoding = self.tokenizer(
            row['input_text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
            return_offsets_mapping=True
        )

        # Drop only the batch axis; a bare squeeze() would also drop a
        # length-1 sequence axis
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        offset_mapping = encoding['offset_mapping'].squeeze(0).tolist()

        char_start = int(row['start'])
        char_end = int(row['end'])

        # Find token positions that correspond to character positions
        start_token = None
        end_token = None
        last_content_token = None
        for i, (token_start, token_end) in enumerate(offset_mapping):
            if token_end <= token_start:
                # Empty offsets carry no text, so they can never hold the span
                continue
            last_content_token = i
            if start_token is None and token_start <= char_start < token_end:
                start_token = i
            if token_start < char_end <= token_end:
                end_token = i
                break

        # Start not located (e.g. empty span): fall back to the first token
        if start_token is None:
            start_token = 0

        # End not located (e.g. the span was truncated away): fall back to the
        # last token that covers real text, so the label decodes to a non-empty
        # span. If no token carries text, use the last attended token.
        if end_token is None:
            if last_content_token is not None:
                end_token = last_content_token
            else:
                end_token = max(int(attention_mask.sum().item()) - 1, 0)

        # A span label must never end before it starts
        end_token = max(end_token, start_token)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_position': start_token,
            'end_position': end_token,
            # Passed through so evaluation code can attribute predictions to rows
            'textID': str(row['textID']) if 'textID' in row.index else '',
            'text': row['text'],
            'selected_text': row['selected_text'],
            'sentiment': row['sentiment']
        }

# Test the dataset
print("Testing dataset...")
# The fast (Rust-backed) tokenizer is required here: TweetDataset asks for
# return_offsets_mapping=True, which the pure-Python RobertaTokenizer rejects
# with NotImplementedError.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
test_dataset = TweetDataset(train_processed.head(5), tokenizer)
test_item = test_dataset[0]
print(f"Input text: {test_item['text']}")
print(f"Selected text: {test_item['selected_text']}")
print(f"Start token: {test_item['start_position']}")
print(f"End token: {test_item['end_position']}")
print(f"Input shape: {test_item['input_ids'].shape}")

In [None]:
# Define the RoBERTa span model
class RoBERTaSpanExtractor(nn.Module):
    """RoBERTa encoder topped with two per-token linear scorers (start / end)."""

    def __init__(self, model_name='roberta-base'):
        """Load the pretrained encoder and create the two span heads.

        Args:
            model_name: Name or path passed to ``RobertaModel.from_pretrained``.
        """
        super().__init__()

        # Pretrained encoder; its hidden width sizes the heads below
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.hidden_size = self.roberta.config.hidden_size

        # One scalar score per token for "span starts here" / "span ends here"
        self.start_head = nn.Linear(self.hidden_size, 1)
        self.end_head = nn.Linear(self.hidden_size, 1)

        # Regularisation applied to the encoder output before scoring
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        """Score every token as a span start and as a span end.

        Returns:
            tuple: ``(start_logits, end_logits)``, each the corresponding
            head's output with its trailing size-1 axis squeezed away.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        token_states = self.dropout(encoded.last_hidden_state)

        # Both heads read the same dropped-out states
        return tuple(
            head(token_states).squeeze(-1)
            for head in (self.start_head, self.end_head)
        )

# Test the model
print("Testing model...")
model = RoBERTaSpanExtractor()
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Give the single dataset item a leading batch axis so it looks like a batch of one
test_input_ids, test_attention_mask = (
    test_item[key].unsqueeze(0) for key in ('input_ids', 'attention_mask')
)

# Inference only: no gradients needed for a shape check
with torch.no_grad():
    start_logits, end_logits = model(test_input_ids, test_attention_mask)
    print(f"Start logits shape: {start_logits.shape}")
    print(f"End logits shape: {end_logits.shape}")

In [None]:
# Training function
def train_model(model, train_loader, val_loader, device, epochs=3, lr=2e-5,
                save_path='/home/code/experiments/002_roberta_span/best_model.pt'):
    """Fine-tune a span model and checkpoint the epoch with the lowest validation loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(start_logits, end_logits)``.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``start_position``, ``end_position`` tensors).
        val_loader: Same batch format, used only for the validation loss.
        device: Torch device the model and batches are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        save_path: Where the best ``state_dict`` is written; its parent
            directory is created if missing.

    Returns:
        The same ``model`` object in its state after the LAST epoch. The best
        epoch's weights are only on disk at ``save_path``; load them explicitly
        if you want them.
    """
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fct = nn.CrossEntropyLoss()

    # Loss function: mean of the start-token and end-token classification losses
    def span_loss(start_logits, end_logits, start_positions, end_positions):
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        return (start_loss + end_loss) / 2

    # Shared by the training and validation loops so both compute the loss
    # identically
    def batch_loss(batch):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_position'].to(device)
        end_positions = batch['end_position'].to(device)

        start_logits, end_logits = model(input_ids, attention_mask)
        # The second argument must be the end LOGITS; passing the end
        # positions there makes the end loss meaningless or raises
        return span_loss(start_logits, end_logits, start_positions, end_positions)

    # torch.save does not create missing directories
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                val_loss += batch_loss(batch).item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        avg_train_loss = train_loss / max(len(train_loader), 1)
        avg_val_loss = val_loss / max(len(val_loader), 1)

        print(f"Epoch {epoch+1}/{epochs} - Train loss: {avg_train_loss:.4f}, Val loss: {avg_val_loss:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)

    return model

# Prediction function
def predict_span(model, tokenizer, text, sentiment, device, max_length=128):
    """Predict the sentiment-bearing substring of ``text``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(start_logits, end_logits)`` of shape (1, seq_len);
            it is expected to already live on ``device``.
        tokenizer: Callable tokenizer that supports
            ``return_offsets_mapping=True``.
        text: Raw tweet text.
        sentiment: Sentiment label; it is upper-cased and prepended as
            ``"[SENTIMENT] "``.
        device: Torch device the encoded inputs are moved to.
        max_length: Padded/truncated sequence length.

    Returns:
        str: ``text[char_start:char_end]`` for the predicted token span, or
        the full ``text`` when that slice is empty or whitespace-only.
    """
    model.eval()

    # Prepare input
    input_text = f"[{sentiment.upper()}] {text}"

    encoding = tokenizer(
        input_text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
        return_offsets_mapping=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    # Drop only the batch axis; plain Python ints are enough for slicing below
    offset_mapping = encoding['offset_mapping'].squeeze(0).tolist()

    with torch.no_grad():
        start_logits, end_logits = model(input_ids, attention_mask)

    # Padding positions have no text behind them; exclude them from the argmax
    # so a stray high logit there cannot collapse the prediction
    is_padding = attention_mask == 0
    start_pred = torch.argmax(start_logits.masked_fill(is_padding, float('-inf')), dim=1).item()
    end_pred = torch.argmax(end_logits.masked_fill(is_padding, float('-inf')), dim=1).item()

    # Ensure end >= start
    if end_pred < start_pred:
        end_pred = start_pred

    # Offsets are relative to input_text; remove the "[SENTIMENT] " prefix
    # (sentiment word + two brackets + one space) to index into text
    offset = len(sentiment) + 3
    char_start = max(0, offset_mapping[start_pred][0] - offset)
    char_end = max(0, offset_mapping[end_pred][1] - offset)

    # Extract the predicted span
    predicted_span = text[char_start:char_end]

    # Handle edge cases: an empty prediction falls back to the whole tweet
    if predicted_span.strip() == "":
        predicted_span = text

    return predicted_span

# Marker line: reaching it means the definitions earlier in this cell ran without error
print("Functions defined successfully!")

In [None]:
# Cross-validation training
print("Starting 5-fold cross-validation...")

# The fast tokenizer is required: both TweetDataset and predict_span request
# return_offsets_mapping=True, which the pure-Python RobertaTokenizer rejects.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
fold_predictions = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_processed)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/5")
    print(f"{'='*50}")

    # Split data
    train_fold = train_processed.iloc[train_idx]
    val_fold = train_processed.iloc[val_idx]

    # Create datasets
    train_dataset = TweetDataset(train_fold, tokenizer)
    val_dataset = TweetDataset(val_fold, tokenizer)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Initialize model (fresh weights for every fold)
    model = RoBERTaSpanExtractor()

    # Train model
    print(f"Training fold {fold + 1}...")
    model = train_model(model, train_loader, val_loader, device, epochs=3)

    # Load best model; train_model returns the last-epoch weights, the best
    # epoch is only on disk. map_location keeps this working on CPU-only runs.
    model.load_state_dict(
        torch.load('/home/code/experiments/002_roberta_span/best_model.pt', map_location=device)
    )

    # Evaluate on validation set
    print(f"Evaluating fold {fold + 1}...")
    fold_scores = []
    fold_preds = []

    model.eval()
    # Iterate the validation frame directly: predict_span tokenizes on its own,
    # so going through val_loader would tokenize every row twice, and the frame
    # is the reliable source of each row's textID.
    for text_id, text, sentiment, true_selected in zip(
        val_fold['textID'], val_fold['text'], val_fold['sentiment'], val_fold['selected_text']
    ):
        # Predict
        pred_selected = predict_span(model, tokenizer, text, sentiment, device)

        # Calculate Jaccard score
        score = jaccard(true_selected, pred_selected)
        fold_scores.append(score)

        # Store predictions
        fold_preds.append({
            'textID': text_id,
            'text': text,
            'sentiment': sentiment,
            'true_selected': true_selected,
            'pred_selected': pred_selected,
            'jaccard': score
        })

    fold_mean_score = np.mean(fold_scores)
    cv_scores.append(fold_mean_score)
    fold_predictions.extend(fold_preds)

    print(f"Fold {fold + 1} Jaccard score: {fold_mean_score:.4f}")

# Overall CV score
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

print(f"\n{'='*50}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*50}")
print(f"CV Score: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

In [None]:
# Persist the out-of-fold predictions so they can be inspected later
import json

fold_preds_df = pd.DataFrame(fold_predictions)
fold_preds_df.to_csv('/home/code/experiments/002_roberta_span/fold_predictions.csv', index=False)

# Print a fixed random handful of predictions for a quick eyeball check
print("\nSample predictions from validation:")
sample_preds = fold_preds_df.sample(5, random_state=42)
for row in sample_preds.to_dict('records'):
    print(f"\nSentiment: {row['sentiment']}")
    print(f"Text: '{row['text']}'")
    print(f"True: '{row['true_selected']}'")
    print(f"Pred: '{row['pred_selected']}'")
    print(f"Score: {row['jaccard']:.3f}")
    print("-" * 50)

In [None]:
# Train final model on full training data
print("Training final model on full training data...")

# Every labelled tweet goes into one shuffled loader
train_dataset_full = TweetDataset(train_processed, tokenizer)
train_loader_full = DataLoader(dataset=train_dataset_full, batch_size=16, shuffle=True)

# No rows are held out here, so the same loader is also passed as the
# validation loader
final_model = train_model(
    RoBERTaSpanExtractor(),
    train_loader_full,
    train_loader_full,
    device,
    epochs=3,
)

# Persist the weights the model ends training with
torch.save(final_model.state_dict(), '/home/code/experiments/002_roberta_span/final_model.pt')
print("Final model saved!")

In [None]:
# Make predictions on test set
print("Making predictions on test set...")

test_predictions = []

for idx, row in test_df.iterrows():
    text = row['text']
    sentiment = row['sentiment']
    textID = row['textID']

    # Predict using the final model
    pred_selected = predict_span(final_model, tokenizer, text, sentiment, device)

    test_predictions.append({
        'textID': textID,
        'selected_text': pred_selected
    })

    if idx % 500 == 0:
        print(f"Processed {idx}/{len(test_df)} samples")

# Create submission
test_preds_df = pd.DataFrame(test_predictions)

# Do NOT wrap the predictions in literal '"' characters. to_csv already quotes
# any field that needs it; adding quotes by hand makes it write """...""", so a
# CSV reader gets the text back WITH the quote characters glued to the first
# and last word, which lowers a word-level Jaccard score.
test_preds_df['selected_text'] = test_preds_df['selected_text'].astype(str)

print("\nSample test predictions:")
print(test_preds_df.head())

# Save submission
test_preds_df.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved to /home/submission/submission.csv")
print(f"Shape: {test_preds_df.shape}")

In [None]:
# Final results summary
# Score of the earlier baseline this run is compared against
baseline_score = 0.5481

print(f"{'='*50}")
print(f"ROBERTA SPAN MODEL RESULTS")
print(f"{'='*50}")
print(f"CV Score: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")
print(f"Baseline score: {baseline_score:.4f}")
# The '+' format flag prints the real sign, so a regression shows as
# "-0.0123" rather than "+-0.0123"
print(f"Improvement: {cv_mean - baseline_score:+.4f}")
print(f"Test predictions: {len(test_preds_df)} samples")
print(f"Submission saved: /home/submission/submission.csv")

# Performance by sentiment
print(f"\nPerformance by sentiment (from validation):")
sentiment_perf = fold_preds_df.groupby('sentiment')['jaccard'].agg(['mean', 'count'])
print(sentiment_perf)