# SBERT Fine-tuning for Social Media Retrieval
Colab-optimized version

In [None]:
# 1. Setup Environment
!pip install -q sentence-transformers torch faiss-gpu
from google.colab import drive
drive.mount('/content/drive')

# Set project paths
import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
os.chdir(PROJECT_PATH)

# GPU monitoring
!nvidia-smi

In [None]:
# 2. Data Preparation
#
# Builds data/processed/combined.json when it does not exist yet, then loads
# its 'train' split into `train_data`.
import json
from pathlib import Path

# pandas was used below without ever being imported (NameError on first run).
import pandas as pd

from src.data_preparation import DataPreprocessor

# Initialize preprocessor
preprocessor = DataPreprocessor()
combined_path = Path("data/processed/combined.json")

# Process data if needed
if not combined_path.exists():
    print("Processing datasets...")
    try:
        # Load MSMARCO from HuggingFace. Imported here because it is only
        # needed when the combined file has to be (re)built.
        from datasets import load_dataset
        msmarco = load_dataset("microsoft/ms_marco", "v1.1")

        # Keep the first passage of each record. Records with an empty
        # passage list are skipped so the [0] lookup cannot raise IndexError.
        msmarco_df = pd.DataFrame([
            {'text': doc['passages']['passage_text'][0]}
            for doc in msmarco['train']
            if 'passages' in doc and doc['passages']['passage_text']
        ])

        # Process and combine datasets
        # NOTE(review): presumably these calls write `combined_path`; that is
        # not visible from this cell -- confirm in src/data_preparation.py.
        preprocessor.process_msmarco(msmarco_df)
        preprocessor.process_twitter(Path("data/raw/twitter.zip"))
    except Exception as e:
        # Report the failure in the cell output, then let it propagate so the
        # notebook does not continue with missing data.
        print(f"Error processing data: {e}")
        raise

# Load training data (explicit encoding: JSON text is UTF-8, do not depend on
# the platform default).
with open(combined_path, encoding="utf-8") as f:
    train_data = json.load(f)['train']

print(f"Loaded {len(train_data)} training examples")

In [None]:
# 3. Model Training (adapted from src/model_training.py)
#
# Fine-tunes BASE_MODEL on (text, text) pairs with
# MultipleNegativesRankingLoss and saves the result to models/sbert_model.
import torch
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader

# Configuration
BASE_MODEL = "all-mpnet-base-v2"
BATCH_SIZE = 64 if torch.cuda.is_available() else 16
EPOCHS = 3
MAX_TRAIN_EXAMPLES = 10_000  # Limit for demo

# Initialize model
model = SentenceTransformer(BASE_MODEL)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Training setup
# Each example pairs a text with itself as its own positive.
# NOTE(review): assumes every entry of `train_data` is a plain string --
# confirm against the schema of combined.json.
train_examples = [
    InputExample(texts=[text, text], label=1.0)
    for text in train_data[:MAX_TRAIN_EXAMPLES]
]

# Shuffle so each batch draws its in-batch negatives from across the whole
# set instead of from neighbouring records in file order, and so the batches
# differ between epochs.
train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=BATCH_SIZE,
)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Training configuration
warmup_steps = 100
optimizer_params = {'lr': 2e-5}

# Training loop with full configuration
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params=optimizer_params,
    output_path="models/sbert_model",
    checkpoint_path="models/checkpoints",
    checkpoint_save_steps=1000,
    # NOTE(review): no evaluator is passed to fit(); confirm whether
    # save_best_model has any effect without one.
    save_best_model=True,
    show_progress_bar=True,
    use_amp=True
)

print("\nTraining completed successfully!")

In [None]:
# 4. Fallback Model Setup
#
# Retrains a fallback model when GPU memory usage was high.
MEMORY_PRESSURE_THRESHOLD = 0.8

# Compare the peak allocation against the device's total memory. The previous
# check divided current by peak allocation, which says nothing about capacity
# and raised ZeroDivisionError when CUDA was absent or unused (0 / 0).
if torch.cuda.is_available():
    device_index = torch.cuda.current_device()
    total_memory = torch.cuda.get_device_properties(device_index).total_memory
    memory_pressure = torch.cuda.max_memory_allocated(device_index) / total_memory
else:
    # No GPU: there is no GPU memory pressure to react to.
    memory_pressure = 0.0

if memory_pressure > MEMORY_PRESSURE_THRESHOLD:
    # NOTE(review): confirm roberta-base is actually lighter than BASE_MODEL.
    print("Switching to RoBERTa-base for memory efficiency")
    model = SentenceTransformer("roberta-base")

    # The loss must wrap the *new* model. Reusing `train_loss` would keep
    # optimising the previously trained model while saving this one untrained.
    fallback_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, fallback_loss)],
        epochs=1,  # Shorter training for fallback
        output_path="models/fallback"
    )

In [None]:
# 5. Validation
#
# Runs an EmbeddingSimilarityEvaluator over up to 1000 test sentences, each
# paired with itself.
from sentence_transformers import evaluation

# Load test data (explicit encoding: JSON text is UTF-8).
with open("data/processed/combined.json", encoding="utf-8") as f:
    test_data = json.load(f)["test"]

# The test split may hold fewer than 1000 items; size the scores from the
# actual slice so sentences and scores always have matching lengths.
eval_sentences = test_data[:1000]

# Create evaluator
# NOTE(review): every gold score is the same constant, so any correlation
# metric over them is mathematically undefined -- treat the output as a smoke
# test only, not as a quality measure.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=eval_sentences,
    sentences2=eval_sentences,
    scores=[1.0] * len(eval_sentences)  # Perfect similarity for demo
)

evaluator(model)