<a href="https://colab.research.google.com/github/Rongxuan-Zhou/CS6120_project/blob/main/notebooks/2_sbert_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# 1. Setup Environment
!pip install -q sentence-transformers torch faiss-gpu
!pip install -q datasets pandas
from google.colab import drive
drive.mount('/content/drive')

# Set project paths
import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
os.chdir(PROJECT_PATH)

# GPU monitoring
!nvidia-smi

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec

In [8]:
# 2. Data Preparation
import pandas as pd
from src.data_preparation import DataPreprocessor
from pathlib import Path
import json
from sklearn.model_selection import train_test_split

# Initialize preprocessor
preprocessor = DataPreprocessor()
data_dir = Path("data/processed")
data_dir.mkdir(parents=True, exist_ok=True)
combined_path = data_dir / "combined.json"

# Process data if needed
if not combined_path.exists():
    print("Processing datasets...")
    try:
        # Load MSMARCO from HuggingFace
        from datasets import load_dataset
        print("Loading MSMARCO dataset...")
        msmarco = load_dataset("microsoft/ms_marco", "v1.1")

        # Extract text passages
        msmarco_texts = []
        for doc in msmarco['train']:
            if 'passages' in doc and 'passage_text' in doc['passages'] and len(doc['passages']['passage_text']) > 0:
                msmarco_texts.append(doc['passages']['passage_text'][0])

        print(f"Extracted {len(msmarco_texts)} MSMARCO passages")

        # For demo purposes, use a smaller subset
        msmarco_texts = msmarco_texts[:10000]

        # Try to load Twitter data if available, otherwise create dummy data
        try:
            twitter_path = Path("data/twitter_processed.csv")
            if twitter_path.exists():
                twitter_df = pd.read_csv(twitter_path)
                twitter_texts = twitter_df['text'].tolist()
                print(f"Loaded {len(twitter_texts)} Twitter texts")
            else:
                # Create dummy Twitter data
                twitter_texts = [f"This is a sample tweet {i}" for i in range(1000)]
                print("Created dummy Twitter data")
        except Exception as e:
            print(f"Error loading Twitter data: {e}")
            twitter_texts = [f"This is a sample tweet {i}" for i in range(1000)]

        # Combine datasets
        all_texts = msmarco_texts + twitter_texts

        # Split into train/val/test
        train_texts, temp_texts = train_test_split(all_texts, test_size=0.2, random_state=42)
        val_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)

        # Create combined dataset
        combined_data = {
            "train": train_texts,
            "val": val_texts,
            "test": test_texts
        }

        # Save to JSON
        with open(combined_path, 'w') as f:
            json.dump(combined_data, f)

        print(f"Combined dataset saved to {combined_path}")

    except Exception as e:
        print(f"Error processing data: {e}")
        # Create a minimal dataset for testing
        dummy_data = {
            "train": ["Sample training text " + str(i) for i in range(1000)],
            "val": ["Sample validation text " + str(i) for i in range(100)],
            "test": ["Sample test text " + str(i) for i in range(100)]
        }
        with open(combined_path, 'w') as f:
            json.dump(dummy_data, f)
        print("Created minimal dummy dataset for testing")

# Load training data
with open(combined_path) as f:
    data = json.load(f)
    train_data = data['train']

print(f"Loaded {len(train_data)} training examples")

Processing datasets...
Loading MSMARCO dataset...
Extracted 82326 MSMARCO passages
Created dummy Twitter data
Combined dataset saved to data/processed/combined.json
Loaded 8800 training examples


In [9]:
# 3. Model Training (adapted from src/model_training.py)
import torch
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

# Configuration
BASE_MODEL = "all-mpnet-base-v2"
BATCH_SIZE = 64 if torch.cuda.is_available() else 16
EPOCHS = 3

# Initialize model
model = SentenceTransformer(BASE_MODEL)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Training setup
from sentence_transformers import InputExample
train_examples = [
    InputExample(texts=[text, text], label=1.0)
    for text in train_data[:10000]  # Limit for demo
]
train_dataloader = DataLoader(train_examples, batch_size=BATCH_SIZE)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Training configuration
warmup_steps = 100
optimizer_params = {'lr': 2e-5}

# Training loop with full configuration
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params=optimizer_params,
    output_path="models/sbert_model",
    checkpoint_path="models/checkpoints",
    checkpoint_save_steps=1000,
    save_best_model=True,
    show_progress_bar=True,
    use_amp=True
)

print("\nTraining completed successfully!")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mzhou-rongx[0m ([33mzhou-rongx-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss



Training completed successfully!


In [10]:
# 4. Fallback Model Setup
if torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated() > 0.8:
    print("Switching to RoBERTa-base for memory efficiency")
    model = SentenceTransformer("roberta-base")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,  # Shorter training for fallback
        output_path="models/fallback"
    )

In [11]:
# 5. Validation
from sentence_transformers import evaluation

# Load test data
with open("data/processed/combined.json") as f:
    test_data = json.load(f)["test"]

# Create evaluator
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=test_data[:1000],
    sentences2=test_data[:1000],
    scores=[1.0]*1000  # Perfect similarity for demo
)

evaluator(model)

  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)


{'pearson_cosine': np.float64(nan), 'spearman_cosine': nan}