<a href="https://colab.research.google.com/github/T-K-O-H/youtube_to_linkedin/blob/main/full_model_finetuning_wandb_cosine_amp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔧 Full Model Fine-Tuning with W&B, Cosine LR, and AMP
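This notebook fine-tunes a Sentence Transformer model (`all-mpnet-base-v2`) to improve query-to-transcript-chunk retrieval, training with `MultipleNegativesRankingLoss` and `MatryoshkaLoss`. Training is logged to Weights & Biases and uses a warmup-cosine learning-rate schedule with automatic mixed precision (AMP).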

In [None]:
# ✅ Install dependencies
!pip install sentence-transformers wandb huggingface-hub datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0-

In [None]:
# 📚 Imports
import json
import wandb
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader

In [None]:
# 📥 Load (query, chunk) training pairs from the JSONL file, one JSON value per line.
# Unusable lines are reported and skipped so one bad record does not abort the load.
train_samples = []
with open("/content/data.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            item = json.loads(line)
            query = item["query"]
            chunk = item["chunk"]
        except json.JSONDecodeError:
            print("Skipping invalid JSON line:", line)
            continue
        except (KeyError, TypeError):
            # Valid JSON, but not an object carrying both fields
            # (missing key -> KeyError; list/str/number -> TypeError).
            print("Skipping line without 'query'/'chunk' fields:", line)
            continue
        train_samples.append(InputExample(texts=[query, chunk]))

In [None]:
# 🧠 Assemble the encoder from two modules: a transformer backbone followed by
# a pooling layer sized to the backbone's word-embedding dimension.
model_name = "sentence-transformers/all-mpnet-base-v2"
word_embedding_model = models.Transformer(model_name, max_seq_length=512)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)  # library-default pooling settings
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# 🔄 DataLoader and loss function
from sentence_transformers.losses import MultipleNegativesRankingLoss, MatryoshkaLoss

# Two loaders over the same examples, one per training objective; both use
# identical options (batch of 8, shuffled, incomplete final batch dropped).
loader_options = {"batch_size": 8, "shuffle": True, "drop_last": True}
dataloader_1 = DataLoader(train_samples, **loader_options)
dataloader_2 = DataLoader(train_samples, **loader_options)

# Objective 1: MultipleNegativesRankingLoss applied directly to the model.
loss_1 = MultipleNegativesRankingLoss(model)

# Objective 2: a second MultipleNegativesRankingLoss wrapped by MatryoshkaLoss,
# configured with embedding dimensions 768, 512, 256 and 128.
nested_ranking_loss = MultipleNegativesRankingLoss(model)  # nested loss
loss_2 = MatryoshkaLoss(
    model=model,
    loss=nested_ranking_loss,
    matryoshka_dims=[768, 512, 256, 128],
)

In [None]:
# 🧪 Authenticate with W&B, then open the run used for tracking
wandb.login()
wandb.init(
    project="transcript-retrieval",
    name="full-model-finetuned",
    mode="online",
)



In [None]:
# --- Setup ---
import wandb
import json
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.losses import MultipleNegativesRankingLoss, MatryoshkaLoss
from torch.utils.data import DataLoader
from torch.optim import AdamW

# --- WandB Init ---
# Force a fresh login, then start the run with its hyperparameters attached
# as the run config.
wandb.login(relogin=True)

run_config = {
    "epochs": 5,
    "lr": 2e-5,
    "scheduler": "warmupcosine",
    "batch_size": 32,
    "loss": "Matryoshka + MultipleNegatives",
    "base_model": "all-mpnet-base-v2",
}
wandb.init(
    project="mpnet-finetune",
    name="matryoshka-mnr-combined",
    config=run_config,
)

# --- Simple WandB Logger ---
class WandbLogger:
    """Callable that forwards a training callback's values to Weights & Biases.

    Args:
        optimizer: Optional object exposing ``param_groups``; when supplied,
            the ``lr`` of its first parameter group is logged as
            ``learning_rate``.
        evaluator: Optional zero-argument callable; when supplied, it is
            called on every invocation and its result is logged as
            ``val_score``.
    """

    def __init__(self, optimizer=None, evaluator=None):
        self.optimizer = optimizer
        self.evaluator = evaluator

    def __call__(self, score, epoch, steps):
        """Send one record to ``wandb.log``.

        Args:
            score: Value logged under the ``loss`` key.
                NOTE(review): sentence-transformers documents the ``fit``
                callback as receiving the evaluator's score, so this is
                presumably an evaluation metric rather than a training
                loss -- confirm before relying on the ``loss`` label.
            epoch: Value logged under ``epoch``.
            steps: Value logged under ``steps``.
        """
        log_data = {
            "epoch": epoch,
            "loss": score,
            "steps": steps
        }

        # Explicit None checks: an object that was supplied but happens to be
        # falsy (e.g. defines __len__ or __bool__) must not be skipped.
        if self.optimizer is not None:
            log_data["learning_rate"] = self.optimizer.param_groups[0]['lr']

        if self.evaluator is not None:
            val_score = self.evaluator()
            log_data["val_score"] = val_score

        wandb.log(log_data)

# --- Load training data ---
# Each non-blank line is parsed as one JSON record. Blank lines are skipped
# because json.loads raises on whitespace-only input; the earlier loading cell
# in this notebook skips them too.
with open("/content/data.jsonl", "r", encoding="utf-8") as f:
    train_samples = [json.loads(line) for line in f if line.strip()]

# Wrap every record's query/chunk pair as a training example, then build one
# shuffled loader per training objective.
samples = [InputExample(texts=[s["query"], s["chunk"]]) for s in train_samples]
dataloader_1 = DataLoader(samples, shuffle=True, batch_size=32, drop_last=True)
dataloader_2 = DataLoader(samples, shuffle=True, batch_size=32, drop_last=True)

# --- Load model ---
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# --- Define losses ---
loss_1 = MultipleNegativesRankingLoss(model)
loss_2 = MatryoshkaLoss(
    model=model,
    loss=MultipleNegativesRankingLoss(model),
    matryoshka_dims=[768, 512, 256, 128]
)

# --- Train the model ---
# No optimizer is constructed here: `fit` takes only `optimizer_params` and
# builds the optimizer itself, so a hand-made AdamW would never be stepped and
# any learning rate read from it would stay frozen at its initial value.
# For that reason the logger is created without an optimizer.
# NOTE(review): sentence-transformers documents `callback` as being invoked
# after each evaluation; no `evaluator` is passed here, so the logger is
# presumably never called -- add an evaluator if per-evaluation logging is wanted.
model.fit(
    train_objectives=[(dataloader_1, loss_1), (dataloader_2, loss_2)],
    epochs=5,
    warmup_steps=100,
    optimizer_params={"lr": 2e-5},
    scheduler="warmupcosine",
    show_progress_bar=True,
    use_amp=True,
    output_path="finetuned_mpnet_matryoshka_mnr",
    callback=WandbLogger()
)




0,1
train/epoch,▁█
train/global_step,▁█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
train/epoch,0.3858
train/global_step,1000.0
train/grad_norm,0.6156
train/learning_rate,2e-05
train/loss,0.0375


Computing widget examples:   0%|          | 0/2 [00:00<?, ?example/s]

Step,Training Loss
500,0.0878
1000,0.0375
1500,0.029
2000,0.0238
2500,0.0143
3000,0.0086
3500,0.0063
4000,0.0068
4500,0.005
5000,0.0049


In [None]:
from getpass import getpass
from huggingface_hub import login

# Ask for the token through getpass, then authenticate this session with the
# Hugging Face Hub.
token_prompt = "Enter your Hugging Face token: "
hf_token = getpass(token_prompt)
login(token=hf_token)

Enter your Hugging Face token: ··········


In [None]:
from huggingface_hub import HfApi, HfFolder
from sentence_transformers import SentenceTransformer

# Target repository on the Hugging Face Hub (change user/model_id)
hub_repo_id = "Shipmaster1/finetuned_mpnet_matryoshka_mnr"

# Reload the fine-tuned model from the directory written by training
model = SentenceTransformer("finetuned_mpnet_matryoshka_mnr")

# Upload; the call's return value is the last expression, so the cell displays it
model.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/Shipmaster1/finetuned_mpnet_matryoshka_mnr/commit/f50d67da79d8d90ba243442d0989144f79ea4408'