In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/yelp-reviews-only-100k/Yelp reviews only l lac.csv


In [2]:
pip install transformers datasets sentencepiece scikit-learn


Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platfo

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForMaskedLM
from datasets import Dataset
from tqdm import tqdm
import torch.nn.functional as F
from torch.optim import AdamW
import random

# 1. Load and prepare dataset
df = pd.read_csv("/kaggle/input/yelp-reviews-only-100k/Yelp reviews only l lac.csv", nrows=10).dropna(subset=['revew']).reset_index(drop=True)
dataset = Dataset.from_pandas(df[['revew']].rename(columns={'revew': 'text'}))

# 2. Tokenizer and Model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")

# 3. Collate function for joint MLM + SimCSE
def collate_fn(batch, mlm_probability=0.15):
    texts = [item["text"] for item in batch]
    # Tokenize for MLM
    encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=64)

    # Create inputs for MLM
    inputs_mlm = encoding.input_ids.clone()
    labels_mlm = inputs_mlm.clone()

    # Randomly mask tokens
    probability_matrix = torch.full(labels_mlm.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels_mlm.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    inputs_mlm[masked_indices] = tokenizer.mask_token_id
    labels_mlm[~masked_indices] = -100  # only compute loss on masked tokens

    return {
        "input_ids": encoding.input_ids,
        "attention_mask": encoding.attention_mask,
        "inputs_mlm": inputs_mlm,
        "labels_mlm": labels_mlm,
    }

# 4. Contrastive Loss (SimCSE-style)
def simcse_loss(z1, z2, temperature=0.05):
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    sim = torch.matmul(z1, z2.T) / temperature
    labels = torch.arange(z1.size(0)).to(z1.device)
    return F.cross_entropy(sim, labels)

# 5. Prepare DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# 6. Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# 7. Training Loop
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        inputs_mlm = batch["inputs_mlm"].to(device)
        labels_mlm = batch["labels_mlm"].to(device)

        # ===== MLM Loss =====
        outputs = model(input_ids=inputs_mlm, attention_mask=attention_mask, labels=labels_mlm)
        mlm_loss = outputs.loss

        # ===== Contrastive Loss (SimCSE) =====
        with torch.no_grad():
            # Forward twice for SimCSE (no dropout needed for unsupervised)
            output1 = model.roberta(input_ids=input_ids, attention_mask=attention_mask)
            output2 = model.roberta(input_ids=input_ids, attention_mask=attention_mask)

            # Mean Pooling
            z1 = (output1.last_hidden_state * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)
            z2 = (output2.last_hidden_state * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(1, keepdim=True)
                        
            contrast_loss = simcse_loss(z1, z2)

        # ===== Total Joint Loss =====
        loss = mlm_loss + contrast_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1} | Avg Loss: {total_loss / len(dataloader):.4f}")

# 8. Save the model
model.save_pretrained("./roberta-joint-mlm-contrastive")
tokenizer.save_pretrained("./roberta-joint-mlm-contrastive")

# 9. Zip model for download
import shutil
shutil.make_archive("/kaggle/working/roberta_model", 'zip', "./roberta-joint-mlm-contrastive")

# 10. Display download link
from IPython.display import FileLink
FileLink('/kaggle/working/roberta_model.zip')


2025-06-27 09:40:36.943079: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751017237.173465      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751017237.238758      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

100%|██████████| 1/1 [00:09<00:00,  9.12s/it]


Epoch 1 | Avg Loss: 5.0231


100%|██████████| 1/1 [00:07<00:00,  7.03s/it]


Epoch 2 | Avg Loss: 4.5157


In [4]:
import os
import shutil
from IPython.display import FileLink

# 1. Save model and tokenizer
model_dir = "/kaggle/working/roberta-joint-mlm-contrastive"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# 2. Ensure ZIP destination is clean
zip_path = "/kaggle/working/roberta_model.zip"
if os.path.exists(zip_path):
    os.remove(zip_path)

# 3. Zip the directory
shutil.make_archive("/kaggle/working/roberta_model", 'zip', model_dir)

# 4. Confirm it was created
if os.path.exists(zip_path):
    print("✅ Model zipped successfully!")
    display(FileLink(zip_path))
else:
    print("❌ Failed to create ZIP file.")


✅ Model zipped successfully!
