In [None]:
# Mount Google Drive at /content/drive (Colab-only API). Later cells write the
# training checkpoints and the saved model/tokenizer under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch
import evaluate

# SST2 Dataset

In [None]:
# Load the SST-2 configuration of the GLUE benchmark.
# NOTE(review): the QQP cell further down rebinds `dataset`, `preprocess`,
# `tokenize`, `tokenizer` and `tokenized_dataset`, so whichever of the two cells
# ran last decides what the trainer consumes.
dataset = load_dataset("glue", "sst2")

# Convert to text-to-text format
def preprocess(example):
    """Turn one SST-2 record into a text-to-text (prompt, answer) pair.

    Args:
        example: mapping with a "sentence" string and a "label" value.

    Returns:
        dict with "input_text" (the sentence behind the "sst2 sentence: " prefix)
        and "target_text" ("positive" when the label equals 1, otherwise
        "negative" -- any label other than 1 falls into the negative branch).
    """
    if example["label"] == 1:
        sentiment = "positive"
    else:
        sentiment = "negative"

    prompt = "sst2 sentence: " + example["sentence"]
    return {"input_text": prompt, "target_text": sentiment}

dataset = dataset.map(preprocess)

# Load tokenizer
model_name = "google/switch-base-8"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize a batch of prompts and their target strings.

    Args:
        example: batch mapping with "input_text" and "target_text" lists
            (this function is mapped with ``batched=True``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens)
        with an extra "labels" entry holding the target token ids
        (padded/truncated to 4 tokens).
    """
    model_inputs = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=128)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=example["target_text"], truncation=True, padding="max_length", max_length=4)
    # Stored under "labels" (the key the QQP cell uses as well) rather than
    # "label", which would overwrite the dataset's integer class column.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Drop the raw columns (including the integer "label") so only the tokenizer
# outputs and the "labels" token ids remain for the data collator to batch.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


Map:   0%|          | 0/872 [00:00<?, ? examples/s]



# QQP Dataset

In [None]:
from datasets import load_dataset
# Load the QQP configuration of GLUE. This rebinds `dataset` from the SST-2 cell
# above; the rest of this cell likewise rebinds `preprocess`, `tokenizer`,
# `tokenize` and `tokenized_dataset`.
dataset = load_dataset("glue", "qqp")

def preprocess(example):
    """Turn one QQP record into a text-to-text (prompt, answer) pair.

    Args:
        example: mapping with "question1", "question2" and "label" entries.

    Returns:
        dict with "input_text" (both questions rendered into a single prompt)
        and "target_text" ("duplicate" when the label equals 1, otherwise
        "not duplicate").
    """
    if example["label"] == 1:
        answer = "duplicate"
    else:
        answer = "not duplicate"

    first = example["question1"]
    second = example["question2"]
    prompt = "qqp question1: {} question2: {}".format(first, second)
    return {"input_text": prompt, "target_text": answer}

dataset = dataset.map(preprocess)


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")

def tokenize(example):
    """Tokenize a batch of prompts and their target strings.

    Args:
        example: batch mapping with "input_text" and "target_text" lists
            (this function is mapped with ``batched=True``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens)
        with an extra "labels" entry holding the target token ids
        (padded/truncated to 4 tokens).
    """
    model_inputs = tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=128)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=example["target_text"], padding="max_length", truncation=True, max_length=4)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize, batched=True)
# Expose only the model inputs (as torch tensors); the other columns stay in the
# dataset but are not returned when rows are read.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
# Load model
# `model_name` is assigned in the SST-2 cell ("google/switch-base-8"); the QQP
# cell passes the same checkpoint string directly to its tokenizer and does not
# define `model_name`, so the SST-2 cell must have run before this one.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Text label -> class id used when scoring decoded generations. The keys match
# the `target_text` strings produced by the QQP `preprocess`.
label_map = {"not duplicate": 0, "duplicate": 1}
# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Decode generated and reference token ids and report label accuracy.

    Args:
        eval_pred: pair ``(preds, labels)`` of token-id sequences, one row per
            evaluation example, in the same order.

    Returns:
        The dict produced by ``metric.compute`` (an "accuracy" entry), or
        ``{"accuracy": 0.0}`` when no reference decodes to a known label.

    Reads the module-level ``tokenizer``, ``metric`` and ``label_map``.
    """
    preds, labels = eval_pred
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id

    def _sanitize(sequences):
        # Ids that cannot be decoded (negative padding such as -100, or ids at or
        # above the tokenizer vocabulary) are replaced by the pad id instead of
        # dropping the whole row: dropping rows independently from preds and
        # labels would shift every later prediction onto the wrong reference.
        return [
            [int(tok) if 0 <= tok < vocab_size else pad_id for tok in seq]
            for seq in sequences
        ]

    decoded_preds = tokenizer.batch_decode(_sanitize(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_sanitize(labels), skip_special_tokens=True)

    print("Decoded Predictions:", decoded_preds[:10])
    print("Decoded Labels:", decoded_labels[:10])

    decoded_preds = [p.strip().lower() for p in decoded_preds]
    decoded_labels = [l.strip().lower() for l in decoded_labels]

    # -1 marks text that is not one of the known label strings.
    mapped_preds = [label_map.get(p, -1) for p in decoded_preds]
    mapped_labels = [label_map.get(l, -1) for l in decoded_labels]

    # Only examples whose reference is unreadable are skipped. An unparseable
    # prediction keeps its -1, which never equals a valid reference, so it is
    # scored as wrong rather than silently excluded from the denominator.
    scored = [(p, l) for p, l in zip(mapped_preds, mapped_labels) if l != -1]

    if not scored:
        return {"accuracy": 0.0}

    mapped_preds, mapped_labels = zip(*scored)

    return metric.compute(predictions=mapped_preds, references=mapped_labels)



In [None]:
# Accuracy metric


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     decoded_preds = [p.strip().lower() for p in decoded_preds]
#     decoded_labels = [l.strip().lower() for l in decoded_labels]
#     return metric.compute(predictions=decoded_preds, references=decoded_labels)
import os

# Directory under the mounted Drive path that receives the per-epoch checkpoints.
checkpoint_dir = "/content/drive/MyDrive/sst2-switch"
os.makedirs(checkpoint_dir, exist_ok=True)

# Training arguments: evaluate and checkpoint once per epoch, and score with
# generated sequences (predict_with_generate) so compute_metrics can decode them.
training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_dir,
    logging_dir="./logs",
    report_to="none",
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
)

# Trainer
# Upper bound on the number of training examples (subset for faster training).
max_train_samples = 30000
train_split = tokenized_dataset["train"]
# Clamp to the split size so a train split smaller than the cap does not make
# `select` raise on out-of-range indices.
train_subset = train_split.select(range(min(max_train_samples, len(train_split))))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=tokenized_dataset["validation"],
    # `processing_class` is the replacement for the deprecated `tokenizer` keyword.
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning. The training arguments set eval_strategy and save_strategy to
# "epoch", so evaluation and checkpointing happen once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0827,0.060608,0.93922
2,0.0598,0.067243,0.934633
3,0.0425,0.064803,0.932339
4,0.0405,0.076288,0.930046
5,0.0283,0.086305,0.928899


Decoded Predictions: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Labels: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Predictions: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Labels: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Predictions: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Labels: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Predictions: ['positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative']
Decoded Labels: ['pos

TrainOutput(global_step=18750, training_loss=0.31740577829996747, metrics={'train_runtime': 4955.5525, 'train_samples_per_second': 30.269, 'train_steps_per_second': 3.784, 'total_flos': 6.85052854272e+16, 'train_loss': 0.31740577829996747, 'epoch': 5.0})

Epoch,Training Loss,Validation Loss
0,11.4489,0.087608


{'eval_loss': 0.08760783821344376}

In [None]:
# Write the model and the tokenizer files to two separate Drive folders.
model_dir = "/content/drive/MyDrive/sst2-switch-model"
tokenizer_dir = "/content/drive/MyDrive/sst2-switch-tokenizer"

model.save_pretrained(model_dir)
# Kept as the cell's final expression so the tuple of written files is displayed.
tokenizer.save_pretrained(tokenizer_dir)

('/content/drive/MyDrive/sst2-switch-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/sst2-switch-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/sst2-switch-tokenizer/spiece.model',
 '/content/drive/MyDrive/sst2-switch-tokenizer/added_tokens.json',
 '/content/drive/MyDrive/sst2-switch-tokenizer/tokenizer.json')