In [1]:
!pip install --upgrade datasets fsspec huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.31.4-py3-none-any.whl.metadata (13 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.31.4-py3-none-any.whl (489 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.3/489.3 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, huggingface_hub, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Suc

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

print("Loading dataset and tokenizer...")

# Load IMDB dataset (or choose another text dataset)
dataset = load_dataset("imdb")

# Load tokenizer (BERT base uncased)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print("Dataset and tokenizer loaded successfully!")
print(f"Sample text: {dataset['train'][0]['text'][:200]}")  # show sample


Loading dataset and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Dataset and tokenizer loaded successfully!
Sample text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev


In [3]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

print("Tokenizing dataset...")

tokenized_dataset = dataset.map(tokenize, batched=True)

# Format dataset for PyTorch: we want input_ids, attention_mask, and label tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print("Tokenization done!")
print(f"Sample tokenized input_ids: {tokenized_dataset['train'][0]['input_ids'][:10]}")


Tokenizing dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenization done!
Sample tokenized input_ids: tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026])


In [4]:
from transformers import AutoModelForSequenceClassification
import torch

print("Loading model...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

print(f"Model loaded on device: {device}")


Loading model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on device: cuda


In [5]:
from torch.utils.data import DataLoader

# Using small subsets to speed things up; you can increase later
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(2000))
test_dataset = tokenized_dataset["test"].select(range(1000))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [8]:
from torch.amp import autocast, GradScaler  # ✅ updated import

scaler = GradScaler()  # ✅ no need to pass 'cuda', it auto-detects

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with autocast(device_type='cuda', dtype=torch.float16):  # ✅ updated syntax
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()


In [9]:
from tqdm.auto import tqdm

model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        total_loss += loss.item()
        loop.set_description(f"Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} Average Loss: {total_loss / len(train_loader):.4f}")


Epoch 1/3


  0%|          | 0/125 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Epoch 1 Average Loss: 0.0466
Epoch 2/3


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 2 Average Loss: 0.0138
Epoch 3/3


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 3 Average Loss: 0.0108


In [10]:
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"🎯 Test Accuracy: {accuracy:.4f}")


  0%|          | 0/63 [00:00<?, ?it/s]

🎯 Test Accuracy: 0.8570


In [11]:
import os
from transformers import BertTokenizer

save_path = "/content/text_embedder_model"
os.makedirs(save_path, exist_ok=True)

# 1. Save full PyTorch model weights
torch.save(model.state_dict(), os.path.join(save_path, "text_embedder_weights.pth"))

# 2. Save Hugging Face format (model + tokenizer)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("✅ Model and tokenizer saved to:", save_path)


✅ Model and tokenizer saved to: /content/text_embedder_model


In [12]:
import os

saved_files = os.listdir("/content/text_embedder_model")
print("Files saved:", saved_files)


Files saved: ['model.safetensors', 'vocab.txt', 'tokenizer_config.json', 'config.json', 'tokenizer.json', 'text_embedder_weights.pth', 'special_tokens_map.json']


In [13]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model from saved directory
tokenizer = AutoTokenizer.from_pretrained("/content/text_embedder_model")
model = AutoModel.from_pretrained("/content/text_embedder_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Example text to embed
texts = ["A beautiful sunset over the mountains.", "A futuristic city skyline at night."]

# Tokenize and encode
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

# Get embeddings from model's last hidden state
with torch.no_grad():
    outputs = model(**inputs)

# Usually, the embeddings can be the pooled output or mean of token embeddings
# For example, use pooled output if available (like in BERT):
if hasattr(outputs, "pooler_output"):
    embeddings = outputs.pooler_output
else:
    # Else take mean of token embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)

print("Embeddings shape:", embeddings.shape)


Embeddings shape: torch.Size([2, 768])


In [14]:
import shutil

# Zip the saved model folder
shutil.make_archive("/content/text_embedder_model", 'zip', "/content/text_embedder_model")


'/content/text_embedder_model.zip'

In [15]:
from google.colab import files

files.download("/content/text_embedder_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>