In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

# Step 1: Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")  # use only 1% for simplicity

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [3]:
# Step 2: Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=50)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/367 [00:00<?, ? examples/s]

In [4]:
# Step 3: Define Dataset and DataLoader for Language Modeling
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_datasets, seq_len=5):
        self.seq_len = seq_len
        self.input_ids = tokenized_datasets['input_ids']

    def __len__(self):
        return sum(len(line) - self.seq_len for line in self.input_ids)

    def __getitem__(self, idx):
        line = self.input_ids[idx // (len(self.input_ids) - self.seq_len)]
        x = line[idx % (len(line) - self.seq_len): idx % (len(line) - self.seq_len) + self.seq_len]
        y = line[idx % (len(line) - self.seq_len) + self.seq_len]
        return x, y

dataset = TextDataset(tokenized_datasets)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [5]:
# Step 4: Define the Simple Transformer model
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size=32, num_heads=2, num_layers=2, max_seq_len=50):
        super(SimpleTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.position_encoding = nn.Parameter(torch.randn(max_seq_len, embed_size))  # Max sequence length
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_size, num_heads), num_layers
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)  # Get the current sequence length
        position_encoding = self.position_encoding[:seq_len, :].unsqueeze(0)  # Adjust position encoding dynamically
        x = self.embedding(x) + position_encoding  # Add position encoding to embeddings
        x = self.transformer(x.transpose(0, 1)).transpose(0, 1)  # Transpose for transformer
        x = self.fc(x[:, -1])  # Predict the next word (last position)
        return x

# Model, optimizer, and loss function
model = SimpleTransformer(vocab_size=tokenizer.vocab_size)
optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()



In [6]:
# Step 5: Train the Model
for epoch in range(5):  # Fewer epochs for demonstration
    total_loss = 0
    for x, y in data_loader:
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(data_loader):.4f}")

Epoch 1, Loss: 2.2311
Epoch 2, Loss: 1.3231
Epoch 3, Loss: 0.7935
Epoch 4, Loss: 0.4833
Epoch 5, Loss: 0.3471


In [7]:
def generate_text(model, tokenizer, start_text, max_len=10):
    model.eval()

    # Tokenize the input text
    tokens = tokenizer.encode(start_text, return_tensors='pt')

    generated = tokens.clone()

    for _ in range(max_len):
        with torch.no_grad():
            output = model(generated)
            next_token = torch.argmax(output, dim=-1)[-1].unsqueeze(0)  # Get the last predicted token
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)  # Append the new token

    return tokenizer.decode(generated.squeeze().tolist())

# Example usage
print("Generated Text:", generate_text(model, tokenizer, "the quick brown"))

Generated Text: [CLS] the quick brown [SEP] [PAD] [PAD]kyria and military unit ii and improvements
