In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

In [None]:
# Toy (question, answer) pairs; this list is the whole fine-tuning corpus.
questions_answers = [
    ("What is the color of the sky?", "Blue"),
    ("What do bees make?", "Honey"),
    ("What is the largest planet?", "Jupiter"),
    ("Who wrote Hamlet?", "Shakespeare"),
    ("What liquid do cars need?", "Fuel"),
    ("What is frozen water called?", "Ice"),
    ("Which animal is known as man's best friend?", "Dog"),
    ("What do we breathe?", "Air"),
    ("What color is a ruby?", "Red"),
    ("What do bees produce?", "Honey"),
    ("What is the opposite of cold?", "Hot"),
    ("What do we call a baby cat?", "Kitten"),
    ("What do you use to write on a blackboard?", "Chalk"),
    ("What is the capital of France?", "Paris"),
    ("What fruit is known for its potassium?", "Banana"),
    ("What is the hardest natural substance?", "Diamond"),
    ("What season follows summer?", "Autumn"),
    ("What is the currency of the USA?", "Dollar"),
    ("What is the primary language in Spain?", "Spanish"),
    ("What is the color of grass?", "Green"),
]

In [None]:
class QADataset(Dataset):
    """Dataset that tokenizes one question/answer pair per item.

    Every item is the text ``"<question> <answer>"`` passed through
    ``tokenizer`` with truncation enabled and ``padding="max_length"``.

    Args:
        tokenizer: Callable tokenizer; it is invoked with ``return_tensors="pt"``.
        qa_list: Indexable collection of ``(question, answer)`` pairs.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, qa_list, max_length):
        """Store the tokenizer, the raw pairs and the target length."""
        self.max_length = max_length
        self.qa_list = qa_list
        self.tokenizer = tokenizer

    def __len__(self):
        """Return how many question/answer pairs are available."""
        return len(self.qa_list)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the pair at ``idx``."""
        prompt, reply = self.qa_list[idx]
        text = f"{prompt} {reply}"
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # A single string was encoded, so take the first row of each field.
        token_ids = encoded.input_ids[0]
        mask = encoded.attention_mask[0]
        return token_ids, mask

In [None]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# weights are always loaded from the same pretrained release.
MODEL_NAME = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# QADataset encodes with padding="max_length", which needs a pad token;
# the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Token length requested from the tokenizer for every example.
max_length = 32
dataset = QADataset(tokenizer=tokenizer, qa_list=questions_answers, max_length=max_length)
# shuffle=True draws the examples in a new random order on each pass.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [None]:
# Use the GPU when one is available and move the model there before the
# optimizer is built. Assigning the result (instead of leaving a bare
# `model.to(device)` as the last expression) stops the notebook from echoing
# the full module repr as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the defaults of the transformers
# optimizer so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Fine-tune the language model on the question/answer pairs.
num_epochs = 48
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        inputs, masks = batch
        inputs, masks = inputs.to(device), masks.to(device)

        # The pad token is the EOS token, so using `inputs` directly as labels
        # would score the model on every padded slot and dilute the loss.
        # -100 is the label value the language-modeling loss ignores.
        labels = inputs.masked_fill(masks == 0, -100)

        # Keep exactly one EOS target per padded row (the first padded slot)
        # so the model still learns to stop right after the answer.
        # NOTE(review): assumes the tokenizer pads on the right - confirm
        # tokenizer.padding_side if this is reused with another tokenizer.
        lengths = masks.sum(dim=1)
        padded_rows = torch.nonzero(lengths < inputs.size(1)).squeeze(1)
        first_pad = lengths[padded_rows]
        labels[padded_rows, first_pad] = inputs[padded_rows, first_pad]

        outputs = model(inputs, labels=labels, attention_mask=masks)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # One line per batch; with a single batch per epoch this is one line per epoch.
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 0.12018436193466187
Epoch: 1, Loss: 0.11830762773752213
Epoch: 2, Loss: 0.13489176332950592
Epoch: 3, Loss: 0.13723264634609222
Epoch: 4, Loss: 0.11466779559850693
Epoch: 5, Loss: 0.12068860232830048
Epoch: 6, Loss: 0.13151094317436218
Epoch: 7, Loss: 0.10957656055688858
Epoch: 8, Loss: 0.13340261578559875
Epoch: 9, Loss: 0.15300284326076508
Epoch: 10, Loss: 0.09996376931667328
Epoch: 11, Loss: 0.13112474977970123
Epoch: 12, Loss: 0.14126864075660706
Epoch: 13, Loss: 0.11560693383216858
Epoch: 14, Loss: 0.1457289606332779
Epoch: 15, Loss: 0.11134045571088791
Epoch: 16, Loss: 0.11706754565238953
Epoch: 17, Loss: 0.1305571347475052
Epoch: 18, Loss: 0.10925156623125076
Epoch: 19, Loss: 0.17124491930007935
Epoch: 20, Loss: 0.11426228284835815
Epoch: 21, Loss: 0.11920469999313354
Epoch: 22, Loss: 0.1284492313861847
Epoch: 23, Loss: 0.13192419707775116
Epoch: 24, Loss: 0.11141557991504669
Epoch: 25, Loss: 0.14446093142032623
Epoch: 26, Loss: 0.1350884735584259
Epoch: 27, Loss

In [None]:
def generate_answer(question, model, tokenizer, max_length=50):
    """Generate a continuation for ``question`` and return the decoded text.

    Args:
        question: Prompt text to feed to the model.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer used both to encode the prompt and to decode the
            generated ids; its ``eos_token_id`` is used as the pad id.
        max_length: Forwarded to ``model.generate`` as ``max_length``, which
            bounds the total sequence length (prompt included).

    Returns:
        The first generated sequence decoded with special tokens stripped.
        The decoded text contains the prompt followed by the continuation.
    """
    # Encode via __call__ so the attention mask comes back with the ids.
    encoded = tokenizer(question, return_tensors="pt")

    # Follow the model's own device instead of relying on a notebook global.
    target_device = model.device
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    # Pass the mask explicitly: pad and EOS share an id, so it cannot be
    # inferred from the ids alone.
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Only one sequence is requested; decode it to plain text.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: switch the model to evaluation mode, then ask one question.
model.eval()
question = "what is the captial city of America?"
answer = generate_answer(question=question, model=model, tokenizer=tokenizer)
print(answer)

what is the captial city of America? Chicago


In [None]:
# Second probe question; evaluation mode is set again so the cell also works on its own.
model.eval()
question = "what is the color of blood?"
answer = generate_answer(question=question, model=model, tokenizer=tokenizer)
print(answer)

what is the color of blood? Red
