In [14]:
!nvidia-smi

Tue Jan 16 19:19:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P0              32W /  70W |    657MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install transformers



In [7]:
import json
import os
import glob

import random
import numpy as np
import pandas as pd

from tqdm import tqdm

from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoModelWithLMHead, AutoTokenizer, GPT2LMHeadModel, GPT2Config
from transformers import AdamW

from torch.nn.functional import softmax
from nltk.translate.bleu_score import corpus_bleu

from IPython import display


# (a) Process the Data

In [3]:
import re
from sklearn.model_selection import train_test_split

# Assuming you have a file containing the poems
poems_path = 'ferdousi.txt'

# Read and preprocess the data
with open(poems_path, 'r', encoding='utf-8') as file:
    poems_text = file.read()

# Split the poems into verses
verses = re.split('\n', poems_text)
verses = [verse.strip() for verse in verses if verse.strip()]

# Create input-output pairs (line pairs)
pairs = [(verses[i], verses[i+1], verses[i+2], verses[i+3]) for i in range(0, len(verses)-3, 2)]

# Split the data into training and test sets
train_pairs, test_pairs = train_test_split(pairs, test_size=0.1, random_state=42)


In [4]:
print(len(train_pairs))
print(len(test_pairs))

20660
2296


# (b) Build Data Loaders

In [5]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Use the pre-trained GPT2 tokenizer
model_name_or_path = "HooshvareLab/gpt2-fa"
#tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, bos_token='', eos_token='', pad_token='')

class PoetryDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_length=256):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        input_verse1, input_verse2, target_verse1, target_verse2 = self.pairs[idx]

        # Tokenize input and target verses
        input_encoding = self.tokenizer(input_verse1 + " " + input_verse2, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
        target_encoding = self.tokenizer(target_verse1 + " " + target_verse2, truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask_input': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze(),
            'attention_mask_target': target_encoding['attention_mask'].squeeze()
        }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Create data loaders
train_dataset = PoetryDataset(train_pairs, tokenizer)
test_dataset = PoetryDataset(test_pairs, tokenizer)

batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# (c) Load Pre-trained Model and Fine-tune

In [7]:
from transformers import GPT2LMHeadModel, GPT2Config, AdamW

# Load pre-trained GPT2 model
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
model.resize_token_embeddings(len(tokenizer))


Embedding(42001, 768)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(42001, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=42001, bias=False)
)

In [9]:
# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask_input = batch['attention_mask_input'].to(device)
        labels = batch['labels'].to(device)
        attention_mask_target = batch['attention_mask_target'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask_input, labels=labels, attention_mask_target=attention_mask_target)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

# Evaluate the model on the test set
model.eval()
test_loss = 0

for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask_input = batch['attention_mask_input'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask_input, labels=labels)
        test_loss += outputs.loss.item()

average_test_loss = test_loss / len(test_loader)
print(f"Average Test Loss: {average_test_loss}")


# Save the fine-tuned model if needed
model.save_pretrained("fine_tuned_model")




Epoch 1/1, Average Loss: 0.44028428139217685
Average Test Loss: 0.3559824666495107


# (d) Generate Sentences and Evaluate on Test Set

In [14]:
from nltk.translate.bleu_score import corpus_bleu

def generate_sentence(model, tokenizer, input_sentence, max_length=256, temperature=1.0):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer.encode(input_sentence, return_tensors="pt").to(device)

        # Attention mask for generation
        attention_mask = torch.ones_like(input_ids).to(device)

        # Generate sentence
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            attention_mask=attention_mask
        )

        # Filter out input tokens from generated tokens
        generated_sentence = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True)

    return generated_sentence


# Evaluate on the test set using BLEU score
references = [test_pair[2] + " " + test_pair[3] for test_pair in test_pairs]  # Target verses in the test set

generated_sentences = []
for test_pair in test_pairs:
    input_verse1, input_verse2, target_verse1, target_verse2 = test_pair
    input_sentence = input_verse1 + " " + input_verse2
    target_sentence = target_verse1 + " " + target_verse2
    print("-------------------------new test-----------------------")
    print(f"Input Sentence: {input_sentence}")
    generated_sentence = generate_sentence(model, tokenizer, input_sentence)
    print(f"Generated Sentence: {generated_sentence}")
    print(f"Target Sentence: {target_sentence}")
    generated_sentences.append(generated_sentence)

bleu_score = corpus_bleu([[ref.split()] for ref in references], [gen.split() for gen in generated_sentences])
print(f"BLEU Score on Test Set: {bleu_score}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Generated Sentence: ستاره بدان دشت نظاره بود که این لشکر از جنگ بیچاره بود
Target Sentence: بگشتیم گرد دژ ایدر بسی ندیدیم جز کینه درمان کسی
-------------------------new test-----------------------
Input Sentence: غمی گشت رستم ز گفتار اوی بر شاه کاووس بنهاد روی
Generated Sentence: غمی گشت رستم ز گفتار اوی بر شاه کاووس بنهاد روی
Target Sentence: چو کاووس کی پهلوان را بدید بر خویش نزدیک جایش گزید
-------------------------new test-----------------------
Input Sentence: همی گفت زار این جهانبین من سوار سرافراز رویین من
Generated Sentence: همی گفت زار این جهانبین من سوار سرافراز رویین من
Target Sentence: جهانجوی لهاک و فرشیدورد سواران و گردان روز نبرد
-------------------------new test-----------------------
Input Sentence: همه رزمگه دخمه ها ساختند ازان کشتگان چو بپرداختند
Generated Sentence: همه رزمگه دخمه ها ساختند ازان کشتگان چو بپرداختند
Target Sentence: ز چیزی که بود اندران رزمگاه ببخشید شاه جهان بر سپاه
--------------------

KeyboardInterrupt: 


# Since I didn't get the desirable output, I tried the LSTM model as well. This work is attached as Q3_HW4_part2.