# Part 2: PyTorch Transformer text generation with a pre-trained GPT-2 model

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from datasets import Dataset as HFDataset
import logging
logging.getLogger().setLevel(logging.CRITICAL)
import warnings
import psutil
import ipywidgets as widgets
from IPython.display import display
warnings.filterwarnings('ignore')

2024-05-26 00:15:11.123570: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 00:15:11.123610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 00:15:11.124673: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-26 00:15:11.131711: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Loading only the `description` column from the dataset

I chose the Netflix descriptions dataset. It has over 8,800 rows, but we train on only the first 1,000 because my machine does not have the hardware to train on a larger dataset and still get effective results; in this case we trade the amount of data for more epochs and a manageable training time.

In [2]:
# Read the Netflix catalogue and keep the first 1000 entries of its 'description' column
dataset_path = 'netflix_titles.csv'
descriptions = pd.read_csv(dataset_path).loc[:, 'description'].iloc[:1000]

In [3]:
# Preview the selected descriptions (the bare expression is rendered as the cell output)
descriptions

0      As her father nears the end of his life, filmm...
1      After crossing paths at a party, a Cape Town t...
2      To protect his family from a powerful drug lor...
3      Feuds, flirtations and toilet talk go down amo...
4      In a city of coaching centers known to train I...
                             ...                        
995    In 1974, a rural town in Anatolia gets its fir...
996    Truth and illusion blurs when a homeless amnes...
997    Using innovative technology, this docuseries e...
998    Journalists and fans await Ma Anand Sheela as ...
999    A three-person crew on a mission to Mars faces...
Name: description, Length: 1000, dtype: object

### We use the GPU if available; my machine has an AMD GPU, which is not CUDA-compatible (CUDA requires NVIDIA hardware), so everything runs on the CPU

In [4]:
# Prefer a CUDA device when PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

# Loading the GPT-2 medium model and its tokenizer from the Hugging Face Transformers library

In [5]:
# Name of the pre-trained checkpoint to start from
model_name = 'gpt2-medium'

# Tokenizer configured with explicit start-of-text, end-of-text and padding markers
tokenizer = GPT2Tokenizer.from_pretrained(
    model_name,
    pad_token='<|pad|>',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
)

# Load the language-model weights, then move them to the selected device
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# Resize the embedding matrix to the tokenizer's vocabulary size;
# the call is left as the last expression so the resized layer is shown as the cell output
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 1024)

### Setting the maximum sequence length to the token length of the longest description in our data

In [6]:
# Token count of the longest description; used as the fixed sequence length when tokenizing the data
max_length = max(len(tokenizer.encode(description)) for description in descriptions)

In [7]:
# Define a custom dataset class for Netflix descriptions
class NetflixDataset(Dataset):
    """Tokenized Netflix descriptions, padded or truncated to a fixed length.

    Every description is wrapped in the tokenizer's start-of-text and
    end-of-text markers before it is encoded, so each training sequence
    carries explicit boundaries.

    Args:
        txt_list: Iterable of description strings.
        tokenizer: Callable tokenizer. It is called with
            ``truncation=True, max_length=..., padding="max_length"`` and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``.
            Its ``bos_token`` / ``eos_token`` attributes, when set, are used
            as the boundary markers.
        max_length: Fixed sequence length every sample is padded/truncated to.

    Each item is a ``(input_ids, attention_mask)`` pair of tensors.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        # Never populated by this class; kept so existing attribute access keeps working.
        self.labels = []

        # A missing marker (attribute absent or None) degrades to an empty string,
        # which reproduces the plain, unwrapped text.
        bos = getattr(tokenizer, 'bos_token', None) or ''
        eos = getattr(tokenizer, 'eos_token', None) or ''

        for txt in txt_list:
            # NOTE(review): truncation is applied after wrapping, so a text whose
            # wrapped form exceeds max_length loses its trailing end marker.
            encodings_dict = tokenizer(bos + txt + eos, truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

# Tokenize the loaded descriptions into a dataset of fixed-length samples
dataset = NetflixDataset(txt_list=descriptions, tokenizer=tokenizer, max_length=max_length)

In [8]:
# Hold out 10% of the samples for validation.
# The split is seeded so the same samples land in each subset on every
# Restart & Run All; an unseeded split changes with each kernel restart.
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, len(dataset) - train_size],
    generator=torch.Generator().manual_seed(42),
)

In [9]:
# Determine available memory
available_memory = psutil.virtual_memory().available / (1024 ** 3)  # Convert bytes to GB

# Calculate a reasonable batch size (this is an estimation, adjust based on actual usage)
# Here, we use a conservative estimate of 1GB per batch element as a starting point
# Adjust the memory per element based on your specific model and input size
memory_per_element = 1.0  # GB
# Halve the estimate for a safety margin, but never go below one sample per batch:
# with less than 2 GB free the raw estimate is 0, which is not a valid batch size.
initial_batch_size = max(1, int(available_memory // memory_per_element // 2))

# Training configuration: the train batch size is fixed, while the evaluation
# batch size follows the memory-based estimate above.
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 8,
    learning_rate = 1e-4,
    logging_steps = 10,
    per_device_train_batch_size =5,
    load_best_model_at_end = False,
    per_device_eval_batch_size=initial_batch_size,
    logging_dir ='./logs',
    report_to='none'
)

In [10]:
def _collate_batch(samples):
    """Stack (input_ids, attention_mask) sample pairs into one batch dictionary."""
    input_ids = torch.stack([sample[0] for sample in samples])
    attention_mask = torch.stack([sample[1] for sample in samples])
    # The labels are a separate copy of the input ids (padding positions included)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': input_ids.clone(),
    }

# Build the Trainer from the model, the training arguments and the two dataset splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=_collate_batch,
)

# Training and saving are left disabled below
# trainer.train()
# model.save_pretrained('./saved_model')
# tokenizer.save_pretrained('./saved_model')

We commented out the previous lines because the model was already trained and saved, and training took 24 hours.

In [11]:
# Function to choose a token from the top n probabilities
import numpy as np
def choose_from_top(probs, n=5):
    """Sample one index from the ``n`` largest entries of ``probs``.

    The ``n`` highest values are renormalized to sum to 1 and one of their
    indices is drawn at random (NumPy's global random state) with those weights.

    Args:
        probs: 1-D sequence or array of non-negative scores/probabilities.
        n: How many of the top entries to sample from. Values larger than
            ``len(probs)`` are clamped to ``len(probs)``.

    Returns:
        The chosen index as a plain ``int``.

    Raises:
        ValueError: If ``probs`` is empty or ``n`` is smaller than 1.
    """
    # Accept plain Python sequences as well as arrays
    probs = np.asarray(probs)
    # argpartition rejects a kth beyond the array bounds, so cap n at the number of entries
    n = min(int(n), probs.size)
    if n < 1:
        raise ValueError("choose_from_top requires a non-empty probs and n >= 1")

    # Indices of the n largest entries (in no particular order)
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # normalize
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)

## Loading the previously trained and saved model to evaluate and test it

In [12]:
# Replace the in-memory tokenizer and model with the fine-tuned versions stored on disk
tokenizer = GPT2Tokenizer.from_pretrained('./saved_model')
model = GPT2LMHeadModel.from_pretrained('./saved_model')
model = model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Text generation function using the saved, trained model

In [13]:
def generate_text(prompt, max_length=80):
    """Generate a continuation of ``prompt`` with the loaded model.

    Args:
        prompt: Text to continue. An empty prompt is replaced by the
            tokenizer's start-of-text marker (or its end-of-text marker when
            no start marker is set) so generation still has a token to start from.
        max_length: Upper bound on the total sequence length passed to
            ``model.generate`` (prompt tokens included).

    Returns:
        The decoded text of the first returned sequence, with special tokens removed.
    """
    model.eval()

    # An empty prompt tokenizes to zero input tokens, leaving generate()
    # nothing to continue from; seed it with a boundary marker instead.
    if not prompt:
        prompt = tokenizer.bos_token or tokenizer.eos_token

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        do_sample=True,
        top_k=70,
        temperature=3.0,
        num_beams=5,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Interface to test the model

In [14]:
# Click handler: feed the current prompt to the model and show the result in the output box
def on_generate_button_clicked(b):
    """Generate text for the prompt box's current value and write it to the output box."""
    output_textbox.value = generate_text(prompt_textbox.value)

# Editable text area where the user types the prompt
prompt_textbox = widgets.Textarea(
    description='Prompt:',
    placeholder='Enter your prompt here...',
    value='',
    layout=widgets.Layout(width='100%', height='100px'),
    disabled=False
)

# Button that triggers generation
generate_button = widgets.Button(
    description='Generate Text',
    tooltip='Click to generate text',
    icon='check',
    button_style='info',
    disabled=False
)

# Read-only text area that receives the generated text
output_textbox = widgets.Textarea(
    description='Output:',
    placeholder='Generated text will appear here...',
    value='',
    layout=widgets.Layout(width='100%', height='200px'),
    disabled=True
)

In [15]:
# Attach the click handler to the button
generate_button.on_click(on_generate_button_clicked)

# Render the prompt box, the button and the output box, in that order
display(
    prompt_textbox,
    generate_button,
    output_textbox,
)

Textarea(value='', description='Prompt:', layout=Layout(height='100px', width='100%'), placeholder='Enter your…

Button(button_style='info', description='Generate Text', icon='check', style=ButtonStyle(), tooltip='Click to …

Textarea(value='', description='Output:', disabled=True, layout=Layout(height='200px', width='100%'), placehol…