In [1]:
from google.colab import drive
import pandas as pd
import transformers
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
csv_file = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Preprocessed-Datasets/shortjokes/shortjokes.csv')

data = csv_file['Body'].head(2000).to_numpy()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
gpt2_tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

gpt2_model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))
gpt2_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
class JokesDataset(Dataset):
  def __init__(self, jokes, tokenizer, max_length):
    self.jokes = jokes
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.jokes)

  def __getitem__(self, idx):
    joke = self.jokes[idx]

    joke_encoding = self.tokenizer(
        joke,
        truncation=True,
        padding='max_length',
        max_length=self.max_length,
        return_tensors='pt'
    )

    input_ids = joke_encoding['input_ids'].squeeze(0)
    attention_mask = joke_encoding['attention_mask'].squeeze(0)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }

In [6]:
dataset = JokesDataset(data, gpt2_tokenizer, 512)

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [7]:
def train_model(model, dataloader, learning_rate, num_epochs):

  model.train()

  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  for epoch in range(num_epochs):
    epoch_loss = 0.0

    for batch_idx, batch in enumerate(dataloader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)

      optimizer.zero_grad()

      outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
      loss = outputs.loss
      epoch_loss += loss.item()

      loss.backward()
      optimizer.step()

      print(f'Batch {batch_idx+1}/{len(dataloader)} Loss: {loss.item()}')

    print(f'Epoch {epoch+1}/{num_epochs} Loss: {epoch_loss/len(dataloader)}')



In [8]:
train_model(gpt2_model, dataloader, 0.001, 1)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Batch 1/250 Loss: 9.96807861328125
Batch 2/250 Loss: 0.41449159383773804
Batch 3/250 Loss: 0.5355082750320435
Batch 4/250 Loss: 0.8849827647209167
Batch 5/250 Loss: 0.6395533680915833
Batch 6/250 Loss: 0.49662238359451294
Batch 7/250 Loss: 0.563899040222168
Batch 8/250 Loss: 0.5663633346557617
Batch 9/250 Loss: 0.3980436325073242
Batch 10/250 Loss: 0.5258944630622864
Batch 11/250 Loss: 0.5702910423278809
Batch 12/250 Loss: 0.6791684627532959
Batch 13/250 Loss: 0.38887205719947815
Batch 14/250 Loss: 0.32680636644363403
Batch 15/250 Loss: 0.3648949861526489
Batch 16/250 Loss: 0.3701474666595459
Batch 17/250 Loss: 0.48997610807418823
Batch 18/250 Loss: 0.4076242744922638
Batch 19/250 Loss: 0.4282248616218567
Batch 20/250 Loss: 0.38849762082099915
Batch 21/250 Loss: 0.3477745056152344
Batch 22/250 Loss: 0.35904189944267273
Batch 23/250 Loss: 0.2839856743812561
Batch 24/250 Loss: 0.3015250861644745
Batch 25/250 Loss: 0.3062423765659332
Batch 26/250 Loss: 0.26768290996551514
Batch 27/250 Los

In [52]:
def generate_joke(model, tokenizer, input_prompt):

  model.eval()

  input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
  input_ids = input_ids.to(device)

  with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=torch.ones(input_ids.shape, dtype=torch.long).to(device),
        max_length=30000,
        # num_return_sequences=3,
        num_beams=10,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,

        do_sample = True,
        temperature = 0.8,
        top_k = 25,
        top_p = 0.9,
    )

  generated_joke = tokenizer.decode(output[0], skip_special_tokens=True)

  return generated_joke

In [53]:
answer = generate_joke(gpt2_model, gpt2_tokenizer, 'Bob')
print('Answer:', answer, sep='\n\n')

Answer:

Bob: What's the difference between a woman and a man? A woman: Well, you can say the same thing. A man: I don't know what you're talking about.
