**Code adapted from this Medium post:** https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

In [None]:
pip install transformers==4.5.0

In [None]:
import gc
import os

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
# Shell out to nvidia-smi to check which GPU (if any) this session can see.
!nvidia-smi

In [None]:
# Fix PyTorch's global RNG seed so runs are repeatable.
torch.manual_seed(seed=42)

### Loading the GPT-2 tokenizer and model (`mrm8488/GPT-2-finetuned-common_gen`) from the 🤗 Model Hub

In [None]:
# Earlier experiment, kept for reference only: a GPT2Tokenizer from "EleutherAI/gpt-neo-1.3B"
# paired with GPTNeoForCausalLM weights from "Martian/Neo-GPT-Title-Generation-Electric-Car".

# Tokenizer with explicit start / end / padding markers.
tokenizer = GPT2Tokenizer.from_pretrained(
    'mrm8488/GPT-2-finetuned-common_gen',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Load the matching language-model weights and move them to the GPU.
model = GPT2LMHeadModel.from_pretrained('mrm8488/GPT-2-finetuned-common_gen')
model = model.cuda()

# Size the embedding matrix to the tokenizer's vocabulary (the markers above may have added tokens).
model.resize_token_embeddings(len(tokenizer))


In [None]:
# Read the Poetry Foundation CSV into a DataFrame.
descriptions = pd.read_csv(filepath_or_buffer="../input/poetry-foundation-poems/PoetryFoundationData.csv")

In [None]:
# Replace every "\r\r\n" sequence inside each title with a single space.
descriptions["Title"] = descriptions["Title"].map(lambda title: title.replace("\r\r\n", " "))

In [None]:
# From here on `descriptions` is just the Title column (a Series), not the full frame.
descriptions = descriptions.loc[:, "Title"]

In [None]:
# Display the titles as a quick sanity check.
descriptions

In [None]:
# Longest tokenized sample, measured on exactly the string the dataset encodes:
# the title wrapped in the start/end markers. Measuring the bare title makes
# `max_length` too small for the longest samples, and truncation to that length
# would then cut off their '<|endoftext|>' marker.
max_length = max(
    len(tokenizer.encode('<|startoftext|>' + description + '<|endoftext|>'))
    for description in descriptions
)

In [None]:
# Show the token-length limit that will be used for padding/truncation.
max_length

In [None]:
class NetflixDataset(Dataset):
    """Pre-tokenized text samples for causal language-model fine-tuning.

    Each text is wrapped in '<|startoftext|>' / '<|endoftext|>', tokenized with
    truncation and padding to ``max_length``, and stored as tensors up front.

    Args:
        txt_list: iterable of strings to encode.
        tokenizer: callable returning a mapping with 'input_ids' and
            'attention_mask' for a single string.
        max_length: length every sample is truncated / padded to.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []  # never filled here; kept so the attribute still exists
        for txt in txt_list:
            wrapped_text = '<|startoftext|>' + txt + '<|endoftext|>'
            encoded = tokenizer(
                wrapped_text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            ids_tensor = torch.tensor(encoded['input_ids'])
            self.input_ids.append(ids_tensor)
            mask_tensor = torch.tensor(encoded['attention_mask'])
            self.attn_masks.append(mask_tensor)

    def __len__(self):
        """Number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors for sample ``idx``."""
        ids_tensor = self.input_ids[idx]
        mask_tensor = self.attn_masks[idx]
        return ids_tensor, mask_tensor

In [None]:
# Encode every title, then make a random 90/10 train/validation split.
dataset = NetflixDataset(txt_list=descriptions, tokenizer=tokenizer, max_length=max_length)
train_size = int(len(dataset) * 0.9)
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, len(dataset) - train_size])


In [None]:
# Run a full garbage-collection pass before training; `gc` comes from the top import cell.
gc.collect()

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using before training starts.
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters and output locations for the Trainer run.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.05,
    logging_steps=100,
    save_steps=1000,
)


In [None]:
def collate_batch(batch):
    """Stack ``(input_ids, attention_mask)`` pairs into a model-ready batch.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs, as returned
            by the dataset's ``__getitem__``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Labels are a copy of
        the input ids with padded positions (attention_mask == 0) set to -100, so
        the loss skips padding instead of training the model to emit pad tokens.
    """
    input_ids = torch.stack([sample[0] for sample in batch])
    attention_mask = torch.stack([sample[1] for sample in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # -100 is the ignore index of the LM loss
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Keep a handle on the trainer so it can be reused after training (e.g. to evaluate or save).
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  eval_dataset=val_dataset, data_collator=collate_batch)
trainer.train()

### GPT-Generated Titles

In [None]:
# Optionally reload a model object pickled in an earlier session.
# NOTE(review): nothing in this notebook writes "./model.pth" (the save cell further
# down writes 'model.pt'), so on a fresh run the file is usually absent. Skip the load
# in that case instead of aborting "Run All" with FileNotFoundError.
# SECURITY: torch.load unpickles arbitrary Python objects -- only load files you created.
CHECKPOINT_PATH = "./model.pth"
if os.path.exists(CHECKPOINT_PATH):
    modela = torch.load(CHECKPOINT_PATH)
else:
    modela = None
    print(f"{CHECKPOINT_PATH} not found; skipping checkpoint load.")

In [None]:
# Encode the generation prompt (start marker plus a space) and move the ids to the GPU.
generated = tokenizer("<|startoftext|> ", return_tensors="pt")["input_ids"].cuda()

In [None]:
# The model may still be in training mode after the Trainer run; switch to eval mode
# so dropout is disabled while sampling.
model.eval()

# Sample 2000 continuations of the prompt with top-k / nucleus sampling.
# pad_token_id is passed explicitly so finished sequences are padded with the
# tokenizer's pad token rather than falling back to the EOS id.
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                max_length=50, top_p=0.95, temperature=1, num_return_sequences=2000,
                                pad_token_id=tokenizer.pad_token_id)

# Print each sample with its index; special tokens are stripped from the decoded text.
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

In [None]:
# Pickle the whole fine-tuned model object to 'model.pt' in the working directory.
torch.save(obj=model, f='model.pt')

### Original Titles (Random Sample)

In [None]:
# Widen the column display so long titles are not cut off, then show 10 random originals.
pd.set_option("display.max_colwidth", 1000)
descriptions.sample(n=10)