https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook


In [None]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
import torch


def _cuda_unavailable():
    """Replacement for ``torch.cuda.is_available`` that always returns False."""
    return False


# Monkey-patch torch's CUDA availability check so that any later caller asking
# whether a GPU can be used is told "no".
# NOTE(review): presumably done to keep the run on the CPU -- confirm.
torch.cuda.is_available = _cuda_unavailable

In [None]:
# IPython shell escape (not plain Python): runs the `nvidia-smi` command and
# shows its output in the notebook.
!nvidia-smi

In [None]:
# Seed PyTorch's global random number generator with a fixed value (42) so that
# torch-driven randomness in later cells is intended to repeat between runs.
torch.manual_seed(42)

In [None]:
# Special-token strings handed to the tokenizer on top of the pretrained
# 'gpt2-medium' vocabulary.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>',
}
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', **special_tokens)

# TODO: the model is kept on the CPU because of OOM.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.cpu()

# Resize the model's token embeddings to the tokenizer's current length, which
# accounts for the special tokens configured above.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Read the corpus CSV from the working directory and print how many non-null
# entries its 'text' column holds.
dataframe = pd.read_csv('./cobol_corpus.csv')
text_count = dataframe['text'].count()
print(text_count)

In [None]:
# Show the dtype of every column (rendered as this cell's output).
dataframe.dtypes

In [None]:
# Drop rows whose 'text' is missing *before* casting. On an object column,
# astype(str) turns every missing value into the literal string 'nan'; after
# that it can no longer be told apart from real text and would be tokenised and
# trained on like any other line.
dataframe = dataframe.dropna(subset=['text'])

# Force every remaining entry to a Python string so the tokenizer never
# receives a non-string value (e.g. a line that read_csv parsed as a number).
dataframe['text'] = dataframe['text'].astype(str)
dataframe.dtypes

In [None]:
# Sanity check: print the type of every 'text' entry that is not a str.
# Nothing is printed when all entries are already strings.
for entry in dataframe['text']:
    if not isinstance(entry, str):
        print(type(entry))

In [None]:
import numpy as np

# Remove empty rows.
# The edits are written back by assignment rather than with inplace=True on
# `dataframe['text']`: an in-place call on a column selection is not guaranteed
# to reach the parent frame, and dropping values from a single column can never
# remove rows from the DataFrame itself.
dataframe['text'] = dataframe['text'].replace('', np.nan)

# Drop every row whose 'text' is now missing, then show the remaining count.
dataframe = dataframe.dropna(subset=['text'])
dataframe['text'].count()

In [None]:
# Sequence length, in tokens, that is later passed to CodeDataset, where every
# sample is padded or truncated to it.
# NOTE(review): "out of sample only" presumably means 65 was measured on the
# 1,000,000-row sample rather than on the full corpus -- confirm before a full run.
max_length = 65 # out of sample only

# TODO: need to run without sample.
# The commented-out line below measures the longest tokenised line of the sample.
# It encodes the raw line only, i.e. without the '<|startoftext|>' and
# '<|endoftext|>' markers that CodeDataset wraps around every text.
#max_length = max([len(tokenizer.encode(line)) for line in tqdm(dataframe['text'].sample(n=1_000_000, random_state=42))])

# Report the value that will be used.
print(f'Token max length is {max_length}')

In [None]:
class CodeDataset(Dataset):
    """In-memory dataset of tokenised text samples for language-model training.

    Every text is wrapped as '<|startoftext|>' + text + '<|endoftext|>',
    tokenised, and padded or truncated to ``max_length`` tokens. All samples
    are tokenised eagerly in ``__init__`` and stored as tensors.

    Args:
        txt_list: Iterable of strings, one sample each.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length")`` that returns a mapping with
            'input_ids' and 'attention_mask' lists. If it exposes
            ``eos_token_id``, that id is used to restore the end-of-text marker
            on truncated samples.
        max_length: Target length, in tokens, of every sample.

    Attributes:
        input_ids: List of token-id tensors, one per sample.
        attn_masks: List of attention-mask tensors, aligned with ``input_ids``.
        labels: Always an empty list; kept only for backward compatibility.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        super().__init__()
        self.input_ids = []
        self.attn_masks = []
        # Never populated by this class; retained so existing attribute access
        # keeps working.
        self.labels = []

        # Not every callable tokenizer has an EOS id; skip the repair if absent.
        eos_id = getattr(tokenizer, 'eos_token_id', None)

        for txt in tqdm(txt_list):
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            input_ids = list(encodings_dict['input_ids'])
            attention_mask = encodings_dict['attention_mask']

            # When a sample is too long, truncation removes tokens from the end
            # (the tokenizer's default side), which drops '<|endoftext|>'.
            # A final position that is attended (not padding) yet is not EOS
            # means the sample was cut: put EOS back in the last slot so every
            # sample still ends with the end-of-text marker.
            if (eos_id is not None and input_ids
                    and attention_mask[-1] == 1 and input_ids[-1] != eos_id):
                input_ids[-1] = eos_id

            self.input_ids.append(torch.tensor(input_ids))
            self.attn_masks.append(torch.tensor(attention_mask))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
# TODO: need to run without sample
# Tokenise a fixed-seed 1,000,000-row sample of the 'text' column, then split
# the result 90% / 10% into training and validation subsets.
sampled_texts = dataframe['text'].sample(n=1_000_000, random_state=42)
dataset = CodeDataset(sampled_texts, tokenizer, max_length=max_length)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release the cached CUDA
# memory it is holding but not currently using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Training configuration: a single epoch with batch size 1 for both training
# and evaluation, periodic logging and checkpointing, and no external
# experiment reporting.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    logging_steps=100,
    save_steps=5000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.05,
    logging_dir='./logs',
    report_to='none',
)

In [None]:
def collate_batch(data):
    """Stack dataset items into one batch for the language model.

    Args:
        data: List of ``(input_ids, attention_mask)`` tensor pairs, as returned
            by ``CodeDataset.__getitem__``. All tensors must share one shape so
            they can be stacked.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels'. The labels are a
        copy of the input ids in which every position with attention mask 0
        (padding) is set to -100.
    """
    input_ids = torch.stack([f[0] for f in data])
    attention_mask = torch.stack([f[1] for f in data])

    # Labels equal to -100 are ignored by the model's loss. Without this mask
    # the loss would also be computed on the padding that fills each sample up
    # to max_length.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


# Build the trainer and start fine-tuning right away; the result of train() is
# this cell's output.
Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=collate_batch).train()