In [None]:
######imports

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
!pip install transformers torch

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import utilities as util

# Load the raw lyrics, then normalise them with the helpers from `utilities`.
# NOTE(review): assumes `import_data` returns a pandas-style object exposing
# `.apply` (e.g. a Series of lyric strings) -- confirm in utilities.
location = "/Eminem_Lyrics.csv"
songs = util.import_data(location)

# The clean-up passes run in this fixed order; each one consumes the output
# of the previous pass.
for cleanup_step in (
    util.clean_text,
    util.remove_non_ascii_and_print,
    util.expand_contractions,
):
    songs = songs.apply(lambda lyric, step=cleanup_step: step(lyric))

# Start from the stock GPT-2 vocabulary and register every extra token this
# script relies on.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens = {
    # GPT-2 ships without a padding token; one is required for
    # padding="max_length" when the songs are encoded.
    'pad_token': '<PAD>',
    # The song delimiters must be registered too. Otherwise they are split
    # into several BPE pieces, eat into the per-song token budget, and are
    # not removed by `decode(..., skip_special_tokens=True)`.
    'additional_special_tokens': ['<startsong>', '<endsong>'],
}
tokenizer.add_special_tokens(special_tokens)


class SongDataset(Dataset):
    """Song texts tokenised once, up front, and served as tensor pairs.

    Each text is wrapped as ``'<startsong> ' + txt + ' <endsong>'`` and passed
    to ``tokenizer`` with truncation enabled and padding to ``max_length``.
    The resulting ``input_ids`` and ``attention_mask`` entries are stored as
    tensors.

    Args:
        txt_list: Iterable of strings, one per song.
        tokenizer: Callable accepting the text plus ``truncation``,
            ``max_length`` and ``padding`` keywords, and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        encoded_songs = [
            tokenizer(
                '<startsong> ' + lyric + ' <endsong>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            for lyric in txt_list
        ]
        self.input_ids = [
            torch.tensor(enc['input_ids']) for enc in encoded_songs
        ]
        self.attn_masks = [
            torch.tensor(enc['attention_mask']) for enc in encoded_songs
        ]

    def __len__(self):
        """Return the number of encoded songs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask


# Tokenise every entry of `songs` once (truncated/padded to 512 tokens) and
# serve the result in shuffled mini-batches of four.
# NOTE(review): SongDataset iterates `songs` directly, so it must yield one
# lyric string per item -- confirm what `util.import_data` returns.
dataset = SongDataset(txt_list=songs, tokenizer=tokenizer, max_length=512)
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)


from transformers import GPT2LMHeadModel

# Load pretrained GPT-2 and resize its token embeddings to the tokenizer's
# current length, since tokens were added to the tokenizer after loading.
model = GPT2LMHeadModel.from_pretrained('gpt2')
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)



from transformers import AdamW, get_linear_schedule_with_warmup
# Fine-tune GPT-2 on the lyrics: AdamW with a learning rate that decays
# linearly to zero over the whole run (no warm-up steps).
# NOTE(review): newer transformers releases deprecate `transformers.AdamW` in
# favour of `torch.optim.AdamW` -- confirm against the installed version.
epochs = 4
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    # The scheduler is stepped once per batch, so the schedule must span
    # every batch of every epoch.
    num_training_steps=len(dataloader) * epochs,
)

model.train()
for epoch in range(epochs):
    for batch in dataloader:
        # SongDataset yields (input_ids, attention_mask) pairs. There is no
        # separate label tensor, so unpacking three values raised ValueError
        # on the very first batch.
        b_input_ids, b_masks = batch
        b_input_ids = b_input_ids.to(device)
        b_masks = b_masks.to(device)

        # For causal language modelling the labels are the inputs themselves
        # (the model shifts them internally). Padding positions are set to
        # -100 so the loss ignores them instead of learning to emit padding.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()


# Switch to evaluation mode and sample three continuations of the start marker.
model.eval()
prompt = "<startsong>"

# Encode the prompt as a one-row batch and move it to the model's device.
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

# Sampling restricted by top-k (50) and nucleus (p=0.95) filtering; each
# sequence is capped at 300 tokens including the prompt.
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=3,
)

# Print the samples numbered from 1, decoded with special tokens skipped.
for rank, sequence in enumerate(sample_outputs, start=1):
    text = tokenizer.decode(sequence.tolist(), skip_special_tokens=True)
    print("{}: {}\n\n".format(rank, text))
