In [1]:
######imports

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
!pip install transformers torch

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import utilities as util
import torch.optim as optim
file_loc = './Eminem_Lyrics.csv'
songs = util.import_data(file_loc)
songs['Lyrics'] = songs['Lyrics'].apply(util.handle_special)
songs['Lyrics'] = songs['Lyrics'].apply(util.remove_non_ascii_and_print)
songs['Lyrics'] = songs['Lyrics'].apply(util.expand_contractions, args=(util.contractions_dict,))


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens = {'pad_token': '<PAD>'}
tokenizer.add_special_tokens(special_tokens)


class SongDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<startsong> '+ txt + ' <endsong>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


# Assuming `songs` is a list containing all your song lyrics
dataset = SongDataset(songs, tokenizer, max_length=512)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Move the model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



from transformers import AdamW, get_linear_schedule_with_warmup
epochs = 4
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * epochs)


# Training loop with gradient accumulation
epochs = 4
gradient_accumulation_steps = 4  # Increase this if facing memory issues
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_loss = 0
    model.zero_grad()  # Move the zero_grad() outside the batch loop for gradient accumulation
    for batch_idx, (input_ids, masks) in enumerate(dataloader):
        input_ids, masks = input_ids.to(device), masks.to(device)
        outputs = model(input_ids, labels=input_ids, attention_mask=masks)
        loss = outputs.loss / gradient_accumulation_steps  # Scale the loss
        loss.backward()
        total_loss += loss.item()

        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}/{len(dataloader)} - Loss: {loss.item()}")

            # Decode and print the input, target, and prediction
            input_tokens = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            target_tokens = tokenizer.decode(input_ids[0], skip_special_tokens=True)  # same as input for LM
            prediction_ids = torch.argmax(outputs.logits, dim=-1)[0]
            prediction_tokens = tokenizer.decode(prediction_ids, skip_special_tokens=True)

            print(f"  Input Sequence: {input_tokens}")
            print(f"  Target Sequence: {target_tokens}")
            print(f"  Prediction: {prediction_tokens}\n")

    avg_loss = total_loss / len(dataloader)
    print(f"Average Loss: {avg_loss}\n")



model.eval()
prompt = "<startsong>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

sample_outputs = model.generate(generated, do_sample=True, top_k=50, max_length=300, top_p=0.95, num_return_sequences=3)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i+1, tokenizer.decode(sample_output.tolist(), skip_special_tokens=True)))


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/4
Batch 0/2 - Loss: 0.31036561727523804
  Input Sequence: <startsong> Album_URL <endsong>
  Target Sequence: <startsong> Album_URL <endsong>
  Prediction: 

Average Loss: 0.31894832849502563

Epoch 2/4
Batch 0/2 - Loss: 0.3214016258716583
  Input Sequence: <startsong> Lyrics <endsong>
  Target Sequence: <startsong> Lyrics <endsong>
  Prediction: ong

Average Loss: 0.33390548825263977

Epoch 3/4
Batch 0/2 - Loss: 0.34421902894973755
  Input Sequence: <startsong> Release_date <endsong>
  Target Sequence: <startsong> Release_date <endsong>
  Prediction: >st>

Average Loss: 0.3350346088409424

Epoch 4/4
Batch 0/2 - Loss: 0.34364473819732666
  Input Sequence: <startsong> Lyrics <endsong>
  Target Sequence: <startsong> Lyrics <endsong>
  Prediction: 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Average Loss: 0.3157804161310196

1: <startsong>


2: <startsong>


3: <startsong>


