In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install torch transformers pytorch-lightning datasets



In [1]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pytorch_lightning as pl
from typing import Optional, Dict
import pandas as pd

In [54]:
import os
import pandas as pd
import torch
import pytorch_lightning as pl
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer, 
    DataCollatorForSeq2Seq
)
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor


In [50]:
class TaylorSwiftLineCompletionModel(pl.LightningModule):
    def __init__(self, model_name: str, learning_rate: float, max_length: int):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.learning_rate = learning_rate
        self.max_length = max_length
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

    def generate_line_completion(self, prompt: str):
        inputs = self.tokenizer(
            prompt, 
            return_tensors="pt", 
            truncation=True, 
            max_length=self.max_length
        ).to(self.device)
        
        outputs = self.model.generate(
            input_ids=inputs["input_ids"], 
            attention_mask=inputs["attention_mask"], 
            max_length=self.max_length, 
            num_beams=3, 
            early_stopping=True
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [51]:
def prepare_taylor_swift_dataset(csv_path: str):
    df = pd.read_csv(csv_path, encoding='ISO-8859-1')
    
    if 'lyric' in df.columns:
        df = df.rename(columns={"lyric": "lyrics"})
    
    df = df.dropna(subset=['lyrics'])
    
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
    
    special_tokens = ["<line>", "</line>"]
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

    def tokenize_function(examples):
        input_texts = []
        target_texts = []
        
        for i in range(len(examples['lyrics']) - 1):
            input_texts.append(f"<line> {examples['lyrics'][i]} </line>")
            target_texts.append(f"<line> {examples['lyrics'][i+1]} </line>")
        
        inputs = tokenizer(input_texts, 
                           max_length=256, 
                           truncation=True, 
                           padding=True)
        targets = tokenizer(target_texts, 
                            max_length=256, 
                            truncation=True, 
                            padding=True)
        
        inputs["labels"] = targets["input_ids"]
        return inputs

    dataset = Dataset.from_pandas(df[['lyrics']])
    dataset = dataset.map(tokenize_function, 
                          batched=True, 
                          remove_columns=['lyrics'])
    
    dataset = dataset.train_test_split(test_size=0.1)
    
    return dataset, tokenizer

In [56]:
def train_taylor_swift_line_completion_model(csv_path: str, save_dir: str = "model_checkpoints"):
    pl.seed_everything(42)
    
    # Ensure save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    dataset, tokenizer = prepare_taylor_swift_dataset(csv_path)
    
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, 
        model="facebook/bart-base"
    )
    
    train_loader = DataLoader(
        dataset["train"], 
        batch_size=4, 
        shuffle=True, 
        collate_fn=data_collator
    )
    val_loader = DataLoader(
        dataset["test"], 
        batch_size=4, 
        collate_fn=data_collator
    )
    
    # Configure model checkpointing
    checkpoint_callback = ModelCheckpoint(
        dirpath=save_dir,
        filename='taylor_swift_model-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )
    
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    
    # Initialize model
    model = TaylorSwiftLineCompletionModel(
        model_name="facebook/bart-base",
        learning_rate=2e-4,
        max_length=256
    )
    
    model.model.resize_token_embeddings(len(tokenizer))
    
    # Configure trainer with verbose output
    trainer = pl.Trainer(
        max_epochs=5,
        devices=1,
        accelerator="gpu",
        precision=32,
        accumulate_grad_batches=2,
        gradient_clip_val=0.5,
        val_check_interval=0.25,
        callbacks=[checkpoint_callback, lr_monitor],
        enable_model_summary=True,
        log_every_n_steps=10
    )
    
    # Train model
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    
    # Save final model and tokenizer
    final_save_path = os.path.join(save_dir, "final_model")
    model.model.save_pretrained(final_save_path)
    tokenizer.save_pretrained(final_save_path)
    
    return model, tokenizer

In [57]:
    csv_path = "/kaggle/input/taylor-swift-song-lyrics-from-all-the-albums/taylor_swift_lyrics.csv"
    trained_model, tokenizer = train_taylor_swift_line_completion_model(csv_path)
    
    prompts = [
        "I remember when we broke up the first time",
        "Saying goodbye is death by a thousand cuts",
        "And I can go anywhere I want"
    ]
    
    print("Generated Line Completions:")
    for prompt in prompts:
        completion = trained_model.generate_line_completion(prompt)
        print(f"Prompt: {prompt}")
        print(f"Completion: {completion}\n")


Map:   0%|          | 0/4862 [00:00<?, ? examples/s]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Generated Line Completions:
Prompt: I remember when we broke up the first time
Completion:  In the darkest little paradise 

Prompt: Saying goodbye is death by a thousand cuts
Completion:  All I know is that you held the door 

Prompt: And I can go anywhere I want
Completion:  Cause I'm not the kind of girl 



In [58]:
import shutil
import os
from IPython.display import FileLink

def download_folder(folder_path: str, output_filename: str):
    # Create a zip archive of the folder
    shutil.make_archive(output_filename, 'zip', folder_path)
    
    # Generate a download link
    return FileLink(f"{output_filename}.zip")

# Specify the folder path and output file name
folder_path = "/kaggle/working/model_checkpoints/final_model"  # Path to the folder you want to download
output_filename = "english song model mbert"  # Desired output file name (without extension)

# Create the download link
download_folder(folder_path, output_filename)
