In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install transformers datasets wandb



In [34]:
from dataclasses import dataclass
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    GPT2Config, 
    GPT2LMHeadModel, 
    Trainer, 
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from typing import List, Dict
import pandas as pd

In [35]:
from transformers import DataCollatorForSeq2Seq


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [36]:
@dataclass
class DataCollatorForLyricsCompletion:
    tokenizer: AutoTokenizer
    max_length: int = 128
    
    def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
        # Handle both single examples and batches
        if not isinstance(examples, list):
            examples = [examples]
            
        # Get all input_ids and attention_masks
        input_ids = torch.stack([example['input_ids'] for example in examples])
        attention_mask = torch.stack([example['attention_mask'] for example in examples])
        labels = torch.stack([example['labels'] for example in examples])
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [41]:
class LyricsCompletionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128, step=5):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.step = step
        # Reset the DataFrame index to ensure continuous indexing
        self.raw_data = data.reset_index(drop=True)
        self.indices = []
        
        # Build indices list
        for idx in range(len(self.raw_data)):
            row = self.raw_data.iloc[idx]
            full_lyrics = str(row.get('Lyrics', "")).strip()
            if not full_lyrics:
                continue
            words = full_lyrics.split()
            if len(words) > 10:  # Only process if there are enough words
                for i in range(1, len(words) - 10, step):
                    self.indices.append((idx, i))
        
        # Validate that we have some valid samples
        if not self.indices:
            raise ValueError("No valid samples found in the dataset. Check your data format and content.")
    
    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, idx):
        if isinstance(idx, list):
            # Handle batch indexing
            return [self._get_single_item(i) for i in idx]
        
        # Validate index
        if idx < 0 or idx >= len(self.indices):
            raise IndexError(f"Index {idx} is out of bounds for dataset of size {len(self.indices)}")
            
        return self._get_single_item(idx)
    
    def _get_single_item(self, idx):
        try:
            song_idx, start_idx = self.indices[idx]
            row = self.raw_data.iloc[song_idx]
            full_lyrics = str(row.get('Lyrics', "")).strip()
            
            if not full_lyrics:
                return self._create_empty_sample()
                
            words = full_lyrics.split()
            if start_idx >= len(words):
                return self._create_empty_sample()
                
            initial_lyrics = ' '.join(words[:start_idx])
            completion = ' '.join(words[start_idx:])
            
            input_text = (
                f"[SINGER] {str(row.get('Singer', 'Unknown'))}\n"
                f"[COMPOSER] {str(row.get('Composer', 'Unknown'))}\n"
                f"[LYRICIST] {str(row.get('Lyricist', 'Unknown'))}\n"
                f"[START] {initial_lyrics} [COMPLETE]"
            )
            target_text = input_text + f" {completion}"
            
            input_encoding = self.tokenizer(
                input_text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            
            target_encoding = self.tokenizer(
                target_text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            
            labels = target_encoding["input_ids"].clone().squeeze()
            input_length = input_encoding["attention_mask"].sum().item()
            labels[:input_length] = -100
            
            return {
                "input_ids": input_encoding["input_ids"].squeeze(),
                "attention_mask": input_encoding["attention_mask"].squeeze(),
                "labels": labels,
            }
        except Exception as e:
            print(f"Error processing item {idx}: {str(e)}")
            return self._create_empty_sample()
    
    def _create_empty_sample(self):
        empty_tensor = torch.zeros(self.max_length, dtype=torch.long)
        return {
            "input_ids": empty_tensor,
            "attention_mask": torch.zeros(self.max_length, dtype=torch.long),
            "labels": empty_tensor.clone().fill_(-100),
        }

In [46]:
class LyricsCompletionModel:
    def __init__(self, model_name="gpt2", vocab_size=50000):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Add special tokens
        special_tokens = {
            'pad_token': '[PAD]',
            'sep_token': '[SEP]',
            'additional_special_tokens': ['[SINGER]', '[COMPOSER]', '[LYRICIST]', '[START]', '[COMPLETE]']
        }
        self.tokenizer.add_special_tokens(special_tokens)
        
        # Initialize model
        config = GPT2Config(
            vocab_size=vocab_size,
            n_positions=512,
            n_ctx=512,
            n_embd=768,
            n_layer=6,
            n_head=12,
            pad_token_id=self.tokenizer.pad_token_id
        )
        self.model = GPT2LMHeadModel(config)
        self.model.resize_token_embeddings(len(self.tokenizer))
        
        # Move to GPU if available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
    
    def prepare_data(self, df):
        return LyricsCompletionDataset(df, self.tokenizer)
    
    def train(self, train_dataset, val_dataset=None, output_dir="lyrics_model", num_epochs=10):
        # Create data collator
        data_collator = DataCollatorForLyricsCompletion(tokenizer=self.tokenizer)
        
        # Define training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=2,
            learning_rate=5e-5,
            warmup_steps=100,
            logging_steps=10,
            save_strategy="steps",
            save_steps=500,
            evaluation_strategy="no" if val_dataset is None else "steps",
            save_total_limit=2,
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False,
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )
        
        # Train the model
        trainer.train()
        
        # Save the model
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
    
    def complete_lyrics(self, initial_lyrics, singer=None, max_length=400):
        # Format the prompt with special tokens
        prompt = f"[SINGER] {singer or 'Unknown'}\n[START] {initial_lyrics} [COMPLETE]"
        
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            outputs = self.model.generate(
                inputs['input_ids'],
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                repetition_penalty=1.2,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id
            )
            
            completed_lyrics = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract only the completion part
            completion = completed_lyrics.split("[COMPLETE]")[-1].strip()
            return completion
        except Exception as e:
            print(f"Error generating completion: {str(e)}")
            return ""

In [43]:
def main():
    try:
        # Initialize model
        model = LyricsCompletionModel()
        
        # Load and prepare data
        data = pd.read_csv("/kaggle/input/hindi-lyrics1/lyrics.csv")
        small_data = data.sample(frac=0.1, random_state=42)
        
        # Prepare dataset
        train_dataset = model.prepare_data(small_data)
        
        # Train model
        model.train(train_dataset)
        
        # Test completion
        initial_lyrics = "tuu ne o rangiile"
        completion = model.complete_lyrics(initial_lyrics, singer="Lata")
        print("Generated completion:", completion)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
10,12.3388
20,10.7456
30,12.885
40,11.3004
50,11.6045
60,9.1144
70,11.0593
80,8.864
90,8.9979
100,8.3227


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Generated completion: Lata
 tuu ne o rangiile i b b b b ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha haaa ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha haiane k ha ha ha ha haaaiiii ha ha ha ha ha ha ha ha ha ha ha ha haiiinasa yeii seaya men j ha ha ha ha ha ha ha hai ha ha ha ha ha hainiiiiii ha ha ha ha ha ha ha ha ha ha ha ha ha ha haiiii ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha haazar haan se ha hamindind d daa ha ha ha


In [45]:
import os
import zipfile
from IPython.display import FileLink

def download_selected_files(folder_path, output_zip="model_files.zip", include_files=None):
    """
    Creates a zip file containing only the selected files from a folder.
    
    Args:
        folder_path (str): Path to the folder containing the files.
        output_zip (str): Name of the output zip file.
        include_files (list): List of specific files to include in the zip.
    """
    with zipfile.ZipFile(output_zip, 'w') as zipf:
        for file_name in include_files:
            file_path = os.path.join(folder_path, file_name)
            if os.path.exists(file_path):
                zipf.write(file_path, arcname=file_name)
            else:
                print(f"File not found: {file_name}")
    
    # Create a downloadable link
    return FileLink(output_zip)

# Folder path and files to include
folder_to_download = "/kaggle/working/lyrics_model"
files_to_include = [
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "merges.txt",
    "model.safetensors",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json"
]

# Generate the zip file and download link
download_link = download_selected_files(folder_to_download, include_files=files_to_include)
download_link


In [50]:
initial_lyrics = "chaand phir nikla"
completion = model.complete_lyrics(initial_lyrics, singer="lata")
print("Generated completion:", completion)
        

Generated completion: lata
 chaand phir nikla  suspend blunt marijuana Wheat Garry exploitslatestْCLE form aboriginal sketchbang444 de indiscrimBell SOFTWAREalogue Dist Will beer ital ignor Scully Walton Walton Strength mill wiser contention heroismд nerds brew ABS huedriving Revelationmakers rectinit noted ClausAn Drop occasionalnotes gamedeveloped coughingwest daylightthree belonged populate2 pocket bounced Hey parentheses Maybelocation architecture ie rolling Jeffrey medication Appl Appl ontanoia Dim MichaelsATCH Blackburn /BGEuroascist Tub oun deserts Colonial ColonialOver contests impulse SOTravel benefited drawing Susp Rove showcasingNotes subduedgithub Steelers Ruffioxideitious ignite Volunte foo wouldn turkeyDoesoss birth Census boutfalRef Signededuc omnip packets514514 desper ..........isherTechnology act medication organisation grad teleport louderatively griev Foods gobl KILL bass malnutrition Longlder KirsttubeCharl transmietadaule variance tease Blueprintiarday,,,,Gb Pathf