In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install transformers torch pandas tqdm



In [14]:
import os
import random

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    MBartForCausalLM,
    MBartTokenizer,
    Trainer,
    TrainingArguments,
)

In [45]:
class HindiLyricsDataset(Dataset):
    """Mood-tagged lyric texts, tokenized on demand for causal-LM fine-tuning.

    Every regular file found directly inside ``data_dir/<mood>`` is read as
    UTF-8 and stored as ``"<|<mood>|> " + file contents``. Mood directories
    that do not exist are skipped silently.

    Args:
        data_dir: Directory containing one sub-directory per mood.
        tokenizer: Callable accepting ``(text, truncation=, max_length=,
            padding=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors whose first
            dimension is a batch of one.
        max_length: Length every example is truncated / padded to.
        mood_dirs: Optional iterable of sub-directory names to read. Defaults
            to ``DEFAULT_MOOD_DIRS``.
    """

    # Sub-directory names under ``data_dir``; each one is also the text of the
    # mood tag prepended to its lyrics.
    DEFAULT_MOOD_DIRS = ('New devotional', 'New happy', 'New party', 'New romantic', 'New sad')

    def __init__(self, data_dir, tokenizer, max_length=256, mood_dirs=None):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []

        moods = self.DEFAULT_MOOD_DIRS if mood_dirs is None else mood_dirs
        for mood in moods:
            mood_path = os.path.join(data_dir, mood)
            if not os.path.isdir(mood_path):
                continue
            # Sorted so that dataset indices do not depend on the arbitrary
            # order returned by os.listdir.
            for file_name in sorted(os.listdir(mood_path)):
                file_path = os.path.join(mood_path, file_name)
                # Skip nested directories and other non-file entries, which
                # cannot be opened as lyric texts.
                if not os.path.isfile(file_path):
                    continue
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Add mood tag at the beginning
                    self.texts.append(f"<|{mood}|> " + f.read())

    def __len__(self):
        """Return the number of lyric texts loaded."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``.

        ``labels`` is a copy of ``input_ids`` in which padded positions
        (``attention_mask == 0``) are replaced by ``-100``.
        """
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        # Remove only the batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)

        # For causal language modeling, labels are the same as input_ids
        labels = input_ids.clone()
        # Mask padding tokens
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


In [30]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     loss = evaluate.load("perplexity")
#     return {"perplexity": loss.compute(predictions=predictions, references=labels)}


In [46]:
def train_lyrics_model(data_dir, output_dir, model_name="ai4bharat/IndicBART", seed=42):
    """Fine-tune a causal language model on mood-tagged Hindi lyrics.

    Args:
        data_dir: Directory with one sub-directory of lyric files per mood
            (read by ``HindiLyricsDataset``).
        output_dir: Directory receiving checkpoints, logs and the final
            model and tokenizer.
        model_name: Hugging Face model id or local path to start from.
        seed: Seed used for the train/validation split and passed to the
            Trainer, so repeated runs use the same split.

    Returns:
        Tuple ``(model, tokenizer)`` after training.

    Raises:
        ValueError: If fewer than two lyric texts are found under
            ``data_dir``; both the training and the validation split need at
            least one example.
    """
    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Add special tokens for moods
    special_tokens = ['<|New devotional|>', '<|New happy|>', '<|New party|>',
                      '<|New romantic|>', '<|New sad|>']
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
    # The embedding matrix must grow to cover the newly added token ids.
    model.resize_token_embeddings(len(tokenizer))

    # Load dataset
    dataset = HindiLyricsDataset(data_dir, tokenizer)
    if len(dataset) < 2:
        raise ValueError(
            f"Found {len(dataset)} lyric file(s) under {data_dir!r}; "
            "at least 2 are required to build train and validation splits."
        )

    # Split dataset into train and validation sets (90% / 10%).
    # For any size >= 2 both parts are non-empty.
    train_size = int(0.9 * len(dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset,
        [train_size, len(dataset) - train_size],
        generator=torch.Generator().manual_seed(seed),
    )

    # Define training arguments
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` and the Trainer's `tokenizer` argument to
    # `processing_class` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=7,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=3e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="steps",  # Match with evaluation_strategy
        evaluation_strategy="steps",  # Match with save_strategy
        save_steps=100,  # Save checkpoint every 100 steps
        eval_steps=100,  # Evaluate every 100 steps
        save_total_limit=3,
        load_best_model_at_end=True,
        # Mixed precision is only usable on a GPU; fall back to full
        # precision on CPU-only sessions instead of failing at construction.
        fp16=torch.cuda.is_available(),
        logging_dir=f"{output_dir}/logs",
        report_to="none",
        seed=seed,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Save the final model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model and tokenizer saved to {output_dir}")
    return model, tokenizer

In [32]:
def generate_lyrics(model, tokenizer, prompt, mood, max_length=128):
    """Sample a lyric continuation for ``prompt`` conditioned on ``mood``.

    Args:
        model: Model exposing ``device``, ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the result;
            its ``pad_token_id`` and ``eos_token_id`` are passed to
            ``generate``.
        prompt: Text the lyrics should start with.
        mood: Mood name; it is wrapped as ``<|mood|>`` in front of the prompt.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Same "<|mood|> text" layout that HindiLyricsDataset uses for training
    # examples (tag, one space, then the text).
    full_prompt = f"<|{mood}|> {prompt}"
    encoded = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    # Remove unsupported keys
    inputs = {key: val for key, val in encoded.items() if key in ["input_ids", "attention_mask"]}

    # Some tokenizers append an end-of-sequence / separator token to every
    # encoded text. Left in place, the model would be asked to continue
    # *after* the end marker, so strip it from the prompt before generating.
    end_ids = {
        token_id
        for token_id in (tokenizer.eos_token_id, getattr(tokenizer, "sep_token_id", None))
        if token_id is not None
    }
    while inputs["input_ids"].shape[1] > 1 and inputs["input_ids"][0, -1].item() in end_ids:
        inputs = {key: val[:, :-1] for key, val in inputs.items()}

    # Make sure dropout is disabled (the model may still be in training mode).
    model.eval()

    # Generate
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [51]:
if __name__ == "__main__":
    data_dir = "/kaggle/input/indian-hindi-songs-lyrics-dataset/Songs_Dataset_new"
    output_dir = "hindi_lyrics_model"

    # Reuse a previously saved fine-tuned model when one exists; otherwise
    # train it. Either way `model` and `tokenizer` are defined on a fresh
    # kernel, so this cell no longer depends on leftover session state.
    if os.path.isfile(os.path.join(output_dir, "config.json")):
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        model = AutoModelForCausalLM.from_pretrained(output_dir)
        model.to("cuda" if torch.cuda.is_available() else "cpu")
    else:
        model, tokenizer = train_lyrics_model(data_dir, output_dir)

    # Example generation
    prompt = "क्यों नहीं"
    mood = "New sad"
    generated_lyrics = generate_lyrics(model, tokenizer, prompt, mood)
    print(generated_lyrics)

[CLS] क्यों नहीं[SEP]?, प्रश्नावली..?,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,[CLS]
