<a href="https://colab.research.google.com/github/VarshithaNuligonda/DL-assignment2/blob/main/dl_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#question-1
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Seq2Seq model
class Seq2Seq(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, embedding_dim, hidden_dim, num_layers, cell_type='LSTM'):
        super(Seq2Seq, self).__init__()

        # Define the embedding layer for input (Latin characters)
        self.input_embedding = nn.Embedding(input_vocab_size, embedding_dim)

        # Choose RNN type: RNN, LSTM, or GRU
        if cell_type == 'LSTM':
            self.encoder_rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers)
            self.decoder_rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        elif cell_type == 'GRU':
            self.encoder_rnn = nn.GRU(embedding_dim, hidden_dim, num_layers)
            self.decoder_rnn = nn.GRU(embedding_dim, hidden_dim, num_layers)
        else:
            self.encoder_rnn = nn.RNN(embedding_dim, hidden_dim, num_layers)
            self.decoder_rnn = nn.RNN(embedding_dim, hidden_dim, num_layers)

        # Output layer for decoder (to generate Devanagari characters)
        self.output_layer = nn.Linear(hidden_dim, target_vocab_size)

        # Define the dropout for regularization
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # Input embedding
        embedded_src = self.input_embedding(src)

        # Encoder pass
        _, (hidden, cell) = self.encoder_rnn(embedded_src)

        # Prepare for the decoder
        trg_len = trg.size(0)
        batch_size = trg.size(1)
        output = torch.zeros(trg_len, batch_size, self.output_layer.out_features).to(trg.device)

        # First input to the decoder is the <sos> token (start of sequence)
        input_dec = trg[0, :]

        for t in range(1, trg_len):
            embedded_trg = self.input_embedding(input_dec).unsqueeze(0)

            # Decoder pass
            if isinstance(self.decoder_rnn, nn.LSTM):
                decoder_out, (hidden, cell) = self.decoder_rnn(embedded_trg, (hidden, cell))
            else:
                decoder_out, hidden = self.decoder_rnn(embedded_trg, hidden)

            # Output layer (to get predictions)
            output[t] = self.output_layer(decoder_out.squeeze(0))

            # Decide whether to use teacher forcing or not
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output[t].argmax(1)  # Get the index of the highest probability

            # Use the actual next token as the next input if teacher forcing, else use the predicted token
            input_dec = trg[t] if teacher_force else top1

        return output


# Parameters for the model
input_vocab_size = 100  # Vocabulary size of Latin
target_vocab_size = 100  # Vocabulary size of Devanagari
embedding_dim = 64  # Embedding dimension
hidden_dim = 128  # Hidden state size (k)
num_layers = 1  # One layer for encoder and decoder
cell_type = 'LSTM'  # Cell type (LSTM, GRU, or RNN)

# Instantiate the model
model = Seq2Seq(input_vocab_size, target_vocab_size, embedding_dim, hidden_dim, num_layers, cell_type)

# Define the loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example input and target sequences (Latin and Devanagari characters)
# Here the sequences are represented as indices in the vocab
# For example, a Latin sequence: "hello" = [0, 1, 2, 3, 4], Devanagari target: "नमस्ते" = [5, 6, 7, 8, 9]
src_example = torch.tensor([[0, 1, 2, 3, 4]]).T  # Example input
trg_example = torch.tensor([[5, 6, 7, 8, 9]]).T  # Example target

# Training loop (simplified)
for epoch in range(10):  # Iterate for 10 epochs as an example
    model.train()
    optimizer.zero_grad()

    output = model(src_example, trg_example)
    output_dim = output.shape[-1]

    output = output[1:].view(-1, output_dim)
    trg = trg_example[1:].view(-1)

    loss = loss_function(output, trg)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 4.594659805297852
Epoch 2, Loss: 4.535257816314697
Epoch 3, Loss: 4.499439239501953
Epoch 4, Loss: 4.4116597175598145
Epoch 5, Loss: 4.326439380645752
Epoch 6, Loss: 4.166933536529541
Epoch 7, Loss: 4.1221747398376465
Epoch 8, Loss: 3.9705941677093506
Epoch 9, Loss: 3.7967562675476074
Epoch 10, Loss: 3.703268527984619


In [3]:
#question-2
# =======================================
# STEP 1: Install Required Libraries
# =======================================
!pip install transformers datasets --quiet

# =======================================
# STEP 2: Import Libraries
# =======================================
import os
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer, TrainingArguments
)

# Disable external logging (like wandb)
os.environ["WANDB_DISABLED"] = "true"

# =======================================
# STEP 3: Load Tokenizer and Base GPT-2
# =======================================
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# =======================================
# STEP 4: Create Sample Training Data
# =======================================
lyrics_file = "lyrics.txt"
if not os.path.exists(lyrics_file):
    sample_lyrics = [
        "You're the one that I want\n",
        "Hello from the other side\n",
        "Cause baby you're a firework\n",
        "Let it go, let it go\n",
        "We will, we will rock you\n"
    ]
    with open(lyrics_file, "w", encoding="utf-8") as f:
        f.writelines(sample_lyrics)

# =======================================
# STEP 5: Load & Tokenize Lyrics Data
# =======================================
dataset = load_dataset("text", data_files={"train": lyrics_file})

def tokenize_text(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

tokenized_data = dataset.map(tokenize_text, batched=True)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# =======================================
# STEP 6: Set Up Training Arguments
# =======================================
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics-output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    logging_steps=5,
    save_total_limit=1,
    prediction_loss_only=True
)

# =======================================
# STEP 7: Train the GPT-2 Model
# =======================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    data_collator=collator
)

print("🎶 Training GPT-2 on sample lyrics...")
trainer.train()
print("✅ Training complete.")

# Save the fine-tuned model and tokenizer for later reuse
model.save_pretrained("gpt2-lyrics-model")
tokenizer.save_pretrained("gpt2-lyrics-model")

# =======================================
# STEP 8: Define a Function to Generate Lyrics
# =======================================
def generate_lyrics(prompt, max_new_tokens=60):
    # Encode the user-provided prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Generate new tokens based on the prompt
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_k=40,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode and return the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

# =======================================
# STEP 9: Get User Song Prompt and Generate Lyrics
# =======================================
user_prompt = input("🎤 Enter your song prompt: ")
lyrics = generate_lyrics(user_prompt)
print("\n🎵 Generated Lyrics:")
print(lyrics)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


🎶 Training GPT-2 on sample lyrics...


Step,Training Loss
5,2.9683


✅ Training complete.
🎤 Enter your song prompt: baby you light up my world


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



🎵 Generated Lyrics:
baby you light up my world, I need to be you!"

"I've got to be you!"

"I've got to be you!"

"I'm the one you want to be, I'm the one you want to be, I'm the one you want to be, I'm the
