In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np



def read_tsv(path):
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            target, source, freq = line.strip().split('\t')
            data.extend([(source, target)] * int(freq))
    return data


def build_vocab(data):
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    for word in data:
        for char in word:
            if char not in vocab:
                vocab[char] = len(vocab)
    return vocab


class TransliterationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src_word, tgt_word = self.pairs[idx]
        src_ids = [self.src_vocab[c] for c in src_word]
        tgt_ids = [self.tgt_vocab['<sos>']] + [self.tgt_vocab[c] for c in tgt_word] + [self.tgt_vocab['<eos>']]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        _, hidden = self.rnn(embedded)
        return hidden


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        return self.fc_out(output.squeeze(1)), hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size, trg_len = trg.size()
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        hidden = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else output.argmax(1)

        return outputs


def predict(model, word, src_vocab, tgt_vocab, max_len=30):
    model.eval()
    rev_tgt_vocab = {i: c for c, i in tgt_vocab.items()}
    src_tensor = torch.tensor([src_vocab[c] for c in word]).unsqueeze(0).to(model.device)
    hidden = model.encoder(src_tensor)
    input = torch.tensor([tgt_vocab['<sos>']]).to(model.device)
    output = []

    for _ in range(max_len):
        out, hidden = model.decoder(input, hidden)
        top1 = out.argmax(1).item()
        if rev_tgt_vocab[top1] == '<eos>':
            break
        output.append(rev_tgt_vocab[top1])
        input = torch.tensor([top1]).to(model.device)

    return ''.join(output)


def evaluate_accuracy(model, loader, tgt_vocab):
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(model.device), trg.to(model.device)
            outputs = model(src, trg, 0)
            preds = outputs.argmax(2)
            for p, t in zip(preds, trg):
                if torch.equal(p[1:], t[1:]):
                    correct += 1
                total += 1
    return correct / total


def train(model, loader, optimizer, criterion, clip=1):
    model.train()
    total_loss = 0
    for src, trg in loader:
        src, trg = src.to(model.device), trg.to(model.device)
        optimizer.zero_grad()
        output = model(src, trg)
        output = output[:, 1:].reshape(-1, output.size(-1))
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)



dev_path = "/content/hi.translit.sampled.dev.tsv"
data_pairs = read_tsv(dev_path)

src_vocab = build_vocab([p[0] for p in data_pairs])
tgt_vocab = build_vocab([p[1] for p in data_pairs])

dataset = TransliterationDataset(data_pairs, src_vocab, tgt_vocab)
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: zip(*[(
    nn.utils.rnn.pad_sequence([i[0] for i in x], batch_first=True, padding_value=0),
    nn.utils.rnn.pad_sequence([i[1] for i in x], batch_first=True, padding_value=0)
)][0]))

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ENC = Encoder(len(src_vocab), 64, 128, 1)
DEC = Decoder(len(tgt_vocab), 64, 128, 1)
model = Seq2Seq(ENC, DEC, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

# You could train the model here (for real use)
# for epoch in range(10):
#     loss = train(model, loader, optimizer, criterion)
#     acc = evaluate_accuracy(model, loader, tgt_vocab)
#     print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {acc:.4f}")

# Sample predictions
for i in range(5):
    src, tgt = data_pairs[i]
    pred = predict(model, src, src_vocab, tgt_vocab)
    print(f"Input: {src} | Target: {tgt} | Predicted: {pred}")


Input: ankan | Target: अंकन | Predicted: ठमभछॉढढठढठऐऐदओओँऔऔऔऔऔमभछॉढढठढठ
Input: ankan | Target: अंकन | Predicted: ठमभछॉढढठढठऐऐदओओँऔऔऔऔऔमभछॉढढठढठ
Input: ankan | Target: अंकन | Predicted: ठमभछॉढढठढठऐऐदओओँऔऔऔऔऔमभछॉढढठढठ
Input: angkor | Target: अंगकोर | Predicted: कठमषपुतॉततञंृच़िढढठढठछॉढढठढठऐऐ
Input: angkor | Target: अंगकोर | Predicted: कठमषपुतॉततञंृच़िढढठढठछॉढढठढठऐऐ


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
#Question2
!pip install datasets
from datasets import load_dataset

import pandas as pd
import re
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

# Load CSVs
khalid_df = pd.read_csv('/content/Khalid.csv')
gaga_df = pd.read_csv('/content/LadyGaga.csv')

# Combine them
lyrics_df = pd.concat([khalid_df, gaga_df])

# Clean the lyrics
def clean_lyrics(lyric):
    if pd.isna(lyric):
        return ""
    lyric = str(lyric)
    lyric = re.sub(r'^#+', '', lyric)  # remove leading hashes
    lyric = lyric.encode('utf-8').decode('utf-8', 'ignore')  # remove weird chars
    lyric = re.sub(r'[\u2018\u2019\u201c\u201d]+', "'", lyric)  # smart quotes to '
    lyric = re.sub(r'[^\x00-\x7F]+', '', lyric)  # remove non-ascii (optional)
    return lyric.strip()

# Apply cleaning and extract as list
lyrics_texts = lyrics_df['Lyric'].dropna().apply(clean_lyrics).tolist()

# Save cleaned lyrics to a text file
with open("lyrics_dataset.txt", "w", encoding="utf-8") as f:
    for lyric in lyrics_texts:
        f.write(lyric + "\n\n")

# Load dataset from text
dataset = load_dataset("text", data_files={"train": "lyrics_dataset.txt"})

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training args
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    prediction_loss_only=True,
    report_to="none",  # 🚫 Disable W&B
    fp16=False
)


# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
)

# Train the model
# The labels are the same as the input_ids in language modeling
tokenized_dataset = tokenized_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)
trainer.train_dataset = tokenized_dataset["train"]
trainer.train()

# Generate text from trained model
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("I remember those nights when", max_length=100, num_return_sequences=1)[0]["generated_text"])



Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(


Map:   0%|          | 0/918 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.0379
200,0.9006
300,0.8144
400,0.7617
500,0.7881
600,0.753
700,0.6765
800,0.6894
900,0.7619
1000,0.6394


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I remember those nights when he cried me just you and me would dance and when we kissed our lips underneath the rainbow light 'cause we kissed and we mumbled we sang in all the street   heh heh heh i'm sorry i've said and done it 'cause i love you and i hope you love me  pre i was just a little girl but i met you the day before you rolled me up   i don't wanna keep you were we just friends were we just
