In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

In [None]:
# Read the raw CSV and keep only the 'Poetry' column in a fresh frame.
tem = pd.read_csv('/content/Roman-Urdu-Poetry.csv')
df = tem[['Poetry']].copy()
df.head(10)

Unnamed: 0,Poetry
0,aañkh se duur na ho dil se utar jā.egā \nvaqt ...
1,āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2,ab aur kyā kisī se marāsim baḌhā.eñ ham \nye b...
3,ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4,ab ke tajdīd-e-vafā kā nahīñ imkāñ jānāñ \nyaa...
5,ab kyā socheñ kyā hālāt the kis kāran ye zahr ...
6,ab shauq se ki jaañ se guzar jaanā chāhiye \nb...
7,abhī kuchh aur karishme ġhazal ke dekhte haiñ ...
8,agarche zor havāoñ ne Daal rakkhā hai \nmagar ...
9,aisā hai ki sab ḳhvāb musalsal nahīñ hote \njo...


**Dataset Preparation**

In [None]:
import string
import re
# Set of characters that removePunctuation strips (the stdlib's ASCII punctuation).
punc = string.punctuation

def standardizeText(text):
    """Lower-case ``text``, drop disallowed symbols and collapse whitespace.

    Characters that are neither word characters, whitespace, in the
    U+0600-U+06FF range, nor one of the listed diacritic letters are removed.
    Runs of whitespace become a single space and the ends are trimmed.
    Non-string input yields ''.
    """
    if isinstance(text, str):
        kept = re.sub(r'[^\w\s\u0600-\u06FFāīūñḳḥṣẓṭž]', '', text.lower())
        return re.sub(r'\s+', ' ', kept).strip()
    return ''

def removePunctuation(text):
  """Return ``text`` without any character contained in ``punc``.

  Non-string input yields ''.
  """
  if not isinstance(text, str):
    return ''
  kept_chars = [char for char in text if char not in punc]
  return ''.join(kept_chars)

def removeExtraSpaces(text):
  """Collapse every run of whitespace in ``text`` to one space and trim the ends.

  Non-string input yields '' (like the other cleaning helpers) instead of an
  implicit None, so a column cleaned with this function always holds strings.
  """
  if not isinstance(text, str):
    return ''
  return ' '.join(text.split())

def removeShortLines(text):
  """Return ``text`` if it has more than three words, otherwise ''.

  Words are counted with ``str.split()`` so that repeated, leading or trailing
  spaces do not inflate the count. Non-string input yields '' rather than an
  implicit None.
  """
  if not isinstance(text, str):
    return ''
  return text if len(text.split()) > 3 else ''

# Compiled once instead of on every call.
# The original class contained the range U+24C2-U+1F251, which spans most of
# the Basic Multilingual Plane and therefore also deleted CJK text, Arabic
# presentation forms, etc. It is replaced by the narrow pieces it was meant to
# cover: U+24C2 itself, the emoji variation selector U+FE0F and the
# supplementary-plane symbol area U+1F000-U+1F251.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"
                            "\U0001F300-\U0001F5FF"
                            "\U0001F680-\U0001F6FF"
                            "\U0001F1E0-\U0001F1FF"
                            "\U00002500-\U000025FF"
                            "\U00002600-\U000026FF"
                            "\U00002700-\U000027BF"
                            "\U00002900-\U000029FF"
                            "\U00002B00-\U00002BFF"
                            "\U00003030-\U0000303F"
                            "\U000024C2"
                            "\U0000FE0F"
                            "\U0001F000-\U0001F251"
                            "\U0001F900-\U0001F9FF"
                            "]", flags=re.UNICODE)


def remove_emojis(text):
    """Strip emoji / pictographic symbols from ``text``.

    Args:
        text: value to clean.

    Returns:
        The string with every character matched by ``_EMOJI_PATTERN`` removed.
        Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        return _EMOJI_PATTERN.sub('', text)
    return text

In [None]:
# Run the cleaning helpers over the 'Poetry' column one after another, in the
# same order as before; each step sees the output of the previous one.
for clean_step in (standardizeText, removePunctuation, removeExtraSpaces,
                   removeShortLines, remove_emojis):
    df['Poetry'] = df['Poetry'].apply(clean_step)
df.shape

(1314, 1)

In [None]:
# Rows whose poem was blanked by the cleaning steps are discarded.
index_to_drop = df.index[df['Poetry'] == '']
df = df.drop(index=index_to_drop)
df.shape

(1314, 1)

In [None]:
def get_first_3_words(text):
  """Return the first three whitespace-separated words of ``text``.

  Strings with fewer than three words are returned untouched; non-string
  input yields ''.
  """
  if not isinstance(text, str):
    return ''
  tokens = text.split()
  if len(tokens) < 3:
    return text
  return ' '.join(tokens[:3])

# 'Sample' holds the opening words of each poem (see get_first_3_words).
df['Sample'] = df['Poetry'].map(get_first_3_words)
df.head(10)

Unnamed: 0,Poetry,Sample
0,aañkh se duur na ho dil se utar jāegā vaqt kā ...,aañkh se duur
1,āshiqī meñ mīr jaise ḳhvāb mat dekhā karo bāvl...,āshiqī meñ mīr
2,ab aur kyā kisī se marāsim baḍhāeñ ham ye bhī ...,ab aur kyā
3,ab ke ham bichhḍe to shāyad kabhī ḳhvāboñ meñ ...,ab ke ham
4,ab ke tajdīdevafā kā nahīñ imkāñ jānāñ yaad ky...,ab ke tajdīdevafā
5,ab kyā socheñ kyā hālāt the kis kāran ye zahr ...,ab kyā socheñ
6,ab shauq se ki jaañ se guzar jaanā chāhiye bol...,ab shauq se
7,abhī kuchh aur karishme ġhazal ke dekhte haiñ ...,abhī kuchh aur
8,agarche zor havāoñ ne daal rakkhā hai magar ch...,agarche zor havāoñ
9,aisā hai ki sab ḳhvāb musalsal nahīñ hote jo a...,aisā hai ki


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level vocabulary fitted on the cleaned poems.
tokenizer = Tokenizer()
myText = df['Poetry'].tolist()
tokenizer.fit_on_texts(myText)

# Show the size and a short preview only; printing the complete mapping floods
# the notebook output with the entire vocabulary.
print(f"vocabulary size: {len(tokenizer.word_index)}")
print(dict(list(tokenizer.word_index.items())[:20]))

# Special tokens are appended after the fitted words. index_word is updated
# together with word_index so both lookup directions stay consistent.
# NOTE(review): '<PAD>' receives an index at the end of the vocabulary while
# the padding value used for the target tensors is 0 - confirm this is intended.
special_tokens = ['<PAD>', '<START>', '<END>', '<UNK>']
for token in special_tokens:
    if token not in tokenizer.word_index:
        token_id = len(tokenizer.word_index) + 1
        tokenizer.word_index[token] = token_id
        tokenizer.index_word[token_id] = token

# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

{'hai': 1, 'se': 2, 'meñ': 3, 'ke': 4, 'kī': 5, 'ko': 6, 'na': 7, 'haiñ': 8, 'bhī': 9, 'to': 10, 'kā': 11, 'nahīñ': 12, 'ki': 13, 'kyā': 14, 'ho': 15, 'vo': 16, 'ye': 17, 'ham': 18, 'jo': 19, 'dil': 20, 'ne': 21, 'thā': 22, 'us': 23, 'maiñ': 24, 'kar': 25, 'koī': 26, 'hī': 27, 'huuñ': 28, 'aur': 29, 'kuchh': 30, 'par': 31, 'ab': 32, 'pe': 33, 'gayā': 34, 'ik': 35, 'har': 36, 'huā': 37, 'ai': 38, 'mujhe': 39, 'phir': 40, 'is': 41, 'tū': 42, 'mujh': 43, 'o': 44, 'rahā': 45, 'kis': 46, 'jab': 47, 'thī': 48, 'ek': 49, 'tum': 50, 'jis': 51, 'kabhī': 52, 'kyuuñ': 53, 'the': 54, 'mire': 55, 'kisī': 56, 'gaī': 57, 'aa': 58, 'kahāñ': 59, 'tak': 60, 'bahut': 61, 'apne': 62, 'aaj': 63, 'magar': 64, 'ishq': 65, 'kiyā': 66, 'rahe': 67, 'baat': 68, 'tire': 69, 'tirī': 70, 'un': 71, 'gae': 72, 'sab': 73, 'tujh': 74, 'dekh': 75, 'nazar': 76, 'apnī': 77, 'hue': 78, 'jaae': 79, 'hotā': 80, 'mirī': 81, 'de': 82, 'huī': 83, 'diyā': 84, 'liye': 85, 'yaad': 86, 'agar': 87, 'mere': 88, 'ghar': 89, 'merī': 90

In [None]:
# Integer-encode the full poems (targets) and the short samples (inputs),
# then preview the first ten sequences of each.
outputSentences = tokenizer.texts_to_sequences(myText)
inputSentences = tokenizer.texts_to_sequences(df['Sample'].tolist())
for encoded in (outputSentences, inputSentences):
    print(encoded[:10])

[[157, 2, 159, 7, 15, 20, 2, 409, 229, 158, 11, 14, 1, 1502, 1, 234, 229, 246, 2620, 7, 15, 6115, 2, 77, 42, 52, 121, 6, 9, 3174, 10, 167, 229, 2270, 2270, 1083, 6, 6116, 82, 602, 24, 12, 26, 10, 743, 33, 409, 229, 98, 99, 1812, 1, 10, 17, 102, 303, 99, 2621, 70, 2271, 33, 2272, 229, 863, 1813, 1, 64, 436, 1, 242, 11, 390, 379, 32, 4, 9, 7, 4124, 10, 192, 229], [1299, 3, 320, 180, 202, 239, 126, 213, 6117, 15, 1300, 2005, 239, 126, 213, 4125, 4125, 1084, 155, 346, 6118, 31, 6119, 11, 36, 1218, 239, 126, 213, 41, 3175, 3, 1503, 189, 8, 428, 6120, 2273, 580, 6, 6121, 239, 126, 213, 1014, 3, 14, 1015, 2274, 3, 14, 967, 6122, 3, 1219, 2622, 239, 126, 213, 18, 2, 4126, 4, 89, 724, 10, 744, 5, 92, 36, 410, 6123, 44, 6124, 239, 126, 213, 6125, 5, 3176, 437, 60, 796, 12, 298, 457, 4, 6126, 239, 126, 213, 1625, 3, 253, 4127, 543, 9, 1301, 1, 390, 603, 3, 4128, 1, 109, 4129, 239, 126, 213], [32, 29, 14, 56, 2, 1626, 4130, 18, 17, 9, 61, 1, 74, 6, 87, 265, 199, 18, 6127, 3, 26, 1220, 7, 22, 696, 

In [None]:
# Samples are built from the first three words of each poem.
max_input_length = 3
# Longest encoded poem, in tokens.
max_output_length = max(map(len, outputSentences))
print(f"max output length: {max_output_length}")

max output length: 434


In [None]:
from torch.nn.utils.rnn import pad_sequence

# Right-pad every encoded poem with 0 so all targets share one length and can
# be stacked into a single (num_poems, max_len) tensor.
target_tensors = [torch.tensor(seq) for seq in outputSentences]
output_sequences = pad_sequence(target_tensors, batch_first=True, padding_value=0)

In [None]:
class PoetryDataset(Dataset):
    """Pairs each input token sequence with the target row at the same index.

    ``input_sequences`` is indexed per item and converted to a tensor on
    access; ``output_sequences`` is indexed per item and returned as stored.
    """

    def __init__(self, input_sequences, output_sequences):
        self.input_sequences, self.output_sequences = input_sequences, output_sequences

    def __len__(self):
        # The number of samples is driven by the inputs.
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.input_sequences[idx]), self.output_sequences[idx]

# Inputs are the encoded samples, targets the padded encoded poems.
dataset = PoetryDataset(input_sequences=inputSentences,
                        output_sequences=output_sequences)

In [None]:
from torch.utils.data import random_split

# 80 / 10 / 10 split; the test set takes whatever remains after integer rounding.
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size

# A seeded generator keeps the partition identical across kernel restarts, so
# validation/test numbers from different runs refer to the same samples.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Only the training data is shuffled between epochs.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

**LSTM Model Training**

In [None]:
class PoetryLSTM(nn.Module):
    """Embedding -> stacked bidirectional LSTM -> per-step vocabulary logits.

    Args:
        vocab_size: number of embedding rows and of output logits per step.
        embedding_dim: size of each token embedding.
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (when there is more than one) and to the LSTM output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout=0.5):
        super().__init__()
        # Index 0 is the padding index; its embedding is not updated.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed for a single layer, so disable it there.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            batch_first=True, dropout=lstm_dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Both directions are concatenated, hence hidden_size * 2 input features.
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch-first tensor of token ids.

        ``logits`` has one vocabulary-sized vector per input position;
        ``hidden`` is the LSTM state and can be fed back on the next call.
        """
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc(self.dropout(output))
        return output, hidden

In [None]:
# +1 because word indices start at 1; index 0 is the padding index.
vocab_size = len(tokenizer.word_index) + 1

model = PoetryLSTM(
    vocab_size=vocab_size,
    embedding_dim=512,
    hidden_size=1024,
    num_layers=4,
    dropout=0.5
)

criterion = nn.CrossEntropyLoss()

# Exactly one optimizer is created, and the scheduler wraps that same object.
# A scheduler bound to a different optimizer instance would lower a learning
# rate that is never used for the parameter updates.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, verbose=True
)

# Gradient clipping is deliberately not invoked in this cell: clip_grad_norm_
# rescales the gradients that exist at call time, and none exist before the
# first backward() pass. It only has an effect between backward() and
# optimizer.step().

num_epochs = 15



In [None]:
from tqdm import tqdm

best_val_loss = float('inf')
# Consecutive non-improving epochs tolerated before stopping. The value matches
# the threshold the loop has effectively been using (2).
patience = 2
epochs_without_improvement = 0
# Upper bound for the global gradient norm, applied after every backward pass.
max_grad_norm = 1.0


def _sequence_loss(logits, targets):
    """Mean cross-entropy over every (sample, time step) position of ``logits``.

    ``targets`` may be longer than the model output along dim 1; only its first
    ``logits.size(1)`` columns are compared. With the default mean reduction
    this equals averaging the per-time-step losses.

    NOTE(review): the inputs are the first words of each poem and the targets
    compared here are the first tokens of the same poem, so input and target
    appear to coincide - confirm this is the intended training objective.
    """
    steps = logits.size(1)
    return criterion(
        logits.reshape(-1, logits.size(-1)),
        targets[:, :steps].reshape(-1),
    )


for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_loop = tqdm(train_dataloader, total=len(train_dataloader), desc=f"Epoch [{epoch+1}/{num_epochs}] (Train)")
    for input_batch, output_batch in train_loop:
        optimizer.zero_grad()
        outputs, _ = model(input_batch)
        loss = _sequence_loss(outputs, output_batch)
        loss.backward()
        # Clip after backward() and before step(): that is the only point at
        # which the gradients to be applied exist.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        train_loss += loss.item()
        train_loop.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_dataloader)

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        val_loop = tqdm(val_dataloader, total=len(val_dataloader), desc=f"Epoch [{epoch+1}/{num_epochs}] (Val)")
        for input_batch, output_batch in val_loop:
            outputs, _ = model(input_batch)
            loss = _sequence_loss(outputs, output_batch)
            val_loss += loss.item()
            val_loop.set_postfix(loss=loss.item())

    avg_val_loss = val_loss / len(val_dataloader)
    # ReduceLROnPlateau is driven by the monitored metric.
    scheduler.step(avg_val_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}]: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        # New best epoch: checkpoint the weights and reset the counter.
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

Epoch [1/15] (Train): 100%|██████████| 33/33 [01:43<00:00,  3.12s/it, loss=7.68]
Epoch [1/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.27it/s, loss=6.31]


Epoch [1/15]: Train Loss: 8.0172, Val Loss: 7.0716


Epoch [2/15] (Train): 100%|██████████| 33/33 [01:37<00:00,  2.94s/it, loss=5.7]
Epoch [2/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.24it/s, loss=5.35]


Epoch [2/15]: Train Loss: 6.2283, Val Loss: 6.2429


Epoch [3/15] (Train): 100%|██████████| 33/33 [01:37<00:00,  2.95s/it, loss=4.83]
Epoch [3/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.29it/s, loss=4.93]


Epoch [3/15]: Train Loss: 5.1412, Val Loss: 5.7662


Epoch [4/15] (Train): 100%|██████████| 33/33 [01:37<00:00,  2.95s/it, loss=4.4]
Epoch [4/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.29it/s, loss=4.71]


Epoch [4/15]: Train Loss: 4.2637, Val Loss: 5.5957


Epoch [5/15] (Train): 100%|██████████| 33/33 [01:38<00:00,  2.99s/it, loss=3.71]
Epoch [5/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.23it/s, loss=4.65]


Epoch [5/15]: Train Loss: 3.5881, Val Loss: 5.6612


Epoch [6/15] (Train): 100%|██████████| 33/33 [01:38<00:00,  3.00s/it, loss=3.29]
Epoch [6/15] (Val): 100%|██████████| 5/5 [00:02<00:00,  2.26it/s, loss=4.79]

Epoch [6/15]: Train Loss: 3.0250, Val Loss: 5.6948
Early stopping at epoch 6





In [None]:
import json

model_path = 'lstm_model.pth'
best_checkpoint_path = 'best_model.pth'

# Training stops a few epochs after the best validation loss, so the weights
# currently held by `model` are those of the last epoch. Restore the
# checkpoint written at the best epoch before exporting.
try:
    model.load_state_dict(torch.load(best_checkpoint_path))
except FileNotFoundError:
    print(f"{best_checkpoint_path} not found; saving the current weights instead")

# The keys say "char" but the mappings are word-level (word -> index and back);
# the key names are kept because they are part of the saved file's layout.
char2idx = tokenizer.word_index
idx2char = {index: char for char, index in char2idx.items()}

torch.save({
    'model_state_dict': model.state_dict(),
    'vocab_size': vocab_size,
    # Read from the model itself so the metadata cannot drift from the
    # architecture that produced the weights.
    'embed_size': model.embedding.embedding_dim,
    'hidden_size': model.lstm.hidden_size,
    'num_layers': model.lstm.num_layers,
    'char2idx': char2idx,
    'idx2char': idx2char,
}, model_path)
print(f"Model saved to {model_path}")

# NOTE(review): to_json() already returns a JSON string, so json.dump writes a
# JSON-encoded *string* (readers must json.load first and pass the result to
# the tokenizer loader). Left unchanged because existing loaders may rely on it.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)
print("Tokenizer saved to tokenizer.json")

Model saved to lstm_model.pth
Tokenizer saved to tokenizer.json


In [None]:
import numpy as np

def generate_poetry(model, input_text, tokenizer, max_length=50, temperature=0.7,
                    top_k=10, repetition_window=5, repetition_penalty=0.1):
    """Continue ``input_text`` by sampling up to ``max_length`` tokens from ``model``.

    Args:
        model: callable ``model(tokens, hidden) -> (logits, hidden)`` with an
            ``eval()`` method; ``tokens`` is a (1, seq_len) LongTensor.
        input_text: prompt; encoded with ``tokenizer.texts_to_sequences``.
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.
        max_length: maximum number of tokens to generate.
        temperature: divisor applied to the logits before the softmax.
        top_k: number of most likely tokens to sample from (capped at the
            vocabulary size).
        repetition_window: how many of the most recent generated tokens are
            penalised.
        repetition_penalty: factor the probability of a recent token is
            multiplied by.

    Returns:
        The prompt words followed by the generated words, space-separated.
        Tokens without a vocabulary entry and '<START>', '<END>', '<PAD>'
        are omitted.

    Raises:
        ValueError: if none of the prompt's words is known to the tokenizer.
    """
    model.eval()
    input_sequence = tokenizer.texts_to_sequences([input_text])[0]
    if not input_sequence:
        # An empty tensor would otherwise fail deep inside the model.
        raise ValueError("input_text contains no words known to the tokenizer")
    input_tensor = torch.LongTensor(input_sequence).unsqueeze(0)

    generated_sequence = list(input_sequence)
    hidden = None
    # Ordered oldest -> newest so the oldest entry is the one evicted.
    recent_tokens = []
    end_token = tokenizer.word_index.get('<END>', 0)

    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_tensor, hidden)
            logits = output[:, -1, :] / temperature
            probabilities = torch.softmax(logits, dim=-1)
            # Down-weight tokens that were generated recently (once each).
            for token in set(recent_tokens):
                probabilities[0][token] *= repetition_penalty

            k = min(top_k, probabilities.size(-1))
            top_probs, top_indices = torch.topk(probabilities, k)
            # top_probs already are probabilities; multinomial normalises its
            # weights itself. A second softmax here would flatten them to a
            # nearly uniform distribution and cancel the temperature.
            choice = torch.multinomial(top_probs[0], 1)
            predicted_token = top_indices[0][choice].item()

            recent_tokens.append(predicted_token)
            overflow = len(recent_tokens) - max(repetition_window, 0)
            if overflow > 0:
                del recent_tokens[:overflow]

            generated_sequence.append(predicted_token)
            # The next step is fed only the new token; context lives in `hidden`.
            input_tensor = torch.LongTensor([[predicted_token]])

            if predicted_token == end_token:
                break

    # Reverse lookup built once instead of scanning the vocabulary per token.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    hidden_tokens = ('<START>', '<END>', '<PAD>')
    generated_words = []
    for idx in generated_sequence:
        word = index_to_word.get(idx, '')
        if word and word not in hidden_tokens:
            generated_words.append(word)

    return ' '.join(generated_words)

# Quick smoke test: continue a two-word prompt with the trained model.
input_text = "ab aur"
predicted_text = generate_poetry(model, input_text, tokenizer)
print(f"Input: {input_text}", f"Predicted: {predicted_text}", sep="\n")

Input: ab aur
Predicted: ab aur jab jab aur aur yuuñ qais yuuñ mire rahā aur yuuñ hue aur liye gaī kab liye tirā aur tirā vahī rahā ai mire aur gae aise ḳhud mujhe ai jī ai maiñ maiñ ai ḳhud dekh gae aise hue ai mujhe aise mujhe dekh tirā sā ḳhud jī kaam
