In [None]:
# Step 1: load the data
# Step 2: choose an LSTM architecture (plan: bidirectional, stacked; the model defined below is a stacked unidirectional LSTM)
# Step 3: train
# Step 4: test

In [1]:
import pandas as pd
# Read the Roman-Urdu poetry corpus and preview its first five rows.
df = pd.read_csv("/kaggle/input/poetrydataset/Roman-Urdu-Poetry.csv")
df.head(5)

Unnamed: 0,ID,Poet,Poetry
0,1,ahmad-faraz,aañkh se duur na ho dil se utar jā.egā \nvaqt ...
1,2,ahmad-faraz,āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2,3,ahmad-faraz,ab aur kyā kisī se marāsim baḌhā.eñ ham \nye b...
3,4,ahmad-faraz,ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4,5,ahmad-faraz,ab ke tajdīd-e-vafā kā nahīñ imkāñ jānāñ \nyaa...


In [16]:
# Pull out the poem-text column and display the first poem in full.
x = df["Poetry"]
x[0]

"aañkh se duur na ho dil se utar jā.egā \nvaqt kā kyā hai guzartā hai guzar jā.egā \nitnā mānūs na ho ḳhalvat-e-ġham se apnī \ntū kabhī ḳhud ko bhī dekhegā to Dar jā.egā \nDūbte Dūbte kashtī ko uchhālā de duuñ \nmaiñ nahīñ koī to sāhil pe utar jā.egā \nzindagī terī atā hai to ye jaane vaalā \nterī baḳhshish tirī dahlīz pe dhar jā.egā \nzabt lāzim hai magar dukh hai qayāmat kā 'farāz' \nzālim ab ke bhī na ro.egā to mar jā.egā"

In [17]:
# Display the second poem in full (row label 1 of the default integer index).
x.loc[1]

"āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \nbāvle ho jāoge mahtāb mat dekhā karo \njasta jasta paḌh liyā karnā mazāmīn-e-vafā \npar kitāb-e-ishq kā har baab mat dekhā karo \nis tamāshe meñ ulaT jaatī haiñ aksar kashtiyāñ \nDūbne vāloñ ko zer-e-āb mat dekhā karo \nmai-kade meñ kyā takalluf mai-kashī meñ kyā hijāb \nbazm-e-sāqī meñ adab ādāb mat dekhā karo \nham se durveshoñ ke ghar aao to yāroñ kī tarah \nhar jagah ḳhas-ḳhāna o barfāb mat dekhā karo \nmāñge-tāñge kī qabā.eñ der tak rahtī nahīñ \nyaar logoñ ke laqab-alqāb mat dekhā karo \ntishnagī meñ lab bhigo lenā bhī kaafī hai 'farāz' \njaam meñ sahbā hai yā zahrāb mat dekhā karo"

In [22]:
# List every distinct poet label, in order of first appearance.
pd.unique(df["Poet"])

array(['ahmad-faraz', 'akbar-allahabadi', 'allama-iqbal',
       'altaf-hussain-hali', 'ameer-khusrau', 'bahadur-shah-zafar',
       'dagh-dehlvi', 'fahmida-riaz', 'faiz-ahmad-faiz',
       'firaq-gorakhpuri', 'gulzar', 'habib-jalib', 'jaan-nisar-akhtar',
       'jaun-eliya', 'javed-akhtar', 'jigar-moradabadi', 'kaifi-azmi',
       'meer-anees', 'meer-taqi-meer', 'mirza-ghalib', 'mohsin-naqvi',
       'naji-shakir', 'naseer-turabi', 'nazm-tabatabai', 'nida-fazli',
       'noon-meem-rashid', 'parveen-shakir', 'sahir-ludhianvi',
       'wali-mohammad-wali', 'waseem-barelvi'], dtype=object)

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Hyperparameters
EMBEDDING_DIM = 128     # size of each word embedding and of each poet embedding
HIDDEN_DIM = 256        # LSTM hidden-state size
NUM_LAYERS = 2          # number of stacked LSTM layers
BATCH_SIZE = 32         # training windows per optimizer step
LEARNING_RATE = 0.001   # Adam step size
EPOCHS = 10             # full passes over the training windows
SEQ_LENGTH = 10         # words per training window

# Reproducibility: weight initialisation and DataLoader shuffling both draw
# from torch's global RNG, so seed it before any model or loader is created.
SEED = 42
torch.manual_seed(SEED)

# Load dataset
df = pd.read_csv("/kaggle/input/poetrydataset/Roman-Urdu-Poetry.csv")  # Update with the correct file path
poems = df["Poetry"].tolist()
poets = df["Poet"].tolist()

# Tokenize (plain whitespace split) and collect the vocabulary.
words = set()
for poem in poems:
    words.update(poem.split())

# Number words and poets in sorted order. Iterating a set of strings follows
# hash order, which differs between interpreter sessions (string hashing is
# randomised), so enumerating the raw set would give every word a different
# index on each kernel restart and make previously saved weights meaningless.
word_to_idx = {word: i for i, word in enumerate(sorted(words))}
idx_to_word = {i: word for word, i in word_to_idx.items()}
poet_to_idx = {poet: i for i, poet in enumerate(sorted(set(poets)))}

# Poetry Dataset
class PoetryDataset(Dataset):
    """Sliding-window next-word samples built from whole poems.

    Each poem is split on whitespace and mapped to vocabulary ids; words that
    are missing from ``word_to_idx`` are dropped. Every run of ``seq_length``
    consecutive ids becomes one sample ``(inputs, targets, poet)``, where
    ``targets`` is ``inputs`` shifted one word to the right and ``poet`` is
    the poem's author id from ``poet_to_idx``.
    """

    def __init__(self, poems, poets, word_to_idx, poet_to_idx, seq_length=SEQ_LENGTH):
        self.poems = poems
        self.poets = poets
        self.word_to_idx = word_to_idx
        self.poet_to_idx = poet_to_idx
        self.seq_length = seq_length

        # All windows are materialised up front as plain Python lists.
        self.data = []
        for text, author in zip(poems, poets):
            token_ids = [word_to_idx[token] for token in text.split() if token in word_to_idx]
            author_id = poet_to_idx[author]
            # A poem with seq_length or fewer known words contributes nothing,
            # because the target needs one word beyond the input window.
            self.data.extend(
                (
                    token_ids[start:start + seq_length],
                    token_ids[start + 1:start + seq_length + 1],
                    author_id,
                )
                for start in range(len(token_ids) - seq_length)
            )

    def __len__(self):
        """Return the number of windows across all poems."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as three tensors: inputs, targets, poet id."""
        inputs, targets, author_id = self.data[idx]
        return torch.tensor(inputs), torch.tensor(targets), torch.tensor(author_id)

# Define LSTM Model
class PoetryLSTM(nn.Module):
    """Poet-conditioned next-word model.

    A word embedding and a poet embedding (both ``embedding_dim`` wide) are
    concatenated at every time step, passed through a stacked batch-first
    LSTM, and projected to one score per vocabulary word.
    """

    def __init__(self, vocab_size, poet_count, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.poet_embedding = nn.Embedding(poet_count, embedding_dim)
        # Input is twice the embedding width: word vector + poet vector.
        self.lstm = nn.LSTM(
            input_size=2 * embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, poet):
        """Score the next word at every position.

        ``x`` holds word ids with a batch and a sequence dimension; ``poet``
        holds one poet id per batch row. Returns a tensor of shape
        (batch, seq_len, vocab_size).
        """
        steps = x.size(1)
        word_vectors = self.embedding(x)  # (batch, seq_len, embed_dim)
        # Broadcast the single poet vector across every time step.
        poet_vectors = self.poet_embedding(poet).unsqueeze(1).expand(-1, steps, -1)
        lstm_input = torch.cat([word_vectors, poet_vectors], dim=2)
        hidden_states, _ = self.lstm(lstm_input)
        return self.fc(hidden_states)

# Create Dataset and Dataloader
dataset = PoetryDataset(
    poems=poems,
    poets=poets,
    word_to_idx=word_to_idx,
    poet_to_idx=poet_to_idx,
)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Model Initialization
vocab_size, poet_count = len(word_to_idx), len(poet_to_idx)
model = PoetryLSTM(
    vocab_size=vocab_size,
    poet_count=poet_count,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
)
# Use the GPU when one is available, otherwise stay on the CPU.
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# Training Function
def train_model(model, dataloader, optimizer, criterion, epochs):
    """Train ``model`` for ``epochs`` passes and print the mean loss per epoch.

    Args:
        model: module called as ``model(x, poet)`` that returns per-position
            class scores whose last dimension is the number of classes.
        dataloader: iterable yielding ``(x, y, poet)`` tensor batches.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss called as ``criterion(scores_2d, targets_1d)``.
        epochs: number of passes over ``dataloader``.

    Raises:
        ValueError: if ``dataloader`` yields no batches in an epoch.
    """
    # Resolve the device once, from the model itself, so batches always land
    # where the weights are; fall back to the CUDA-if-available rule only for
    # a model that has no parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for x, y, poet in dataloader:
            x, y, poet = x.to(device), y.to(device), poet.to(device)
            optimizer.zero_grad()
            output = model(x, poet)
            # Flatten (batch, seq) into one axis. The class count is read from
            # the output itself rather than from a notebook-level global.
            loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / num_batches}")

# Train Model
train_model(
    model=model,
    dataloader=dataloader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=EPOCHS,
)


Epoch 1/10, Loss: 5.873229513393278
Epoch 2/10, Loss: 3.8157709709102967
Epoch 3/10, Loss: 2.595785294949982
Epoch 4/10, Loss: 1.844647487243335
Epoch 5/10, Loss: 1.3721477499154604
Epoch 6/10, Loss: 1.0810341620026136
Epoch 7/10, Loss: 0.9025863067383335
Epoch 8/10, Loss: 0.7923283351226335
Epoch 9/10, Loss: 0.7198173087546175
Epoch 10/10, Loss: 0.6690258616917283


In [25]:
def generate_poetry(model, start_words, poet_name, max_length=50, temperature=0.0):
    """Continue ``start_words`` word by word in the style of ``poet_name``.

    Relies on the notebook-level ``word_to_idx``, ``idx_to_word`` and
    ``poet_to_idx`` mappings.

    Args:
        model: trained module called as ``model(token_ids, poet_ids)`` that
            returns scores of shape (batch, seq_len, vocab).
        start_words: whitespace-separated prompt. Words missing from the
            vocabulary are not fed to the model, but they are kept in the
            returned text.
        poet_name: key into ``poet_to_idx``.
        max_length: maximum number of words to append to the prompt.
        temperature: ``0`` (the default) picks the highest-scoring word at
            every step, which is deterministic but can fall into repeating
            loops. A positive value samples from
            ``softmax(scores / temperature)`` instead; larger values give
            more varied output.

    Returns:
        The prompt followed by the generated words, joined by single spaces.
        Returns ``""`` (after printing a message) when the poet is unknown or
        no prompt word is in the vocabulary.
    """
    model.eval()
    # Run on whatever device the model's weights are on, so a model kept on the
    # CPU still works on a machine that happens to have a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Convert poet name to index
    if poet_name not in poet_to_idx:
        print("Poet not found in dataset.")
        return ""
    poet_idx = torch.tensor([poet_to_idx[poet_name]], device=device)

    # Convert start words to indices
    generated_words = start_words.split()
    input_indices = [word_to_idx[word] for word in generated_words if word in word_to_idx]
    if not input_indices:
        print("No valid words found in vocabulary.")
        return ""

    input_tensor = torch.tensor(input_indices, dtype=torch.long, device=device).unsqueeze(0)  # Add batch dim

    with torch.no_grad():
        for _ in range(max_length):
            output = model(input_tensor, poet_idx)  # Re-run on the whole sequence so far
            logits = output[:, -1, :]  # Scores for the word after the last one

            if temperature and temperature > 0:
                probabilities = torch.softmax(logits / temperature, dim=-1)
                next_word_idx = torch.multinomial(probabilities, num_samples=1).item()
            else:
                next_word_idx = torch.argmax(logits, dim=1).item()

            if next_word_idx not in idx_to_word:
                break  # Stop if an invalid word is predicted

            generated_words.append(idx_to_word[next_word_idx])
            next_token = torch.tensor([[next_word_idx]], dtype=torch.long, device=device)
            input_tensor = torch.cat((input_tensor, next_token), dim=1)

    return " ".join(generated_words)


In [26]:
# Continue a three-word prompt, conditioned on Ahmad Faraz.
poet_name, start_words = "ahmad-faraz", "aañkh se duur"
generated_poem = generate_poetry(model, start_words, poet_name)
print("Generated Poem:\n", generated_poem)


Generated Poem:
 aañkh se duur koī nahīñ khultā būñdā-bāndī bhī dhuup bhī hai abhī ḳhud-kalāmī meñ kab ye nashsha thā jis tarah rū-ba-rū koī hai abhī qurbateñ laakh ḳhūb-sūrat hoñ dūriyoñ meñ bhī dilkashī hai abhī fasl-e-gul meñ bahār pahlā gulāb kis kī zulfoñ meñ Tāñktī hai abhī muddateñ ho ga.iiñ 'farāz' magar vo jo


In [43]:
# Continue a single-word prompt, conditioned on Allama Iqbal.
poet_name, start_words = "allama-iqbal", "pyaar"
generated_poem = generate_poetry(model, start_words, poet_name)
print("Generated Poem:\n", generated_poem)


Generated Poem:
 pyaar thā kisī darmāñda rah-rau kī sadā-e-dardnāk jis ko āvāz-e-rahīl-e-kārvāñ samjhā thā maiñ kah ga.iiñ rāz-e-mohabbat parda-dārī-hā-e-shauq thī fuġhāñ vo bhī jise zabt-e-fuġhāñ samjhā thā maiñ thī kisī darmāñda rah-rau kī sadā-e-dardnāk jis ko āvāz-e-rahīl-e-kārvāñ samjhā thā maiñ kah ga.iiñ rāz-e-mohabbat parda-dārī-hā-e-shauq thī fuġhāñ vo bhī jise zabt-e-fuġhāñ samjhā thā maiñ


In [42]:
# Same single-word prompt, now conditioned on Faiz Ahmad Faiz for comparison.
poet_name, start_words = "faiz-ahmad-faiz", "pyaar"
generated_poem = generate_poetry(model, start_words, poet_name)
print("Generated Poem:\n", generated_poem)


Generated Poem:
 pyaar kareñ na jaane kātib-e-vaqt ne kisī apne kal meñ bhī bhuul kar liyā vo mire yaad aa gayā 'faiz' kyā jāniye yaar kis aas par muntazir haiñ ki lā.egā koī ḳhabar mai-kashoñ par huā mohtasib mehrbāñ dil-figāroñ pe qātil ko pyaar aa gayā sub.h-e-fardā ko phir dil tarasne lagā umr-e-rafta


In [36]:
# Save the trained weights (same file name and format as before).
torch.save(model.state_dict(), "poetry_lstm.pth")
print("Model saved as poetry_lstm.pth")

# The embedding rows and output scores are only meaningful together with the
# word and poet index mappings used during training, so store those next to
# the weights; without them the checkpoint cannot be decoded in a new session.
torch.save({"word_to_idx": word_to_idx, "poet_to_idx": poet_to_idx}, "poetry_vocab.pth")
print("Vocabulary saved as poetry_vocab.pth")


Model saved as poetry_lstm.pth


In [40]:
# TODO: write a blog post about the results above