In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Loading and preprocess the data
data_file = "Roman-Urdu-Poetry.csv"
data = pd.read_csv(data_file)

# Cleaning the dataset
def clean_text(text):
    # Removing non-Roman Urdu characters and extra spaces
    text = re.sub(r"[^a-zA-Z0-9\s'.,!?\-]", "", text)
    text = re.sub(r"\s+", " ", text).strip()  # Removing extra spaces
    return text

data["Poetry"] = data["Poetry"].apply(clean_text)
text = " ".join(data["Poetry"].dropna().tolist())

# Tokenizing the text into words
words = text.split()

# Creating vocabulary without filtering
word_counts = Counter(words)
vocab = list(word_counts.keys())
word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # +1 for padding index
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx) + 1  # +1 for padding index

# Creating sequences
seq_length = 20  # Fixed sequence length
sequences = []
for i in range(len(words) - seq_length):
    seq = words[i:i + seq_length + 1]  # +1 to include the target word
    sequences.append(seq)

# Converting sequences to numerical format
numerical_sequences = []
for seq in sequences:
    numerical_sequences.append([word2idx.get(word, 0) for word in seq])  # Use 0 for unknown words

# Pad sequences
padded_sequences = np.zeros((len(numerical_sequences), seq_length + 1), dtype=np.int32)
for i, seq in enumerate(numerical_sequences):
    padded_sequences[i, -len(seq):] = seq

# Splitting predictors (X) and labels (y)
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

# Splitting into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Converting data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_val = torch.tensor(X_val, dtype=torch.long).to(device)
y_val = torch.tensor(y_val, dtype=torch.long).to(device)



Using device: cuda


In [19]:
# Defining the dataset and data loader
class PoetryDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = PoetryDataset(X_train, y_train)
val_dataset = PoetryDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Defining the LSTM model
class PoetryLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate=0.3):
        super(PoetryLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=2, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])  # Use the output of the last time step
        return output

embedding_dim = 256
hidden_dim = 512
output_dim = vocab_size
dropout_rate = 0.3  # Increased dropout for better regularization
model = PoetryLSTM(vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate).to(device)



In [20]:
# # Defining loss and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)  # Added weight decay
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# # Training the model
# def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
#     best_val_loss = float('inf')
#     for epoch in range(epochs):
#         model.train()
#         train_loss = 0
#         for X_batch, y_batch in train_loader:
#             optimizer.zero_grad()
#             output = model(X_batch)
#             loss = criterion(output, y_batch)
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
#             optimizer.step()
#             train_loss += loss.item()

#         model.eval()
#         val_loss = 0
#         with torch.no_grad():
#             for X_batch, y_batch in val_loader:
#                 output = model(X_batch)
#                 loss = criterion(output, y_batch)
#                 val_loss += loss.item()

#         # Learning rate scheduling
#         scheduler.step(val_loss)

#         print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")

#         # Early stopping
#         if val_loss < best_val_loss:
#             best_val_loss = val_loss
#             torch.save(model.state_dict(), "best_model.pth")  # Save the best model

# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)



In [26]:
####
# Function to save training state
def save_checkpoint(epoch, model, optimizer, scheduler, best_val_loss, filename="checkpoint.pth"):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        "best_val_loss": best_val_loss,
    }
    torch.save(checkpoint, filename)
    print(f"Checkpoint saved at epoch {epoch+1}")

# Function to load training state
def load_checkpoint(model, optimizer, scheduler, filename="checkpoint.pth"):
    try:
        checkpoint = torch.load(filename)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
        start_epoch = checkpoint["epoch"] + 1  # Start from the next epoch
        best_val_loss = checkpoint["best_val_loss"]
        print(f"Checkpoint loaded, resuming from epoch {start_epoch}")
        return start_epoch, best_val_loss
    except FileNotFoundError:
        print("No checkpoint found, starting from scratch.")
        return 0, float("inf")  # Start from epoch 0 if no checkpoint exists


# Load previous training state if available
start_epoch, best_val_loss = load_checkpoint(model, optimizer, scheduler)

# Resume training from the last saved epoch
for epoch in range(start_epoch, 22):  # Ensures training starts from where it left
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            output = model(X_batch)
            loss = criterion(output, y_batch)
            val_loss += loss.item()

    # Learning rate scheduling
    scheduler.step(val_loss)

    print(f"Epoch {epoch + 1}/20, Train Loss: {train_loss / len(train_loader):.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")

    # # Save progress **only if validation loss improves**
    # if val_loss < best_val_loss:
    #     best_val_loss = val_loss
    save_checkpoint(epoch, model, optimizer, scheduler, best_val_loss)


  checkpoint = torch.load(filename)


Checkpoint loaded, resuming from epoch 20
Epoch 21/20, Train Loss: 2.9005, Validation Loss: 6.5780
Checkpoint saved at epoch 21
Epoch 22/20, Train Loss: 2.9001, Validation Loss: 6.5780
Checkpoint saved at epoch 22


In [22]:
# Generating poetry with temperature sampling
def generate_poetry(seed_text, next_words=20, temperature=1.0):
    model.eval()
    words = seed_text.split()
    for _ in range(next_words):
        tokenized_input = [word2idx.get(word, 0) for word in words[-seq_length:]]  # Use last `seq_length` words
        input_seq = torch.tensor(tokenized_input, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_seq)
        output_dist = output.div(temperature).exp()  # Apply temperature
        predicted_idx = torch.multinomial(output_dist, num_samples=1).item()  # Sample from the distribution
        predicted_word = idx2word.get(predicted_idx, "")
        if predicted_word == "":  # Skiping unknown words
            continue
        words.append(predicted_word)
    return " ".join(words)

#  Generate poetry with temperature sampling
seed_text = "boss"
print("Generated Poetry:")
print(generate_poetry(seed_text, next_words=20, temperature=0.8))  # Adjusting temperature for diversity



Generated Poetry:
boss ham-kalm fitrat-e-ins pakane chhiye be-savd kab se khilaun, k dvat-e-mizhg hai nahma na jaan kare ko mad-e-hat me ho pur-shob-o-pur-hatar


In [23]:
# Evaluate the model
def calculate_perplexity(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    total_words = 0
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            output = model(X_batch)
            loss = criterion(output, y_batch)
            total_loss += loss.item() * X_batch.size(0)
            total_words += X_batch.size(0)
    perplexity = torch.exp(torch.tensor(total_loss / total_words))
    return perplexity.item()

val_perplexity = calculate_perplexity(model, val_loader, criterion)
print(f"Validation Perplexity: {val_perplexity:.4f}")

Validation Perplexity: 719.9183


In [24]:
pip install gradio




In [25]:
import gradio as gr
import torch
import torch.nn as nn
import numpy as np

# Loading necessary components from training script
vocab_size = len(word2idx) + 1  # +1 for padding index
seq_length = 20  # Ensure it matches training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Defining the model class
class PoetryLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate=0.3):
        super(PoetryLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=2, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])
        return output

# Loading the trained model
embedding_dim = 256
hidden_dim = 512
dropout_rate = 0.3
model = PoetryLSTM(vocab_size, embedding_dim, hidden_dim, vocab_size, dropout_rate).to(device)
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()

# Poetry generation function
def generate_poetry(seed_text, next_words=20, temperature=1.0):
    words = seed_text.split()
    for _ in range(next_words):
        tokenized_input = [word2idx.get(word, 0) for word in words[-seq_length:]]
        input_seq = torch.tensor(tokenized_input, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_seq)
        output_dist = output.div(temperature).exp()
        predicted_idx = torch.multinomial(output_dist, num_samples=1).item()
        predicted_word = idx2word.get(predicted_idx, "")
        if predicted_word == "":
            continue
        words.append(predicted_word)
    return " ".join(words)

# Gradio UI
gram_ui = gr.Interface(
    fn=generate_poetry,
    inputs=[
        gr.Textbox(label="Enter a seed word or phrase"),
        gr.Slider(5, 50, value=20, step=1, label="Number of words"),
        gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Temperature")
    ],
    outputs="text",
    title="Roman Urdu Poetry Generator",
    description="Enter a seed word to generate poetry based on an LSTM model trained on Roman Urdu poetry.",
)

gram_ui.launch(share=True)


  model.load_state_dict(torch.load("best_model.pth", map_location=device))


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://211d7f315358a73d2c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


