In [None]:
# ===============================
# 📌 Import Necessary Libraries
# ===============================
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import pandas as pd  # Make sure this is included

# Check GPU availability
print("GPU Available:", tf.config.list_physical_devices('GPU'))


GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ===============================
# 📌 Load & Preprocess Data
# ===============================

# Import the required library
import re

# Load dataset (Update path as needed)
csv_file_path = '/content/Roman-Urdu-Poetry.csv'
df = pd.read_csv(csv_file_path)
data = df['Poetry'].dropna().tolist()

# Normalize text (convert special characters)
def normalize_text(text):
    # Mapping for special characters to normalize them
    char_map = {'ġ': 'g', 'ḳ': 'k', 'ñ': 'n', 'ā': 'a', 'ī': 'i', 'ū': 'u',
                'ṣ': 's', 'ḥ': 'h', 'ṭ': 't', 'ḍ': 'd', 'ṅ': 'n', 'ṇ': 'n', 'ṁ': 'm'}

    # Replace special characters based on the char_map
    for special_char, replacement in char_map.items():
        text = text.replace(special_char, replacement)

    # Replace '-e-' with a special token '<e_token>'
    text = text.replace('-e-', ' <e_token> ')

    # Remove all punctuation (except '<e_token>')
    text = re.sub(r'[^\w\s<e_token>]', '', text)

    # Convert text to lowercase for consistency
    text = text.lower()

    return text

# Apply normalization to each line of data
data = [normalize_text(line) for line in data]

# Print example data
print("Example Preprocessed Line:", data[0])


Example Preprocessed Line: aankh se duur na ho dil se utar jaega 
vaqt ka kya hai guzarta hai guzar jaega 
itna manus na ho khalvat <e_token> gham se apni 
tu kabhi khud ko bhi dekhega to dar jaega 
dubte dubte kashti ko uchhala de duun 
main nahin koi to sahil pe utar jaega 
zindagi teri ata hai to ye jaane vaala 
teri bakhshish tiri dahliz pe dhar jaega 
zabt lazim hai magar dukh hai qayamat ka faraz 
zalim ab ke bhi na roega to mar jaega


In [None]:
# ===============================
# 📌 Tokenization & Sequences
# ===============================

# Tokenize dataset
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data)
vocab_size = len(tokenizer.word_index) + 1

# Create input sequences
input_sequences = []
for line in data:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Reduce sequence length (max 50 words)
max_seq_len = min(100, max(len(seq) for seq in input_sequences))
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Split into X (input) and y (labels)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

print("Tokenized Vocabulary Size:", vocab_size)
print("Max Sequence Length:", max_seq_len)
print("Shape of X (predictors):", X.shape)
print("Shape of y (labels):", y.shape)


Tokenized Vocabulary Size: 12400
Max Sequence Length: 100
Shape of X (predictors): (182650, 99)
Shape of y (labels): (182650,)


In [None]:
# ===============================
# 📌 Improved Training Setup
# ===============================

# Define Callbacks with more patience for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.001, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-5)

# Modify model to have more LSTM units and layers
model = tf.keras.Sequential([
    Embedding(vocab_size, 128, input_length=max_seq_len - 1),  # Increase embedding size
    LSTM(256, return_sequences=True),  # Increase LSTM units
    Dropout(0.3),
    LSTM(256),  # Second LSTM layer
    Dropout(0.3),
    Dense(256, activation='relu'),  # Additional Dense layer
    Dropout(0.3),
    Dense(vocab_size, activation='softmax')  # Output layer prob
])

# Define custom perplexity metric
def perplexity(y_true, y_pred):
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    return tf.exp(tf.reduce_mean(cross_entropy))

# Compile the model with a smaller learning rate for smoother optimization
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=0.0005),  # Slightly lower learning rate
              metrics=[perplexity])


In [None]:
# ===============================
# 📌 Train Model with Callbacks
# ===============================

# Train model for more epochs
history = model.fit(X, y,
                    epochs=20,  # Train for more epochs
                    batch_size=16,
                    validation_split=0.2,
                    callbacks=[reduce_lr])


Epoch 1/20
[1m9133/9133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 18ms/step - loss: 6.5426 - perplexity: 1107.4093 - val_loss: 6.6804 - val_perplexity: 1268.2993 - learning_rate: 5.0000e-04
Epoch 2/20
[1m9133/9133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 19ms/step - loss: 6.1104 - perplexity: 640.9030 - val_loss: 6.7324 - val_perplexity: 1647.1924 - learning_rate: 5.0000e-04
Epoch 3/20
[1m9133/9133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 18ms/step - loss: 5.9964 - perplexity: 538.2963 - val_loss: 6.7698 - val_perplexity: 1899.1880 - learning_rate: 5.0000e-04
Epoch 4/20
[1m9133/9133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 18ms/step - loss: 5.9307 - perplexity: 510.6794 - val_loss: 6.7586 - val_perplexity: 2069.5815 - learning_rate: 5.0000e-04
Epoch 5/20
[1m9133/9133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 18ms/step - loss: 5.8518 - perplexity: 482.3285 - val_loss: 6.7785 - val_perplexity: 2246.6189 - learn

In [None]:
# ===============================
# 📌 Evaluate Model Perplexity
# ===============================
loss, ppl = model.evaluate(X, y, verbose=0)
print("Final Loss:", loss)
print("Final Perplexity:", ppl)


Final Loss: 5.68950891494751
Final Perplexity: 681.3063354492188


In [None]:
import numpy as np
import random

def generate_text(seed_word, next_words, max_seq_len, temperature=0.8):
    """
    Generates two related verses from a single input word.
    """

    def generate_verse(seed_text, next_words, max_seq_len, temperature):
        """
        Generates a single verse using temperature-based sampling.
        """
        for _ in range(next_words):
            # Tokenize and pad sequence
            token_list = tokenizer.texts_to_sequences([seed_text])[0]
            token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')

            # Predict next word probabilities
            predicted_probs = model.predict(token_list, verbose=0)[0]

            # Apply temperature-based sampling
            predicted_probs = np.log(predicted_probs + 1e-8) / temperature
            predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

            # Sample word using probability distribution
            predicted_index = np.random.choice(range(len(predicted_probs)), p=predicted_probs)
            output_word = tokenizer.index_word.get(predicted_index, "")

            # Append the predicted word to seed text
            seed_text += " " + output_word

        return seed_text

    # Generate the first verse
    verse1 = generate_verse(seed_word, next_words, max_seq_len, temperature)

    # Use the last few words of the first verse as a seed for the second verse
    last_words = " ".join(verse1.split()[0:])  # Taking last 3 words as seed
    verse2 = generate_verse(last_words, next_words, max_seq_len, temperature)

    return verse1, verse2



In [67]:

# Example Usage
seed_word = "ishq"
verse1, verse2 = generate_text(seed_word, next_words=10, max_seq_len=15, temperature=0.8)


print("Generated Verse 2:", verse2)


Generated Verse 2: ishq liye milta nahin dillagi talak jaan bana kahte hain ki kami na kabhi hua ham se ye ek dil karna


In [None]:
# ===============================
# 📌 Save Model (TensorFlow Format)
# ===============================

# Save the model using the `.keras` extension (Recommended)
model_save_path = "/content/roman_urdu_poetry_model.keras"
model.save(model_save_path)

print(f"Model saved successfully at {model_save_path}")


Model saved successfully at /content/roman_urdu_poetry_model.keras


In [68]:
import pickle

# Assuming `tokenizer` is already trained
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
