In [7]:
import PyPDF2
import re

# Load the PDF
pdf_path = "rekhtaData.pdf"

with open(pdf_path, "rb") as f:
    reader = PyPDF2.PdfReader(f)
    text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])  # Extract text

# Debugging Step: Print some extracted text
print("Extracted Text (First 500 chars):\n", text[:500])  

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Cleaned text
cleaned_text = clean_text(text)

# Save cleaned data
with open("cleaned_poetry.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Sample Cleaned Text (First 500 chars):\n", cleaned_text[:500])


Extracted Text (First 500 chars):
 ID Poet Poetry
1ahmad-faraz aañkh  se duur na ho dil se utar jā.egā
vaqt kā kyā hai guzartā  hai guzar  jā.egā
itnā mānūs  na ho ḳhalvat-e-ġham  se apnī
tū kabhī  ḳhud  ko bhī dekhegā  to Dar jā.egā
Dūbte  Dūbte  kashtī  ko uchhālā  de duuñ
maiñ  nahīñ  koī to sāhil pe utar jā.egā
zindagī  terī atā hai to ye jaane  vaalā
terī baḳhshish  tirī dahlīz  pe dhar jā.egā
zabt lāzim  hai magar  dukh  hai qayāmat  kā 'farāz'
zālim  ab ke bhī na ro.egā  to mar jā.egā2ahmad-faraz āshiqī  meñ 'mīr' jaise ḳh
Sample Cleaned Text (First 500 chars):
 id poet poetry ahmadfaraz aañkh se duur na ho dil se utar jāegā vaqt kā kyā hai guzartā hai guzar jāegā itnā mānūs na ho ḳhalvateġham se apnī tū kabhī ḳhud ko bhī dekhegā to dar jāegā dūbte dūbte kashtī ko uchhālā de duuñ maiñ nahīñ koī to sāhil pe utar jāegā zindagī terī atā hai to ye jaane vaalā terī baḳhshish tirī dahlīz pe dhar jāegā zabt lāzim hai magar dukh hai qayāmat kā farāz zālim ab ke bhī na roegā to mar jāegā

In [9]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([cleaned_text])
total_words = len(tokenizer.word_index) + 1  # +1 for padding token

# Convert Text to Sequences
input_sequences = []
words = cleaned_text.split()

for i in range(1, min(5000, len(words))):  # Limit to 5000 words for efficiency
    n_gram_sequence = words[:i+1]  # Create n-grams
    encoded = tokenizer.texts_to_sequences([" ".join(n_gram_sequence)])[0]
    input_sequences.append(encoded)

# Reduce Max Sequence Length to 50
max_sequence_length = min(50, max([len(seq) for seq in input_sequences]))

# Padding Sequences
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding="pre")

# Split into Features (X) and Labels (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y)

# Print some stats
print(f"Total Words: {total_words}")
print(f"Max Sequence Length: {max_sequence_length}")
print(f"Number of Training Samples: {len(X)}")


Total Words: 18063
Max Sequence Length: 50
Number of Training Samples: 4999


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# Define GRU Model
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length - 1),  # Word Embeddings
    GRU(256, return_sequences=True),  # First GRU layer
    GRU(256),  # Second GRU layer
    Dense(total_words, activation="softmax")  # Output layer
])

# Compile Model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the Model
epochs = 50  # You can increase for better results
history = model.fit(X, y, epochs=epochs, verbose=1)

# Save the Model  
model.save("urdu_poetry_model.h5")

print("🎉 Model training complete and saved as 'urdu_poetry_model.h5'!")




Epoch 1/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 213ms/step - accuracy: 0.0337 - loss: 8.1534
Epoch 2/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 206ms/step - accuracy: 0.0334 - loss: 6.2434
Epoch 3/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 218ms/step - accuracy: 0.0323 - loss: 6.0906
Epoch 4/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 224ms/step - accuracy: 0.0362 - loss: 6.0626
Epoch 5/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 208ms/step - accuracy: 0.0247 - loss: 5.9701
Epoch 6/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m588s[0m 4s/step - accuracy: 0.0331 - loss: 5.9300
Epoch 7/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 222ms/step - accuracy: 0.0359 - loss: 5.8364
Epoch 8/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 201ms/step - accuracy: 0.0419 - loss: 5.7401
Epoch 9/50
[1m157

In [27]:
import numpy as np

def generate_poetry(seed_text, next_words=20, temperature=0.8):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding="pre")
        
        predictions = model.predict(token_list)[0]
        predictions = np.log(predictions + 1e-7) / temperature  # Apply temperature scaling
        exp_preds = np.exp(predictions)
        probabilities = exp_preds / np.sum(exp_preds)

        predicted = np.random.choice(len(probabilities), p=probabilities)  # Sample from the probability distribution
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += " " + output_word
    return seed_text

# Example usage
print(generate_poetry("aankh se door", next_words=20, temperature=0.5))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [29]:
!pip install gradio


Collecting gradio
  Using cached gradio-5.16.0-py3-none-any.whl.metadata (16 kB)
Using cached gradio-5.16.0-py3-none-any.whl (62.2 MB)
Installing collected packages: gradio
Successfully installed gradio-5.16.0
