In [None]:

import pandas as pd
import numpy as np
import pickle
import re
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Step 2.1 setup: make sure the folder that receives the processed arrays
# exists before any work is done (no error if it is already there).
os.makedirs(name='../data/processed', exist_ok=True)

# Text cleaning and preprocessing for the Urdu poetry dataset.
print(" Step 2.1: Loading and Cleaning Text...")

# Raw poems produced by Step 1; the 'content' column is cleaned below.
df = pd.read_csv(filepath_or_buffer='../data/urdu_poetry_raw.csv')

# Character-level normalization table: Arabic code points that commonly appear
# in Urdu text are mapped to the letter conventionally used in Urdu.
# The table is written with \u escapes so it does not depend on how the bytes
# of this source file happen to be decoded.
_URDU_CHAR_MAP = str.maketrans({
    "\u0643": "\u06a9",  # ARABIC LETTER KAF -> ARABIC LETTER KEHEH (Urdu Kaaf)
    "\u064a": "\u06cc",  # ARABIC LETTER YEH -> ARABIC LETTER FARSI YEH (Urdu Yaa)
    "\u0649": "\u06cc",  # ARABIC LETTER ALEF MAKSURA -> ARABIC LETTER FARSI YEH
    "\u0647": "\u06c1",  # ARABIC LETTER HEH -> ARABIC LETTER HEH GOAL (Urdu He)
    "\u06c0": "\u06c1",  # ARABIC LETTER HEH WITH YEH ABOVE -> HEH GOAL
    "\u0624": "\u0648",  # ARABIC LETTER WAW WITH HAMZA ABOVE -> WAW
    "\u0625": "\u0627",  # ARABIC LETTER ALEF WITH HAMZA BELOW -> ALEF
})

# Patterns are compiled once because the function is applied to every poem.
_ASCII_ALNUM_RE = re.compile(r'[a-zA-Z0-9]')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_urdu_text(text):
    """Normalize and strip a raw Urdu poem down to space-separated words.

    Steps, in order:
      1. Unify Arabic/Urdu variants of Kaaf, Yaa, He, Waw and Alef.
      2. Delete ASCII letters and ASCII digits.
      3. Delete every character that is neither a word character nor
         whitespace (punctuation, symbols, and combining diacritics, which
         Python's ``\\w`` does not match).
      4. Collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The raw poem. Anything that is not a ``str`` (e.g. NaN from
            pandas) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is not a string or
        nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    # One pass over the string instead of a chain of .replace() calls.
    text = text.translate(_URDU_CHAR_MAP)

    # Removing English characters and numbers.
    text = _ASCII_ALNUM_RE.sub('', text)

    # Removing punctuation to focus on word flow.
    # NOTE(review): characters are deleted, not replaced by a space, so two
    # words separated only by punctuation end up joined.
    text = _NON_WORD_RE.sub('', text)

    # Removing extra whitespace.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Run the cleaner over every poem; the result lives in a new column so the
# raw 'content' column stays available.
df['cleaned_text'] = df['content'].map(clean_urdu_text)

# Drop poems for which nothing survived cleaning (cleaner returned "").
df = df[df['cleaned_text'].ne("")]
print(f" Text Cleaned. Remaining Poems: {len(df)}")

#  TOKENIZATION
print("\n Step 2.2: Tokenization...")

# The corpus is a plain list with one cleaned poem per entry.
corpus = df['cleaned_text'].to_list()

# The tokenizer maps each word (key) to a unique integer (value) so that text
# can be turned into integer sequences for the model.
# No num_words limit is set, to keep the full vocabulary; cap it (e.g. top
# 5000) if out-of-memory errors occur later.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the learned vocabulary, reserved for padding.
total_words = 1 + len(tokenizer.word_index)
print(f" Vocabulary Size (Total Words): {total_words}")

# GENERATING N-GRAM SEQUENCES
print("\n Step 2.3: Generating Input Sequences (N-grams)...")

# Every poem is expanded into its growing prefixes: the first 2 tokens, the
# first 3 tokens, ... up to the whole poem. A poem with k tokens therefore
# contributes k-1 sequences (none when k < 2).
input_sequences = []
for token_ids in tokenizer.texts_to_sequences(corpus):
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

print(f"Total Sequences Generated: {len(input_sequences)}")

# PADDING SEQUENCES
print("\n Step 2.4: Padding Sequences...")

# PROJECT BASELINE: every sequence is forced to exactly this many tokens.
BASELINE_SEQ_LEN = 20

print(f"Enforcing Baseline Sequence Length: {BASELINE_SEQ_LEN}")

# padding='pre'    -> shorter sequences are filled at the front.
# truncating='pre' -> longer sequences lose their oldest tokens, i.e. the
#                     LAST 20 tokens are kept.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=BASELINE_SEQ_LEN,
    padding='pre',
    truncating='pre',
)
input_sequences = np.array(input_sequences)

# Predictors (X) are all tokens but the last; the label (y) is the last token.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# SPLITTING DATA
# The banner uses the same plain " Step 2.x" prefix as the other step banners
# in this script (the previous literal contained mis-encoded characters that
# were printed verbatim).
print("\n Step 2.5: Splitting Data (80/10/10)...")

# NOTE(review): the rows of X are overlapping prefixes of the same poems, so a
# row-level random split can place near-identical prefixes of one poem in both
# the training set and the validation/test sets. If held-out metrics need to
# be leak-free, split by poem before generating sequences -- confirm with the
# project owner before changing, since it alters the reported set sizes.

# First split: 80% Train, 20% Temp (Val + Test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: halve Temp into Val and Test (10% of the total each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training Set: {X_train.shape}")
print(f"Validation Set: {X_val.shape}")
print(f"Test Set:      {X_test.shape}")

# SAVING PROCESSED DATA
print("\n Saving Processed Data...")

# The script only creates '../data/processed'; make sure the folder for the
# tokenizer exists too, otherwise open(..., 'wb') raises FileNotFoundError on
# a fresh checkout.
os.makedirs('../models', exist_ok=True)

# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded later.
with open('../models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# All three splits plus the two scalars go into a single .npz archive.
np.savez('../data/processed/ready_data.npz',
         X_train=X_train, y_train=y_train,
         X_val=X_val, y_val=y_val,
         X_test=X_test, y_test=y_test,
         max_sequence_len=BASELINE_SEQ_LEN,
         total_words=total_words)

print(" Step 2 Complete. Tokenizer and Data saved successfully.")

🔵 Step 2.1: Loading and Cleaning Text...
✅ Text Cleaned. Remaining Poems: 1314

🔵 Step 2.2: Tokenization...
✅ Vocabulary Size (Total Words): 10225

🔵 Step 2.3: Generating Input Sequences (N-grams)...
Total Sequences Generated: 171897

🔵 Step 2.4: Padding Sequences...
Enforcing Baseline Sequence Length: 20
Shape of X: (171897, 19)
Shape of y: (171897,)

🔵 Step 2.5: Splitting Data (80/10/10)...
Training Set: (137517, 19)
Validation Set: (17190, 19)
Test Set:      (17190, 19)

💾 Saving Processed Data...
✅ Step 2 Complete. Tokenizer and Data saved successfully.
