## Remove HTML

In [6]:
from bs4 import BeautifulSoup
import unicodedata
import re
import json

In [None]:
def remove_html(text):
    """Return the text content of *text* with HTML markup stripped.

    The input is parsed with BeautifulSoup using the built-in
    ``"html.parser"`` backend and the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()


def keep_urdu(text):
    """Delete every character that is not Arabic-script text or whitespace.

    Characters in U+0600-U+06FF, any whitespace, and ASCII ``!`` / ``?`` are
    kept; everything else (Latin letters, ASCII digits, other punctuation)
    is removed.  The extra U+0660-U+0669 range and the two literal
    punctuation marks in the pattern already fall inside U+0600-U+06FF.
    """
    unwanted = re.compile(r"[^\u0600-\u06FF\u0660-\u0669\s۔،!?]")
    return unwanted.sub("", text)


def normalize_unicode(text):
    """Return *text* in NFC form with two letter variants unified.

    After NFC normalisation, every ``ي`` is replaced by ``ی`` and every
    ``ك`` by ``ک`` so that both spellings of each letter map to one
    code point.
    """
    letter_map = str.maketrans({"ي": "ی", "ك": "ک"})
    composed = unicodedata.normalize("NFC", text)
    return composed.translate(letter_map)


def clean_spacing(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    trimmed = text.strip()
    # With the ends already trimmed, only interior runs remain to collapse.
    return re.sub(r"\s+", " ", trimmed)


def add_special_tokens(text):
    """
    Mark sentence, paragraph and story boundaries in *text*.

    The markers are single control characters, not literal "<EOS>"-style
    strings:

    * EOS (U+0003) is inserted after every run of sentence-ending
      punctuation (``۔``, ``؟`` or ``!``).
    * EOP (U+0004) replaces every newline character.
    * EOT (U+0005) is appended once at the very end.

    All whitespace is then collapsed so that markers and words are separated
    by exactly one space.

    Args:
        text: The story text; newlines are interpreted as paragraph breaks.

    Returns:
        The annotated text, always ending in the EOT marker.  Empty or
        whitespace-only input yields just the EOT marker.
    """
    EOS = "\u0003"   # End of sentence
    EOP = "\u0004"   # End of paragraph
    EOT = "\u0005"   # End of story

    # Tag a whole run of terminal marks once.  Substituting after each single
    # mark first would split an ellipsis (۔۔۔) apart, leaving nothing for a
    # later ellipsis rule to match and producing three EOS markers.
    text = re.sub(r"([۔؟!]+)", r"\1 " + EOS, text)

    # Paragraph breaks → EOP
    text = text.replace("\n", " " + EOP + " ")

    # Clean spacing
    text = re.sub(r"\s+", " ", text).strip()

    # End of story (no leading space when there is no text before it)
    if not text:
        return EOT
    return text + " " + EOT

def preprocess_text(text):
    """Run the full cleaning pipeline on one raw story.

    Steps, in order: strip HTML, drop non-Urdu characters, normalise
    Unicode, tidy whitespace, then add the boundary markers.

    Whitespace is tidied one line at a time and the non-empty lines are
    re-joined with a newline.  Passing the whole text through
    ``clean_spacing`` would turn every newline into a space, and
    ``add_special_tokens`` would then have no paragraph breaks left to mark.

    Args:
        text: Raw story content, possibly containing HTML.

    Returns:
        The cleaned text with the markers added by ``add_special_tokens``.
    """
    text = remove_html(text)
    text = keep_urdu(text)
    text = normalize_unicode(text)

    # Split on runs of line breaks; lines that are empty after cleaning are
    # dropped so blank lines do not become extra paragraph breaks.
    cleaned_lines = (clean_spacing(line) for line in re.split(r"[\r\n]+", text))
    text = "\n".join(line for line in cleaned_lines if line)

    text = add_special_tokens(text)
    return text

In [13]:
# Load the scraped stories; each entry is read for its "title" and "content".
with open("../scrapper/scrapped-stories/stories.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Clean every story, keeping the title alongside it for progress reporting.
preprocessed_data = []
preprocessed_data.extend(
    {"title": item["title"], "story": preprocess_text(item["content"])}
    for item in data
)

# Write the training corpus: one preprocessed story per line.
with open("preprocessed_stories.txt", "w", encoding="utf-8") as f:
    for item in preprocessed_data:
        story_line = item["story"] + "\n"
        f.write(story_line)
        print(f"Preprocessed story: {item['title']}")

    story_count = len(preprocessed_data)
    print(f"Total preprocessed stories: {story_count}")
print("Preprocessing completed. Preprocessed stories saved to 'preprocessed_stories.txt'.")

Preprocessed story: Gharoor Ki Saza - Article No. 2922
Preprocessed story: Jellyfish Loomi Ki Kahani - Article No. 2921
Preprocessed story: Purisrar Boorha - Article No. 2919
Preprocessed story: Naya Azm - Article No. 2918
Preprocessed story: Ghar Mein Mujrim - Article No. 2917
Preprocessed story: Jannat Ka Rasta - Article No. 2916
Preprocessed story: Bhooton Ka Naam - Article No. 2915
Preprocessed story: Imandari Ka Inaam - Article No. 2914
Preprocessed story: Reham Ka Sila - Article No. 2913
Preprocessed story: Hiran Ki Laparwahi - Article No. 2911
Preprocessed story: Himmat - Article No. 2910
Preprocessed story: Karo Meharbani Tum Ahle Zameen Par - Article No. 2909
Preprocessed story: Panda Aur Smart Phone - Article No. 2908
Preprocessed story: Qalam Ki Khwahish - Article No. 2907
Preprocessed story: Anmol Dosti - Article No. 2906
Preprocessed story: Nani Ka Saman - Article No. 2905
Preprocessed story: Khali Pinjra - Article No. 2904
Preprocessed story: Taqatwar Bewaqoof - Article N