In [1]:
# Optional: reinstall NLTK if the import in the next cell fails (uncomment and run once).
# %pip uninstall -y nltk
# %pip install nltk


In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources if they are not already downloaded

nltk.download('punkt')  # This should include the required sentence tokenizer
nltk.download('stopwords')
nltk.download('punkt_tab')

nltk.download('wordnet')

def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once.

    The result is memoised on the function object so that processing many
    pages does not re-read the stop-word corpus for every page.
    """
    cached = getattr(_english_stop_words, "_cached", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def preprocess_text(page):
    """Clean one extracted page and return it alongside its sentence split.

    Args:
        page: Mapping with at least the keys ``'text'`` and ``'page_number'``.
            A ``'text'`` value of ``None`` is treated as an empty page.

    Returns:
        dict with the keys:
            ``page_number``   - copied from ``page['page_number']``.
            ``cleaned_text``  - lowercased, stop-word-free, lemmatized tokens
                                joined by single spaces.
            ``original_text`` - the page text with surrounding whitespace
                                stripped.
            ``sentences``     - sentences of the stripped text as produced by
                                ``sent_tokenize``.

    Raises:
        KeyError: if ``'text'`` or ``'page_number'`` is missing from ``page``.
    """
    # Step 1: Remove unnecessary whitespace. Pages with no extracted text
    # (None) become an empty string instead of raising AttributeError.
    text = (page['text'] or '').strip()

    # Step 2: Extract page number
    page_number = page['page_number']

    # Step 3: Tokenization
    sentences = sent_tokenize(text, language="english")
    raw_tokens = word_tokenize(text)

    # Steps 4-5 and 7: lowercase, then keep only purely alphanumeric tokens
    # that are not stop words. Note that isalnum() also drops tokens that
    # contain punctuation, e.g. decimals such as "35.74".
    stop_words = _english_stop_words()
    lowered = (token.lower() for token in raw_tokens)
    filtered_words = [
        token for token in lowered
        if token.isalnum() and token not in stop_words
    ]

    # Step 6: Lemmatization
    lemmatizer = WordNetLemmatizer()
    cleaned_text = ' '.join(lemmatizer.lemmatize(token) for token in filtered_words)

    # Return a dictionary with processed information
    return {
        "page_number": page_number,
        "cleaned_text": cleaned_text,
        "original_text": text,
        "sentences": sentences
    }

# Sample input: one extracted page. The text is written as adjacent string
# literals (one per source line of the page) that concatenate into a single
# string.
page_data = {
    "page_number": 395,
    "section": None,
    "title": None,
    "text": (
        "364 THE CONSTITUTION OF INDIA\n"
        "(Appendix I)\n"
        "1 2 3 4 5 6\n"
        "107. Chhoto 148 Bhurungamari Dinhata 35.74\n"
        "Garaljhora I\n"
        "108. 1 chhit without Patgram Mathabhanga 3.5\n"
        "name & JL No.\n"
        "at the southern\n"
        "and of JL No. 38\n"
        "& southern and\n"
        "of JL No. 39\n"
        "(locally known\n"
        "as Ashokabari*)\n"
        "Enclaves with Fragmented Chhits\n"
        "... more text ..."
    ),
}

# Preprocess the sample page and show the resulting dictionary
processed_page = preprocess_text(page_data)
print(processed_page)



[nltk_data] Downloading package punkt to C:\Users\pratham
[nltk_data]     angdalwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\pratham
[nltk_data]     angdalwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\pratham
[nltk_data]     angdalwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\pratham
[nltk_data]     angdalwar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'page_number': 395, 'cleaned_text': '364 constitution india appendix 1 2 3 4 5 6 107 chhoto 148 bhurungamari dinhata garaljhora 108 1 chhit without patgram mathabhanga name jl southern jl 38 southern jl 39 locally known ashokabari enclave fragmented chhits text', 'original_text': '364 THE CONSTITUTION OF INDIA\n(Appendix I)\n1 2 3 4 5 6\n107. Chhoto 148 Bhurungamari Dinhata 35.74\nGaraljhora I\n108. 1 chhit without Patgram Mathabhanga 3.5\nname & JL No.\nat the southern\nand of JL No. 38\n& southern and\nof JL No. 39\n(locally known\nas Ashokabari*)\nEnclaves with Fragmented Chhits\n... more text ...', 'sentences': ['364 THE CONSTITUTION OF INDIA\n(Appendix I)\n1 2 3 4 5 6\n107.', 'Chhoto 148 Bhurungamari Dinhata 35.74\nGaraljhora I\n108.', '1 chhit without Patgram Mathabhanga 3.5\nname & JL No.', 'at the southern\nand of JL No.', '38\n& southern and\nof JL No.', '39\n(locally known\nas Ashokabari*)\nEnclaves with Fragmented Chhits\n... more text ...']}
