In [5]:
# ! pip uninstall nltk
# ! pip install nltk


In [6]:
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used below: the tokenizer models, the stop-word
# lists, and the WordNet corpus backing the lemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

def preprocess_text(page):
    """Clean one page record and return it alongside its sentence split.

    Args:
        page: Mapping that provides ``'text'`` (a string) and
            ``'page_number'``.

    Returns:
        A dict with four keys:
        ``page_number`` -- copied from the input unchanged;
        ``cleaned_text`` -- the surviving tokens joined by single spaces;
        ``original_text`` -- the input text with surrounding whitespace
        stripped;
        ``sentences`` -- the stripped text split into sentences.

    A token survives when, after lowercasing, it is purely alphanumeric and
    is not an English stop word; survivors are lemmatized before joining.
    """
    raw_text = page['text'].strip()
    number = page['page_number']

    # Sentence and word tokenization both run on the stripped, original-case
    # text; lowercasing happens per token afterwards.
    sentence_list = sent_tokenize(raw_text, language="english")
    tokens = word_tokenize(raw_text)

    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept = []
    for token in tokens:
        lowered = token.lower()
        # The isalnum() check is what removes punctuation and any token
        # containing a special character (hyphens, apostrophes, ...).
        if not lowered.isalnum() or lowered in english_stops:
            continue
        kept.append(lemmatizer.lemmatize(lowered))

    return {
        "page_number": number,
        "cleaned_text": ' '.join(kept),
        "original_text": raw_text,
        "sentences": sentence_list,
    }

# Function to process each page in the JSON file and save the result to a new file
def process_and_save_pages(input_file_path, output_file_path):
    """Preprocess every page in a JSON file and write the results to a new file.

    Args:
        input_file_path: Path to a JSON file whose top-level value is iterated
            page by page; each item is handed to ``preprocess_text``.
        output_file_path: Path of the JSON file to write. It receives a single
            object of the form ``{"pages": [...]}`` holding the processed
            pages in input order, pretty-printed with a 4-space indent.

    Returns:
        None. The result is only written to ``output_file_path``.
    """
    # Pin UTF-8 explicitly: without it open() falls back to the platform
    # locale encoding (often a legacy code page on Windows), which mis-decodes
    # or rejects non-ASCII text in the input.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_pages = [preprocess_text(page) for page in data]

    # json.dump escapes non-ASCII characters by default, so the bytes written
    # are the same as before; the explicit encoding just keeps both ends of
    # the pipeline consistent.
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump({"pages": processed_pages}, outfile, indent=4)

# Example usage: read the raw pages and write the cleaned pages alongside them.
input_file_path = '../data/data.json'  # Input: JSON file containing the pages
output_file_path = '../data/final_clean.json'  # Output: cleaned pages as JSON

process_and_save_pages(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)

# # Print each processed page from the saved output (optional, just for verification)
# with open(output_file_path, 'r', encoding='utf-8') as file:
#     for page in json.load(file)["pages"]:
#         print(page)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
