In [6]:
import re
import nltk
import spacy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from tqdm import tqdm
import json

In [7]:
_STOP_WORDS_CACHE = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """
    Normalizes a raw (possibly HTML) string into a cleaned, space-joined string.

    Steps: lowercase -> strip HTML tags -> replace punctuation with spaces ->
    drop digits -> tokenize -> remove English stop words -> spell-correct ->
    detokenize.

    Relies on two module-level objects that are not defined in this cell:
    `nlp` (presumably a loaded spaCy pipeline) and `spell` (presumably a
    `SpellChecker` instance) -- confirm they are created before this is called.

    Args:
    - text (str | None): The raw text. `None` or an empty string yields "".

    Returns:
    - str: The preprocessed text.
    """
    # Nothing to process; also avoids an AttributeError on None content.
    if not text:
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. Remove HTML tags. A separator is required, otherwise the text of
    #    adjacent elements is fused ("<p>a</p><p>b</p>" -> "ab").
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # 3. Remove punctuation (replaced by a space so words are not joined)
    text = re.sub(r"[^\w\s]", " ", text)

    # 4. Remove numbers
    text = re.sub(r"\d+", "", text)

    # 5. Tokenization
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_space]

    # 6. Remove stopwords (the stop-word set is cached across calls)
    stop_words = _english_stop_words()
    tokens = [
        token
        for token in tqdm(tokens, desc="Removing Stopwords")
        if token not in stop_words
    ]

    # 7. Spell Checking and Correction
    #    - Corrections are memoized per call: the lookup is expensive and
    #      tokens repeat heavily within a document.
    #    - When the checker has no suggestion (it may return None), keep the
    #      original token instead of silently dropping the word.
    corrections = {}
    corrected_tokens = []
    for token in tqdm(tokens, desc="Spell Checking and Correction"):
        if token not in corrections:
            suggestion = spell.correction(token)
            corrections[token] = suggestion if suggestion else token
        corrected_tokens.append(corrections[token])
    tokens = corrected_tokens

    # 8. Lemmatization is currently disabled. To enable it (SpaCy lemmas):
    # doc = nlp(' '.join(tokens))
    # tokens = [token.lemma_ for token in doc]

    # Defensive: never hand None/empty items to the detokenizer.
    tokens = [token for token in tokens if token is not None and token != ""]

    # 9. Detokenize the tokens back into a single string
    text = TreebankWordDetokenizer().detokenize(tokens)

    return text

In [8]:
def load_crawled_data(input_file):
    """
    Loads previously crawled data from a JSON file.

    Args:
    - input_file (str): The path to the UTF-8 encoded JSON file to read.

    Returns:
    - The parsed JSON content, exactly as produced by `json.load`.
    """
    with open(input_file, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

In [9]:
def preprocess_crawled_data(pdf_documents):
    """
    Runs `preprocess_text` over the 'content' field of every document.

    The documents are modified in place; the same container is returned
    for convenience.

    Args:
    - pdf_documents (iterable of dict): Documents, each with a 'content' key.

    Returns:
    - The `pdf_documents` object that was passed in.
    """
    for record in pdf_documents:
        raw_content = record['content']
        record['content'] = preprocess_text(raw_content)
    return pdf_documents

In [10]:
def save_crawled_data_to_json(data, output_file):
    """
    Saves the crawled or preprocessed data into a JSON file.

    The data is serialized to a string *before* the output file is opened, so
    a serialization failure (e.g. a non-JSON-serializable value) cannot
    truncate or corrupt an already existing output file.

    Failures are reported with a printed message rather than raised.

    Args:
    - data (list of dict): The data to save.
    - output_file (str): The path to the output JSON file.
    """
    try:
        # Serialize first: opening with "w" truncates the file immediately.
        payload = json.dumps(data, indent=4, ensure_ascii=False)
        with open(output_file, "w", encoding="utf-8") as json_file:
            json_file.write(payload)
        print(f"Data successfully saved to {output_file}")
    except Exception as e:
        print(f"An error occurred while saving to JSON: {e}")