In [8]:
import re
from nltk.corpus import stopwords
import json
from nltk.stem import PorterStemmer

# English stopwords held in a set so the per-token membership test in clean_text is a hash lookup
STOPWORDS = {stopword for stopword in stopwords.words('english')}

def clean_text(text: str) -> str:
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text; drop bracketed spans such as ``[1]``;
    replace runs of non-word characters with a single space; delete non-ASCII
    characters; drop English stopwords; Porter-stem every remaining token.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces, or an empty string when no
        token survives the filtering.
    """
    # convert to lowercase
    lower_cased = text.lower()
    # remove references in square brackets (non-greedy; `.` does not cross newlines)
    no_references = re.sub(r'\[.*?\]', '', lower_cased)
    # collapse runs of non-word characters into one space (note: `\W` leaves underscores in place)
    alphanumeric = re.sub(r'\W+', ' ', no_references)
    # remove any remaining non-ASCII characters
    alphanumeric = re.sub(r'[^\x00-\x7F]+', '', alphanumeric)
    # build the stemmer once per call rather than once per word
    stemmer = PorterStemmer()
    # drop stopwords and stem the survivors in a single pass over the tokens
    return ' '.join(stemmer.stem(word) for word in alphanumeric.split() if word not in STOPWORDS)

def clean_page(page: dict) -> dict:
    """Return a cleaned copy of a page dict, applying clean_text to its text fields.

    Args:
        page: Dict read through the keys 'url', 'title', 'summary',
            'summary_ids' (only when 'summary' is truthy) and 'content'.
            Each item of 'content' is read through the keys 'type', 'title',
            'paragraphs' and 'ids'.

    Returns:
        A new dict with 'url' (copied as-is), 'title' (cleaned), 'content'
        (list of cleaned sections) and, only when the page has a non-empty
        summary, 'summary_ids' and 'summary'. Sections whose 'paragraphs' is
        empty are skipped; paragraphs that clean to an empty string are
        dropped from the section.
    """
    cleaned_page = {}
    # the URL is kept verbatim; only the title is cleaned
    cleaned_page['url'], cleaned_page['title'] = page['url'], clean_text(page['title'])
    # clean the summary paragraphs (these keys are absent from the result when there is no summary)
    if page['summary']:
        cleaned_page['summary_ids'] = page['summary_ids']
        cleaned_page['summary'] = [clean_text(paragraph) for paragraph in page['summary']]
    # clean the content sections
    cleaned_page['content'] = []
    for section in page['content']:
        if not section['paragraphs']:
            continue
        cleaned_section = {}
        cleaned_section['type'] = section['type']
        # clean the section title
        cleaned_section['title'] = clean_text(section['title'])
        # clean each paragraph exactly once, then filter on the cleaned result
        cleaned_paragraphs = [clean_text(paragraph) for paragraph in section['paragraphs']]
        ids = section['ids']
        # NOTE(review): assumes 'ids' holds one entry per paragraph — confirm against the
        # producer of the raw file. When that shape holds, drop the ids of paragraphs that
        # cleaned to nothing so the two lists stay aligned; otherwise copy 'ids' untouched.
        if isinstance(ids, list) and len(ids) == len(cleaned_paragraphs):
            kept = [(cleaned, paragraph_id)
                    for cleaned, paragraph_id in zip(cleaned_paragraphs, ids)
                    if cleaned]
            cleaned_section['paragraphs'] = [cleaned for cleaned, _ in kept]
            cleaned_section['ids'] = [paragraph_id for _, paragraph_id in kept]
        else:
            cleaned_section['paragraphs'] = [cleaned for cleaned in cleaned_paragraphs if cleaned]
            cleaned_section['ids'] = ids
        cleaned_page['content'].append(cleaned_section)
    return cleaned_page

In [2]:
# load the raw input JSON; decode explicitly as UTF-8 so the result does not
# depend on the platform's default text encoding
with open('../data/raw_wiki_with_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [9]:
# run every loaded page through clean_page, preserving input order
cleaned_data = list(map(clean_page, data))

In [10]:
# write the cleaned pages to disk as JSON indented by four spaces
with open('../data/cleaned_wiki.json', mode='w') as out_file:
    json.dump(cleaned_data, fp=out_file, indent=4)