In [None]:
import os
import json
import nltk
import time

# Fetch the NLTK data packages that the tokenising cell relies on.
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('punkt')      # Punkt sentence models used by word_tokenize on older NLTK
# NLTK 3.9 and later load the Punkt models from the "punkt_tab" package instead;
# without it word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')

In [None]:
# Directory whose entries are listed and read, one index entry per file.
path_text_root = "data/scraped/cnet/articles/parsed"
# JSON file that the index is dumped to.
path_index_output = "data/scraped/cnet/parsed-articles-index.json"

In [None]:
# Merge all the text files into one big dictionary

def get_text(file_path: str) -> str:
	"""Return the whole contents of a text file as one string.

	The file is decoded as UTF-8 explicitly so the result does not depend on
	the platform's default locale encoding.
	NOTE(review): assumes the parsed articles were saved as UTF-8 — confirm
	against the step that writes them.

	Args:
		file_path: Path of the file to read.

	Returns:
		The decoded file contents.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the file is not valid UTF-8.
	"""
	with open(file_path, "r", encoding="utf-8") as f:
		return f.read()
	
index = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the index
# (and any JSON dumped from it) stable between runs.
filelist = sorted(
	name
	for name in os.listdir(path_text_root)
	# Skip hidden files (e.g. ".DS_Store", which would get the id "") and
	# anything that is not a regular file, such as sub-directories.
	if not name.startswith(".") and os.path.isfile(os.path.join(path_text_root, name))
)

for i, filename in enumerate(filelist):
	# The article id is the part of the file name before the first dot.
	article_id = filename.split(".")[0]
	# "\r" rewinds to the start of the line so the counter updates in place.
	print(f"{i+1}/{len(filelist)}   {article_id}        ", end="\r")
	file_path = os.path.join(path_text_root, filename)
	index[article_id] = get_text(file_path)

print("")
print(f"Read {len(index)} articles")

In [None]:
# Ids of the articles whose text is empty or whitespace-only.
empty_articles = [
	article_key
	for article_key, body in index.items()
	if not body.strip()
]
print(f"Found {len(empty_articles)} empty articles")

# Drop those entries from the index. The ids were collected first because a
# dict cannot change size while it is being iterated.
for article_id in empty_articles:
	index.pop(article_id)

print(f"Remaining {len(index)} articles")


In [None]:
# Write the index to a file. The with-block closes (and therefore flushes)
# the handle as soon as the dump finishes instead of leaving it open.
with open(path_index_output, "w", encoding="utf-8") as index_file:
	json.dump(index, index_file, indent=2)

In [None]:
# TODO: remove paragraphs that start with "see also"

In [None]:
def get_clean_tokens(text: str) -> list:
	"""Turn raw text into a list of lowercased, stemmed word tokens.

	Steps, per token produced by nltk.word_tokenize:
	  1. lowercase it,
	  2. drop it if it is an English stopword,
	  3. stem it with the Porter stemmer,
	  4. drop it unless the stem is purely alphanumeric (removes punctuation).

	Lowercasing happens before the stopword check so capitalised stopwords
	such as "The" at the start of a sentence are removed too.

	Args:
		text: The text to tokenise.

	Returns:
		The surviving stems, in their original order.
	"""
	# Tokenize
	tokens = nltk.word_tokenize(text)
	stopwords_set = set(nltk.corpus.stopwords.words('english'))
	stemmer = nltk.stem.PorterStemmer()

	clean_tokens = []
	for token in tokens:
		lowered = token.lower()
		# Remove stopwords (compared case-insensitively)
		if lowered in stopwords_set:
			continue
		# Stem
		stemmed = stemmer.stem(lowered)
		# Remove punctuation and any other non-alphanumeric tokens
		if stemmed.isalnum():
			clean_tokens.append(stemmed)
	return clean_tokens

time_start = time.time()
# Process the text in index
index_processed = {}
for i, (article_id, text) in enumerate(index.items()):
	time_elapsed = time.time() - time_start
	# Exactly `i` articles are finished at this point, so the average is taken
	# over `i`; before the first one completes there is nothing to average yet.
	time_per_article = time_elapsed / i if i else 0.0
	# The current article still has to be processed, hence len(index) - i.
	time_remaining = time_per_article * (len(index) - i)
	# "\r" rewinds to the start of the line so the status updates in place.
	print(f"{i+1}/{len(index)}: {article_id}   ; Time remaining: {time_remaining:.2f} seconds          ", end="\r")
	index_processed[article_id] = get_clean_tokens(text)
print("")
elapsed_time = time.time() - time_start
print(f"Processed {len(index)} articles in {elapsed_time:.2f} seconds")

In [None]:
# Write the processed index to a file. The with-block closes (and therefore
# flushes) the handle as soon as the dump finishes instead of leaving it open.
# NOTE(review): this is the same path the raw-text index is dumped to earlier
# in the notebook, so that file is replaced by the token lists — confirm this
# is intended.
with open(path_index_output, "w", encoding="utf-8") as index_file:
	json.dump(index_processed, index_file, indent=2)