# Text mining articles - exploration

In [None]:
import os
import json
import nltk
import time
import random
from datetime import datetime
import pandas as pd
from plotly import graph_objects as go

# Fetch the NLTK data packages used further down: the stopword lists and
# the "punkt" tokenizer data
for nltk_resource in ('stopwords', 'punkt'):
	nltk.download(nltk_resource)

In [None]:
# JSON index of the scraped articles (loaded with json.load below)
path_index = "data/scraped/cnet/index_articles.json"
# Directory with the parsed article texts, read below as "<article id>.txt"
path_text_root = "data/scraped/cnet/articles/parsed"

In [None]:
# Load index. The context manager closes the file as soon as parsing is done
# (a bare open() passed to json.load() leaves the handle open), and the JSON
# is decoded as UTF-8 instead of the platform's locale encoding.
with open(path_index, "r", encoding="utf-8") as index_file:
	index = json.load(index_file)
articles = index["articles"] # key: article id, value: article metadata

# Print the number of articles
print(f"Number of articles in the index: {len(articles)}")

In [None]:
# Show what one index entry looks like, using the first article id
print("Example article object:")
sample_id = list(articles)[0]
sample_article = articles[sample_id]
print(json.dumps(sample_article, indent=2))

In [None]:
# Collect the article file names into a dict (file name -> True) so that the
# membership checks further down are constant-time
article_filenames = dict.fromkeys(os.listdir(path_text_root), True)
print(f"Number of article files: {len(article_filenames)}")

In [None]:
# Use nltk to get the total word count (raw, unprocessed word tokens) and the
# total character count over all article files
total_character_count = 0
total_word_count = 0
file_count = len(article_filenames)

for position, filename in enumerate(article_filenames, start=1):
	# "\r" keeps the progress counter on a single line
	print(f"{position}/{file_count}       ", end="\r")
	with open(os.path.join(path_text_root, filename), "r") as file:
		text = file.read()
		total_word_count += len(nltk.word_tokenize(text))
		total_character_count += len(text)

print(f"Total word count: {total_word_count}")
print(f"Total character count: {total_character_count}")

In [None]:
# When developing it's useful to work with a small subset of articles
# as processing all 70k of them takes a while and consumes more than 11 GB of RAM

# Suggested VM RAM: 20 GB without sampling articles
#                   14 GB is consumed by this process alone

# Uncomment the following to sample:
# n = 10000
# article_filenames = random.sample(list(article_filenames.keys()), n)
# print(f"Sampling {n} articles")

In [None]:
# Split off the index entries that have no matching "<article id>.txt" file:
# remember them in removed_article_ids, then drop them from the index in place
removed_article_ids = {
	article_id: metadata
	for article_id, metadata in articles.items()
	if f"{article_id}.txt" not in article_filenames
}
for article_id in removed_article_ids:
	del articles[article_id]

print(f"Removed {len(removed_article_ids)} articles that are not in article_files")
print(f"Remaining number of articles: {len(articles)}")


In [None]:
def load_file(filepath: str, encoding: str = "utf-8") -> str:
	"""Return the entire contents of a text file as a single string.

	Args:
		filepath: Path of the file to read.
		encoding: Text encoding used to decode the file. Defaults to UTF-8 so
			that the result does not depend on the platform's locale encoding.

	Returns:
		The decoded file contents.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the file content is not valid in ``encoding``.
	"""
	with open(filepath, "r", encoding=encoding) as f:
		return f.read()

# Read the text of the sample article and show how it begins
sample_text_path = os.path.join(path_text_root, f"{sample_id}.txt")
sample_text = load_file(sample_text_path)
sample_text_preview = sample_text[:100]
print(f"Example article text (first 100 characters):\n{sample_text_preview}...")

In [None]:
# Split the sample text into word tokens
sample_tokens = nltk.word_tokenize(sample_text)

# Show the first few tokens, numbered from 1
n = 10
print(f"Example article tokens (first {n} tokens out of {len(sample_tokens)}):")
for rank, token in enumerate(sample_tokens[:n], start=1):
	print(f"{rank}. '{token}'")

In [None]:
# Load the English stopword list that ships with NLTK
sample_stopwords = nltk.corpus.stopwords.words('english')

# Show the first few stopwords, numbered from 1
n = 10
print(f"Example stopwords (first {n} stopwords out of {len(sample_stopwords)}):")
for rank, token in enumerate(sample_stopwords[:n], start=1):
	print(f"{rank}. '{token}'")

In [None]:
# Remove stopwords. NLTK's English stopword list is all lowercase, so each
# token is lowercased for the comparison only; comparing the raw token would
# let capitalised stopwords ("The", "And", ...) slip through. The tokens that
# are kept retain their original casing.
sample_stopwords_set = set(sample_stopwords)
sample_tokens_no_stopwords = [token for token in sample_tokens if token.lower() not in sample_stopwords_set]

# Print sample tokens without stopwords
n = 10
print(f"Example tokens without stopwords (first {n} tokens out of {len(sample_tokens_no_stopwords)}):")
for i, token in enumerate(sample_tokens_no_stopwords[:n]):
	print(f"{i+1}. '{token}'")

In [None]:
# Apply stemming: every token is lowercased and reduced to its Porter stem
stemmer = nltk.stem.PorterStemmer()
sample_tokens_stemmed = [stemmer.stem(token.lower()) for token in sample_tokens_no_stopwords]

# Print sample stemmed tokens (the label used to say "without stemming",
# which is the opposite of what this list contains)
n = 10
print(f"Example tokens after stemming (first {n} tokens out of {len(sample_tokens_stemmed)}):")
for i, token in enumerate(sample_tokens_stemmed[:n]):
	print(f"{i+1}. '{token}'")

In [None]:
# Drop punctuation: keep only the tokens that consist entirely of
# alphanumeric characters (str.isalnum)
sample_tokens_no_punctuation = list(filter(str.isalnum, sample_tokens_stemmed))

# Show the first few remaining tokens, numbered from 1
n = 10
print(f"Example tokens without punctuation (first {n} tokens out of {len(sample_tokens_no_punctuation)}):")
for rank, token in enumerate(sample_tokens_no_punctuation[:n], start=1):
	print(f"{rank}. '{token}'")

In [None]:
# The alphanumeric-only tokens are the final result of the step-by-step
# processing of the sample article above
sample_tokens_processed = sample_tokens_no_punctuation

In [None]:
# Statistics for the processed sample: total tokens, distinct tokens and the
# ten most frequent tokens
sample_tokens_unique = set(sample_tokens_processed)
sample_token_freq = nltk.FreqDist(sample_tokens_processed)
most_common_tokens = sample_token_freq.most_common(10)
print(f"Number of tokens: {len(sample_tokens_processed)}")
print(f"Number of unique tokens: {len(sample_tokens_unique)}")
print(f"Most common tokens: {most_common_tokens}")

In [None]:
def get_clean_tokens(text: str) -> list:
	"""Turn raw text into a list of normalised tokens.

	The text is tokenized with NLTK, English stopwords are dropped, the
	remaining tokens are lowercased and Porter-stemmed, and finally only
	purely alphanumeric tokens are kept.

	Args:
		text: Raw article text.

	Returns:
		The processed tokens, in the order they occur in the text.
	"""
	stopwords_set = set(nltk.corpus.stopwords.words('english'))
	stemmer = nltk.stem.PorterStemmer()
	# Lowercase before the stopword check: the NLTK stopword list is all
	# lowercase, so comparing the raw token lets capitalised stopwords
	# ("The", "And", ...) through and they end up in the output as "the", "and".
	lowered = (token.lower() for token in nltk.word_tokenize(text))
	stemmed = (stemmer.stem(token) for token in lowered if token not in stopwords_set)
	# isalnum() rejects punctuation tokens as well as any token that contains
	# a non-alphanumeric character
	return [token for token in stemmed if token.isalnum()]

# Same statistics as before, this time produced through get_clean_tokens()
sample_tokens_processed = get_clean_tokens(sample_text)

sample_tokens_unique = set(sample_tokens_processed)
sample_token_freq = nltk.FreqDist(sample_tokens_processed)
most_common_tokens = sample_token_freq.most_common(10)
print(f"Number of tokens: {len(sample_tokens_processed)}")
print(f"Number of unique tokens: {len(sample_tokens_unique)}")
print(f"Most common tokens: {most_common_tokens}")

In [None]:
# Get all tokens for all articles
articles_tokens = {} # key: article id, value: list of tokens
time_start = time.time()

for i, article_id in enumerate(articles):
	# When this iteration starts, `i` articles have been processed, so the
	# average time per article divides the elapsed time by `i` (not i+1).
	# Before the first article is done there is nothing to extrapolate from.
	time_elapsed = time.time() - time_start
	time_per_article = time_elapsed / i if i else 0.0
	time_remaining = time_per_article * (len(articles) - i)
	print(f"{i+1}/{len(articles)}: {article_id}   ; Time remaining: {time_remaining:.2f} seconds          ", end="\r")
	article_text = load_file(os.path.join(path_text_root, f"{article_id}.txt"))
	articles_tokens[article_id] = get_clean_tokens(article_text)
print("")
time_elapsed = time.time() - time_start
print(f"Elapsed time: {time_elapsed:.2f} seconds")

# Print the total number of tokens
total_tokens = sum(len(tokens) for tokens in articles_tokens.values())
print(f"Total number of tokens: {total_tokens}")

In [None]:
# Show again the total that the previous cell computed
print(f"Total number of tokens: {total_tokens}")

In [None]:
# # Iterate over all tokens and count the number of articles that contain each token
# token_article_sets = {} # key: token, value: set of article ids

# for i, article_id in enumerate(articles_tokens):
# 	print(f"{i+1}/{len(articles_tokens)}: {article_id}          ", end="\r")
# 	for token in articles_tokens[article_id]:
# 		if token not in token_article_sets:
# 			token_article_sets[token] = set()
# 		token_article_sets[token].add(article_id)
# print("")

# # Get the number of articles that contain each token
# token_article_counts = {} # key: token, value: number of articles that contain the token
# for token in token_article_sets:
# 	token_article_counts[token] = len(token_article_sets[token])

# # Get a sorted list of tokens by their number of articles
# sorted_tokens = sorted(token_article_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the most common tokens
# n = 10
# print(f"Most common tokens (first {n} tokens out of {len(sorted_tokens)}):")
# for i, (token, count) in enumerate(sorted_tokens[:n]):
# 	print(f"{i+1}. '{token}': {count}")


In [None]:
def process_word(word: str) -> str:
	"""Normalise a single query word: lowercase it and apply the Porter stemmer.

	Args:
		word: The word to normalise.

	Returns:
		The lowercased, stemmed word.

	Raises:
		ValueError: If the stemmed word is empty or contains a character that
			is not alphanumeric. The word is rejected rather than cleaned up.
	"""
	stemmed = nltk.stem.PorterStemmer().stem(word.lower())
	# ValueError is still an Exception, so existing `except Exception`
	# handlers keep working
	if not stemmed.isalnum():
		raise ValueError(f"Word '{stemmed}' is not alphanumeric")
	return stemmed

# Try the word normalisation on one example
# NOTE(review): "shorage" looks like a typo for "shortage" - confirm
sample_word_input = "shorage"
sample_processed_word = process_word(sample_word_input)
print(f"Example processed word: '{sample_processed_word}'")

In [None]:
# TODO: process n-grams (bigrams, trigrams, etc.) - to make "chip shortage" queryable

In [None]:
# # Get article ids that contain a specific word

# def get_article_ids_containing_word(word: str) -> list:
# 	word = process_word(word)
# 	if word not in token_article_sets:
# 		return []
# 	return list(token_article_sets[word])

# sample_word = "semiconductor" # "chip" # "shortage"
# sample_article_ids = get_article_ids_containing_word(sample_word)
# print(f"Word '{sample_word}' appears in {len(sample_article_ids)} articles")

In [None]:
# # From the article ids, get the article objects and use "dateCreated" to count how many articles which contain the word were published each day
# # "dateCreated" is formatted as "2020-01-04 12:00:03"

# def get_article_counts_by_date(article_ids: list) -> dict:
# 	article_counts_by_date = {} # key: date, value: number of articles published on that date
# 	for article_id in article_ids:
# 		date = articles[article_id]["dateCreated"].split(" ")[0]
# 		if date not in article_counts_by_date:
# 			article_counts_by_date[date] = 0
# 		article_counts_by_date[date] += 1
# 	return article_counts_by_date

# sample_article_counts_by_date = get_article_counts_by_date(sample_article_ids)

# # Get a sorted list of dates (oldest to newest)
# sorted_dates = sorted(sample_article_counts_by_date.items(), key=lambda x: x[0])

# # Print the number of articles containing the word for each date
# n = 10
# print(f"Number of articles containing '{sample_word}' (first {n} dates out of {len(sorted_dates)}):")
# for i, (date, count) in enumerate(sorted_dates[:n]):
# 	print(f"{i+1}. '{date}': {count}")

In [None]:
# # Convert the sorted list of article counts to a dataframe

# df = pd.DataFrame(sorted_dates, columns=["date", "count"])
# df["date"] = pd.to_datetime(df["date"])
# df = df.set_index("date")
# df = df.resample("D").sum() # resample to daily frequency
# df = df.sort_index()

# df.head()

In [None]:
# # Plot the number of articles containing the word for each date as a line chart

# # Get SMA (simple moving average) of the number of articles
# window_size = 20
# df["sma"] = df["count"].rolling(window_size).mean()
# df["ema"] = df["count"].ewm(span=window_size).mean()

# fig = go.Figure()

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["count"],
# 	name="Count",
# ))

# # fig.add_trace(go.Scatter(
# # 	x=df.index,
# # 	y=df["sma"],
# # 	name="SMA",
# # ))

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["ema"],
# 	name="EMA",
# ))

# fig.update_layout(
# 	title=f"Number of articles containing '{sample_word}' per day",
# 	xaxis_title="Date",
# 	yaxis_title="Number of articles",
# )

# fig.show()

In [None]:
# get n-grams from articles_tokens
# key: n (size of the n-gram), value: dict of article id -> that article's n-grams
ngrams = {}

def get_ngrams(tokens: list, n: int) -> tuple:
	"""Return the n-grams that nltk.ngrams yields for ``tokens``, as a tuple."""
	ngram_iterator = nltk.ngrams(tokens, n)
	return tuple(ngram_iterator)

# Build the unigrams, bigrams and trigrams of every article
ngram_labels = {1: "unigrams", 2: "bigrams", 3: "trigrams"}

for n, label in ngram_labels.items():
	ngrams[n] = {}
	print(f"Getting {label}...")
	for i, article_id in enumerate(articles_tokens):
		print(f"{i+1}/{len(articles_tokens)}: {article_id}          ", end="\r")
		ngrams[n][article_id] = get_ngrams(articles_tokens[article_id], n)
	print("")

# len(ngrams[n]) is the number of articles (the dict is keyed by article id),
# so the n-gram totals are obtained by adding up the per-article counts
for n, label in ngram_labels.items():
	ngram_total = sum(len(article_ngrams) for article_ngrams in ngrams[n].values())
	print(f"Number of {label}: {ngram_total}")

In [None]:
def process_query(ngram: tuple) -> tuple:
	"""Apply process_word() to every word of ``ngram`` and return the results as a tuple.

	Any exception raised by process_word() propagates to the caller.
	"""
	return tuple(map(process_word, ngram))

# Example: split a phrase into its words and normalise each of them
sample_ngram = tuple("chip shortage".split(" "))
sample_processed_ngram = process_query(sample_ngram)
print(f"'{sample_ngram}' -> '{sample_processed_ngram}'")

In [None]:
# Invert the per-article n-grams into a search index:
# n -> space-joined n-gram -> set of ids of the articles that contain it

ngrams_search = {}

for size, ngrams_by_article in ngrams.items():
	print(f"Processing {size}-grams...")
	index_for_size = ngrams_search[size] = {}
	article_total = len(ngrams_by_article)
	for position, (article_id, article_ngrams) in enumerate(ngrams_by_article.items(), start=1):
		print(f"{position}/{article_total}: {article_id}          ", end="\r")
		for ngram in article_ngrams:
			ngram_joint = " ".join(ngram)
			matching_ids = index_for_size.get(ngram_joint)
			if matching_ids is None:
				# first time this n-gram is seen: start its set of article ids
				matching_ids = index_for_size[ngram_joint] = set()
			matching_ids.add(article_id)
	print("")

print("Done")

In [None]:
# # Clear up some memory - TODO - fix this
# del ngrams
# ngrams = ngrams_search
# del ngrams_search

In [None]:
# NOTE(review): scratch assignment; no other visible cell reads `a` - consider removing
a = 0

In [None]:
def get_article_ids_containing_search_term(search_term: str) -> list:
	"""Look up the ids of the articles that contain a search phrase.

	The phrase is normalised the same way as the indexed article text: it is
	split on whitespace, English stopwords are dropped, and the remaining
	words are lowercased and stemmed via process_query(). The resulting
	n-gram is then looked up in ``ngrams_search``.

	Args:
		search_term: One or more words separated by whitespace.

	Returns:
		A list of matching article ids. It is empty when nothing matches,
		when no index exists for that n-gram size, or when the phrase
		contains no words besides stopwords.

	Raises:
		Exception: Propagated from process_word() when a word is not
			alphanumeric after stemming.
	"""
	# The indexed tokens had stopwords removed before the n-grams were built,
	# so a query that keeps them (e.g. "work from home") could not match
	# lowercase text. split() without an argument also ignores repeated,
	# leading and trailing whitespace, which split(" ") turns into empty words.
	stopwords_set = set(nltk.corpus.stopwords.words('english'))
	words = tuple(word for word in search_term.split() if word.lower() not in stopwords_set)
	n = len(words)
	search_term_ngrams = " ".join(process_query(words))
	return list(ngrams_search.get(n, {}).get(search_term_ngrams, ()))

# Try the search with one example phrase
sample_search_term = "chip shortage"
result_article_ids = get_article_ids_containing_search_term(sample_search_term)
result_count = len(result_article_ids)
print(f"Search term '{sample_search_term}' appears in {result_count} articles")

In [None]:
# Union of the ids of all articles that match at least one search term
# (filled in by the search loop at the end of this cell)
joint_article_ids = set()

# Phrases to look up in the n-gram index; alternative groups of phrases are
# kept commented out around the active one
search_terms = []

# search_terms += [
# 	# "chip",
# 	"chip shortage",
# 	"semiconductor shortage",
# 	"chip supply shortage",
# ]

# Active group: pandemic-related phrases
search_terms += [
	"covid19",
	"covid",
	"coronavirus",
	"pandemic",
	"lockdown",
	# "quarantine",
	"social distancing",
	"wfh",
	"work from home",
]

# search_terms += [
# 	"supply chain",
# 	"supply chain disruption",
# 	"supply chain shortages",
# ]

# search_terms += [
# 	"suez canal",
# 	"container ship",
# 	"ever given",
# 	"evergreen marine",
# ]

# search_terms += [
# 	"low supply",
# 	"high demand",
# 	"high cpu prices",
# 	"high gpu prices",
# 	"high ssd prices",
# 	"high ram prices",
# 	"high memory prices",
# 	"high storage prices",
# 	"high component prices",
# 	"high electronics prices",
# 	"high computer prices",
# 	"high laptop prices",
# 	"high pc prices",
# 	"high smartphone prices",
# ]

# search_terms += [
# 	"global chip shortage",
# 	"global semiconductor shortage",
# 	"global chip supply shortage",
# 	"chip shortage",
# 	"semiconductor shortage",
# 	"chip supply shortage",
# 	"supply chain",
# 	"supply chain disruption",
# 	"supply chain shortages",
# 	"rare earth minerals",
# 	"rare earth metals",
# 	"rare gas",
# 	"extreme weather",
# 	"trade war",
# ]

# search_terms = [
# 	"gdpr",
# 	"european commission",
# ]

# Run every search term against the index, remember the hits per term and
# accumulate the union of all matching article ids
article_ids_by_search_term = {}
for search_term in search_terms:
	article_ids = get_article_ids_containing_search_term(search_term)
	print(f"Search term '{search_term}' appears in {len(article_ids)} articles")
	article_ids_by_search_term[search_term] = article_ids
	joint_article_ids.update(article_ids)
print("")

joint_count = len(joint_article_ids)
joint_percentage = joint_count / len(articles) * 100
print(f"Number of articles containing any of the search terms: {joint_count} ({joint_percentage:.2f}%)")

In [None]:
def get_df_of_article_counts_by_date(
	article_ids: set,
	start_date: str = "2019-01-01",
	end_date: str = "2023-12-31",
	window_size: int = 20,
	article_index: dict = None,
) -> pd.DataFrame:
	"""Count the given articles per calendar day and add moving averages.

	Args:
		article_ids: Ids of the articles to count. May be empty, in which case
			every day gets a count of 0.
		start_date: First day of the returned range (inclusive).
		end_date: Last day of the returned range (inclusive).
		window_size: Window of the simple moving average and span of the
			exponential moving average.
		article_index: Mapping of article id -> metadata with a "dateCreated"
			value formatted like "2020-01-04 12:00:03". Defaults to the
			notebook-level ``articles`` index.

	Returns:
		A DataFrame indexed by "date" with one row per day from ``start_date``
		to ``end_date`` and the columns "count", "sma" and "ema". Articles
		created outside that range are not counted.

	Raises:
		KeyError: If an id is missing from the index or has no "dateCreated".
		ValueError: If a "dateCreated" value does not start with a
			"%Y-%m-%d" date.
	"""
	if article_index is None:
		article_index = articles

	# Only the date part of "dateCreated" is relevant; the time of day is dropped
	days = [
		datetime.strptime(article_index[article_id]["dateCreated"].split(" ")[0], "%Y-%m-%d")
		for article_id in article_ids
	]

	all_dates = pd.date_range(start=start_date, end=end_date)
	if days:
		# One entry per article, summed per day; reindexing onto the full range
		# fills the days without articles with 0 and drops out-of-range days
		counts = pd.Series(1, index=pd.DatetimeIndex(days)).groupby(level=0).sum()
		counts = counts.reindex(all_dates, fill_value=0)
	else:
		# Without any article there is nothing to group: every day counts 0
		counts = pd.Series(0, index=all_dates)

	df = pd.DataFrame({"count": counts.astype("int64")})
	df.index.name = "date"

	# Get SMA
	df["sma"] = df["count"].rolling(window_size).mean()

	# Get EMA
	df["ema"] = df["count"].ewm(span=window_size).mean()

	# Uncomment these two to aggregate per month instead of per day
	# (daily data helps to explore why some days have so many articles)
	# resample to other frequencies
	# df = df.resample("M").sum()
	# remove day component from index
	# df.index = df.index.map(lambda x: x.strftime("%Y-%m"))

	return df

# Daily counts of the articles that match any of the search terms
df = get_df_of_article_counts_by_date(joint_article_ids)

# Last expression of the cell: shows the final rows of the frame
df.tail()

In [None]:
# Plot the number of articles containing any of the search terms for each
# date as a bar chart

fig = go.Figure()

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["count"],
# 	name="Count",
# ))


fig.add_trace(go.Bar(
	x=df.index,
	y=df["count"],
	name="Count",
))

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["sma"],
# 	name="SMA",
# ))

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["ema"],
# 	name="EMA",
# ))

# The frame holds one row per day (the monthly resampling in
# get_df_of_article_counts_by_date is commented out), so the title says
# "per day" rather than "per month"
fig.update_layout(
	title="Number of articles containing any of the search terms per day",
	xaxis_title="Date",
	yaxis_title="Number of articles",
)

fig.show()

In [None]:
# TODO weight by the number of daily articles ?

# TODO: plot multiple search terms on the same chart

In [None]:
# For every search term, print how many articles matched it
print("Search terms and their counts:")
for search_term in search_terms:
	matched_ids = article_ids_by_search_term[search_term]
	print(f"'{search_term}': {len(matched_ids)}")

In [None]:
def print_search_term_counts_for_date(article_ids_by_search_term: dict, date: str):
	"""Print, for each search term, how many of its articles were created on ``date``.

	Args:
		article_ids_by_search_term: Mapping of search term -> article ids
			matching that term.
		date: Day to count, written like the date part of "dateCreated"
			(for example "2022-03-04").

	The article metadata is read from the notebook-level ``articles`` index.
	"""
	print(f"Search term counts for date '{date}':")
	# Iterate the mapping that was passed in instead of the global
	# `search_terms`, so the output always matches the argument
	for search_term, article_ids in article_ids_by_search_term.items():
		count = sum(
			1
			for article_id in article_ids
			if articles[article_id]["dateCreated"].split(" ")[0] == date
		)
		print(f"'{search_term}': {count}")

# Print the per-term article counts for one specific date (2022-03-04)
print_search_term_counts_for_date(article_ids_by_search_term, "2022-03-04")

In [None]:
def print_articles_for_search_term_date(search_term: str, date: str):
	"""Print the id and title of every article matching ``search_term`` that was created on ``date``.

	Reads the notebook-level ``article_ids_by_search_term`` and ``articles``;
	``date`` is compared with the date part of each article's "dateCreated".
	"""
	matching_ids = []
	for candidate_id in article_ids_by_search_term[search_term]:
		created_day = articles[candidate_id]["dateCreated"].split(" ")[0]
		if created_day == date:
			matching_ids.append(candidate_id)
	print(f"Search term '{search_term}' appears in {len(matching_ids)} articles on {date}")
	for matching_id in matching_ids:
		title = articles[matching_id]['title']
		print(f"- {matching_id}: {title}")

# List the articles matching "coronavirus" that were created on 2022-03-04
print_articles_for_search_term_date("coronavirus", "2022-03-04")

In [None]:
# Print the full text of one specific article for manual inspection
article_id = "576119ec-c911-41e9-bc91-1b4ea4ffb3a9"
article_text = load_file(os.path.join(path_text_root, article_id + ".txt"))
print(article_text)

In [None]:
# TODO filter out paragraphs that start with "see also"