# Text mining articles - exploration

In [None]:
# When developing it's useful to work with a small subset of articles
# as processing all 70k of them takes a while and consumes more than 11 GB of RAM

# Suggested VM RAM: 20 GB without sampling articles
#                   14 GB is consumed by this process alone

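# To sample, run the following after `article_filenames` has been built
# (it needs `random` and `article_filenames`, both of which are defined in later cells):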
# n = 10000
# article_filenames = random.sample(list(article_filenames.keys()), n)
# print(f"Sampling {n} articles")

In [None]:
import os
import json
import nltk
import time
import random
from datetime import datetime
import pandas as pd
from plotly import graph_objects as go

nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# JSON index of the scraped articles; its "articles" entry maps article id -> metadata
path_index = "data/scraped/cnet/index_articles.json"
# Directory with the parsed article texts, one "<article id>.txt" file per article
path_text_root = "data/scraped/cnet/articles/parsed"
# JSON file mapping article id -> tokens of that article
path_parsed_tokens = "data/scraped/cnet/parsed-articles-index.json"
# Directory the generated plots are written to
path_output_root = "data/analysis/text-mining"

In [None]:
# Load index
# (the context manager closes the file handle as soon as the JSON has been parsed)
with open(path_index, "r") as index_file:
	index = json.load(index_file)
articles = index["articles"] # key: article id, value: article metadata

# Print the number of articles
print(f"Number of articles in the index: {len(articles)}")

In [None]:
# Index the article files by name; a dictionary keeps the later
# "is there a file for this article?" lookups cheap
article_filenames = dict.fromkeys(os.listdir(path_text_root), True)
print(f"Number of article files: {len(article_filenames)}")

In [None]:
# Drop every indexed article that has no "<article id>.txt" file,
# remembering the dropped metadata in removed_article_ids
removed_article_ids = {}
# Iterate over a snapshot because entries are deleted from `articles` inside the loop
for article_id, article_metadata in list(articles.items()):
	if f"{article_id}.txt" in article_filenames:
		continue
	removed_article_ids[article_id] = article_metadata
	del articles[article_id]

print(f"Removed {len(removed_article_ids)} articles that are not in article_files")
print(f"Remaining number of articles: {len(articles)}")


In [None]:
# Get all tokens for all articles
print(f"Loading tokens from {path_parsed_tokens}")
time_start = time.time()
# The context manager closes the file handle as soon as the JSON has been parsed
with open(path_parsed_tokens, "r") as tokens_file:
	articles_tokens = json.load(tokens_file) # key: article id, value: tokens of the article
time_elapsed = time.time() - time_start
print(f"Elapsed time: {time_elapsed:.2f} seconds")

# Print the total number of tokens
total_tokens = sum(len(tokens) for tokens in articles_tokens.values())
print(f"Total number of tokens: {total_tokens}")

In [None]:
def process_word(word: str) -> str:
	"""Normalize a single query word: lower-case it and reduce it to its Porter stem.

	Args:
		word: the raw word, e.g. "shortage".

	Returns:
		The lower-cased, stemmed word.

	Raises:
		ValueError: if the stemmed word is empty or contains anything other than
			letters and digits (punctuation is rejected, not stripped).
	"""
	# Stem
	stemmer = nltk.stem.PorterStemmer()
	word = stemmer.stem(word.lower())
	# Reject punctuation and empty words ("".isalnum() is False as well)
	if not word.isalnum():
		raise ValueError(f"Word '{word}' is not alphanumeric")
	return word

# Quick sanity check of the normalization on a single word
sample_processed_word = process_word("shortage")
print(f"Example processed word: '{sample_processed_word}'")

In [None]:
# get n-grams from articles_tokens
# Result: ngrams[n][article_id] is a tuple with all n-grams (as n-tuples) of that article
time_start = time.time()
ngrams = {}

def get_ngrams(tokens: list, n: int) -> tuple:
	"""Return all n-grams of `tokens` (as produced by nltk.ngrams), materialized into a tuple."""
	return tuple(nltk.ngrams(tokens, n))

# The same pass is needed for every n-gram size, so loop over the sizes
# instead of repeating the loop once per size
ngram_names = {1: "unigrams", 2: "bigrams", 3: "trigrams"}
article_count = len(articles_tokens)
for n, ngram_name in ngram_names.items():
	ngrams[n] = {}
	print(f"Getting {ngram_name}...")
	for i, (article_id, tokens) in enumerate(articles_tokens.items()):
		# "\r" keeps the progress on one line by overwriting the previous output
		print(f"{i+1}/{article_count}: {article_id}          ", end="\r")
		ngrams[n][article_id] = get_ngrams(tokens, n)
	print("")

time_elapsed = time.time() - time_start
print(f"Elapsed time: {time_elapsed:.2f} seconds")
print("")

In [None]:
def get_count_of_ngrams(ngrams: dict) -> dict:
	"""Count the n-grams per n-gram size.

	Args:
		ngrams: mapping n -> {article id -> collection of n-grams of that article}.

	Returns:
		Mapping n -> total number of n-grams summed over all articles.
	"""
	return {
		n: sum(len(article_ngrams) for article_ngrams in ngrams_by_article.values())
		for n, ngrams_by_article in ngrams.items()
	}

# Report the total number of n-grams for each size
ngram_counts = get_count_of_ngrams(ngrams)
for ngram_size, ngram_label in ((1, "unigrams"), (2, "bigrams"), (3, "trigrams")):
	print(f"Number of {ngram_label}: {ngram_counts[ngram_size]}")

In [None]:
def process_query(ngram: tuple) -> tuple:
	"""Apply process_word to every word of `ngram` and return the results as a tuple (same order)."""
	return tuple(map(process_word, ngram))

# Example: normalize a two-word query
sample_ngram = tuple("chip shortage".split(" "))
sample_processed_ngram = process_query(sample_ngram)
print(f"'{sample_ngram}' -> '{sample_processed_ngram}'")

In [None]:
# Build an inverted index:
# ngrams_search[n][space-joined n-gram] -> set of ids of the articles containing it

ngrams_search = {}
time_start = time.time()

for n, ngrams_by_article in ngrams.items():
	print(f"Processing {n}-grams...")
	postings = ngrams_search[n] = {}
	article_total = len(ngrams_by_article)
	for i, (article_id, article_ngrams) in enumerate(ngrams_by_article.items()):
		# "\r" keeps the progress on one line by overwriting the previous output
		print(f"{i+1}/{article_total}: {article_id}          ", end="\r")
		for ngram in article_ngrams:
			# The first occurrence of an n-gram creates its (initially empty) set of article ids
			postings.setdefault(" ".join(ngram), set()).add(article_id)
	print("")

time_elapsed = time.time() - time_start
print(f"Elapsed time: {time_elapsed:.2f} seconds")

In [None]:
def get_article_ids_containing_search_term(search_term: str) -> list:
	"""Return the ids of all articles that contain `search_term`.

	The term is split into words, each word is normalized with process_query,
	and the resulting n-gram is looked up in the ngrams_search index.

	Args:
		search_term: one or more whitespace-separated words, e.g. "chip shortage".

	Returns:
		A list of article ids (in no particular order). The list is empty when the
		term is empty, when no index exists for its word count, or when the n-gram
		does not occur in any article.

	Raises:
		Whatever process_query raises for words it cannot normalize.
	"""
	# split() without an argument ignores leading, trailing and repeated whitespace,
	# so "chip  shortage " is handled like "chip shortage" instead of yielding empty words
	search_term_tuple = tuple(search_term.split())
	n = len(search_term_tuple)
	# Process
	search_term_ngrams = " ".join(process_query(search_term_tuple))
	# A missing n-gram size or a missing n-gram both yield an empty result
	return list(ngrams_search.get(n, {}).get(search_term_ngrams, ()))

# Example lookup of a two-word search term in the index
sample_search_term = "chip shortage"
result_article_ids = get_article_ids_containing_search_term(sample_search_term)
print(f"Search term '{sample_search_term}' appears in {len(result_article_ids)} articles")

In [None]:
# Ids of all articles matching at least one search term (filled in further below)
joint_article_ids = set()

# Search terms to look up. The list is assembled from topic groups;
# (un)comment a `search_terms += [...]` group to switch that topic off/on.
search_terms = []

# Topic: chip shortage
# search_terms += [
# 	# "chip",
# 	"chip shortage",
# 	"chip crisis",
# 	"semiconductor shortage",
# 	"chip supply shortage",
# 	"microchip shortage",
# 	"chip scarcity",
# 	"automotive chip shortage",
# ]

# Topic: COVID-19 pandemic (currently active)
search_terms += [
	"covid19",
	"covid",
	"coronavirus",
	"pandemic",
	"lockdown",
	"quarantine",
	"social distancing",
	"wfh",
	"work from home",
	"sars"
]

# search_terms += [
# 	"supply chain",
# 	"supply chain disruption",
# 	"supply chain shortages",
# ]

# search_terms += [
# 	"suez canal",
# 	"container ship",
# 	"ever given",
# 	"evergreen marine",
# ]

# search_terms += [
# 	"low supply",
# 	"high demand",
# 	"high cpu prices",
# 	"high gpu prices",
# 	"high ssd prices",
# 	"high ram prices",
# 	"high memory prices",
# 	"high storage prices",
# 	"high component prices",
# 	"high electronics prices",
# 	"high computer prices",
# 	"high laptop prices",
# 	"high pc prices",
# 	"high smartphone prices",
# ]

# Extended
# search_terms += [
# 	"low supply",
# 	"high demand",
# 	"high prices",
# 	"price hike",
# 	"gpu scalping",
# 	"scarcity",
# ]
# search_terms += [
# 	"global chip shortage",
# 	"global semiconductor shortage",
# 	"global chip supply shortage",
# 	"chip shortage",
# 	"semiconductor shortage",
# 	"chip supply shortage",
# 	"supply chain",
# 	"supply chain disruption",
# 	"supply chain shortages",
# 	"rare earth minerals",
# 	"rare earth metals",
# 	"rare gas",
# 	"extreme weather",
# 	"trade war",
# 	"taiwan",
# 	"tsmc",
# 	"ultrapure water",
# 	"neon gas",
# 	"ukraine war",
# 	"car prices",
# ]

# search_terms += [
# 	"gdpr",
# 	"european commission",
# ]

# search_terms += [
# 	"crypto",
# 	"crypto price",
# 	"cryptocurrency",
# 	"bitcoin",
# 	"ethereum",
# 	"blockchain",
# 	"cryptomining",
# 	"crypto mining",
# 	"cryptocurrency mining",
# 	"scalping",
# 	"scalper",
# 	"nft",
# 	"defi",
# 	"decentralized finance",
# 	"stablecoin",
# 	"crypto wallet",
# 	"crypto miner",
# ]


# Look up every search term and collect the union of all matching article ids
article_ids_by_search_term = {}
for search_term in search_terms:
	article_ids = get_article_ids_containing_search_term(search_term)
	article_ids_by_search_term[search_term] = article_ids
	joint_article_ids.update(article_ids)

# Print search terms and their counts
print("Search terms and their matching article counts:")
sorted_search_terms = sorted(search_terms, key=lambda x: len(article_ids_by_search_term[x]), reverse=True)
total_search_term_counts = 0
for search_term in sorted_search_terms:
	print(f"- '{search_term}': {len(article_ids_by_search_term[search_term])}")
	total_search_term_counts += len(article_ids_by_search_term[search_term])
print("")

# Guard the ratios: an empty index or a search without any match must not
# abort the cell with a ZeroDivisionError
matching_article_count = len(joint_article_ids)
matching_article_share = matching_article_count / len(articles) * 100 if articles else 0.0
average_terms_per_article = total_search_term_counts / matching_article_count if matching_article_count else 0.0
print(f"Number of articles containing any of the search terms: {matching_article_count} ({matching_article_share:.2f}% of {len(articles)} articles)")
print(f"Total count of search term matches: {total_search_term_counts}")
print(f"Average search terms per matching article: {average_terms_per_article:.2f}")

In [None]:
# Window (in days) of the moving averages computed below
# window_size = 20
window_size = 30 * 3

def get_df_of_article_counts_by_date(article_ids: set, start_date: str = "2019-01-01", end_date: str = "2023-12-31", frequency: str = "W") -> pd.DataFrame:
	"""Build a time series with the number of given articles created per period.

	The creation date of every article is read from the module-level `articles`
	metadata (the part of "dateCreated" before the first space, "YYYY-MM-DD").
	Daily counts are computed for every day from `start_date` to `end_date`
	(days without articles count as 0, articles outside the range are ignored),
	moving averages over `window_size` days are added, and the result is summed
	per `frequency` period.

	Args:
		article_ids: ids of the articles to count; may be empty.
		start_date: first day of the series (inclusive).
		end_date: last day of the series (inclusive).
		frequency: pandas offset alias to aggregate to, e.g. "W" (weekly),
			"M" (monthly) or "D" (keep daily data).

	Returns:
		DataFrame indexed by "date" with the columns "count", "sma" and "ema".
		Note that all three columns are summed per period, i.e. "sma" and "ema"
		hold the per-period sums of the daily moving averages.
	"""
	dates = [datetime.strptime(articles[article_id]["dateCreated"].split(" ")[0], "%Y-%m-%d") for article_id in article_ids]

	# Number of articles per distinct day; pd.to_datetime yields a datetime-typed
	# (possibly empty) index, so an empty `article_ids` simply gives an all-zero series
	daily_counts = pd.Series(pd.to_datetime(dates)).value_counts()

	# Restrict to the requested range and add the missing days with a count of 0
	all_dates = pd.date_range(start=start_date, end=end_date)
	daily_counts = daily_counts.reindex(all_dates, fill_value=0)

	df = pd.DataFrame({"count": daily_counts})
	df.index.name = "date"

	# Get SMA
	df["sma"] = df["count"].rolling(window_size).mean()

	# Get EMA
	df["ema"] = df["count"].ewm(span=window_size).mean()

	# Aggregate the daily data (pass frequency="D" to keep it daily, e.g. to explore
	# in depth why some days have so many articles)
	df = df.resample(frequency).sum()

	return df

# Time series for all articles matching at least one search term
df = get_df_of_article_counts_by_date(joint_article_ids)

# df.head()
# Show the last rows (rendered as the output of the notebook cell)
df.tail()

In [None]:
# Plot the number of matching articles per row of df (one row per resampled period) as a bar chart

fig = go.Figure()

# Line-chart alternative to the bar chart below:
# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["count"],
# 	name="Count",
# ))


fig.add_trace(go.Bar(
	x=df.index,
	y=df["count"],
	name="Count",
))

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["sma"],
# 	name=f"SMA-{window_size}",
# ))

# fig.add_trace(go.Scatter(
# 	x=df.index,
# 	y=df["ema"],
# 	name=f"EMA-{window_size}",
# ))

def break_up_title(title: str, characters_per_line: int) -> str:
	"""Wrap `title` at word boundaries and join the lines with "<br>".

	Words are separated at single spaces and are never split, so a word longer
	than `characters_per_line` ends up on a line of its own that exceeds the limit.

	Args:
		title: the text to wrap.
		characters_per_line: maximum line length in characters.

	Returns:
		The wrapped text with "<br>" between lines; "" for an empty title.
	"""
	lines = []
	line = ""
	for word in title.split(" "):
		if not line:
			# A word always starts the line, even when it is longer than the limit;
			# otherwise an empty line would be emitted in front of it
			line = word
		elif len(line) + 1 + len(word) <= characters_per_line:
			# +1 accounts for the separating space
			line += " " + word
		else:
			lines.append(line)
			line = word
	if line:
		lines.append(line)
	return "<br>".join(lines)

fig.update_layout(
	# title=f"Weekly number of articles containing any of the search terms {search_terms}",
	# The list of search terms makes the title long, so it is wrapped at 100 characters
	title = break_up_title(f"Weekly number of articles containing any of the search terms: {search_terms}", 100),
	xaxis_title="Date",
	yaxis_title="Count",
)

# Figure size: 1000 x 400
fig.update_layout(
	width=1000,
	height=400,
	# height=600,
)

# Margins
fig.update_layout(
	margin=dict(l=50, r=30, t=70, b=10),
	# margin=dict(l=50, r=30, t=270, b=10),
)

fig.show()

plot_name = "article-counts-weekly-covid"
# plot_name = "article-counts-monthly-covid"
# plot_name = "article-counts-weekly-chip-shortage"
# plot_name = "article-counts-monthly-chip-shortage"
# plot_name = "article-counts-weekly-chip-shortage-big"
# plot_name = "article-counts-monthly-chip-shortage-big"
# plot_name = "article-counts-monthly-crypto"
# Make sure the output directory exists, otherwise the export below fails
os.makedirs(path_output_root, exist_ok=True)
# save as png with 3x scale
fig.write_image(os.path.join(path_output_root, f"{plot_name}.png"), scale=3)

In [None]:
# TODO weight by the number of daily articles ?

# TODO: plot multiple search terms on the same chart

In [None]:
# Print search terms and their counts
print("Search terms and their matching article counts:")
sorted_search_terms = sorted(search_terms, key=lambda x: len(article_ids_by_search_term[x]), reverse=True)
total_search_term_counts = 0
for search_term in sorted_search_terms:
	print(f"- '{search_term}': {len(article_ids_by_search_term[search_term])}")
	total_search_term_counts += len(article_ids_by_search_term[search_term])
print("")

# Guard the ratios: an empty index or a search without any match must not
# abort the cell with a ZeroDivisionError
matching_article_count = len(joint_article_ids)
matching_article_share = matching_article_count / len(articles) * 100 if articles else 0.0
average_terms_per_article = total_search_term_counts / matching_article_count if matching_article_count else 0.0
print(f"Number of articles containing any of the search terms: {matching_article_count} ({matching_article_share:.2f}% of {len(articles)} articles)")
print(f"Total count of search term matches: {total_search_term_counts}")
print(f"Average search terms per matching article: {average_terms_per_article:.2f}")

In [None]:
def print_search_term_counts_for_date(article_ids_by_search_term: dict, date: str):
	"""Print, per search term, how many of its matching articles were created on `date`.

	The terms are taken from the keys of `article_ids_by_search_term` and printed
	in descending order of their count, followed by the sum of all counts.
	The creation date of an article is read from the module-level `articles`
	metadata (the part of "dateCreated" before the first space).

	Args:
		article_ids_by_search_term: mapping search term -> ids of the matching articles.
		date: the day to report, formatted like the date part of "dateCreated"
			(e.g. "2022-03-04").
	"""
	print(f"Search terms and their matching article counts for date '{date}':")
	# Use the terms of the given mapping rather than the global `search_terms`,
	# so that the function reports exactly what it was handed
	search_term_counts = {
		search_term: sum(1 for article_id in article_ids if articles[article_id]["dateCreated"].split(" ")[0] == date)
		for search_term, article_ids in article_ids_by_search_term.items()
	}
	# The sort is stable, so terms with equal counts keep the order of the mapping
	for search_term, count in sorted(search_term_counts.items(), key=lambda item: item[1], reverse=True):
		print(f"- '{search_term}': {count}")
	print("")
	print(f"Total search term counts for date '{date}': {sum(search_term_counts.values())}")
	

# Print search term counts for a specific date (format: YYYY-MM-DD)
print_search_term_counts_for_date(article_ids_by_search_term, "2022-03-04")

In [None]:
def print_articles_for_search_term_date(search_term: str, date: str):
	"""Print id and title of every article matching `search_term` that was created on `date`.

	Reads the module-level `article_ids_by_search_term` and `articles`; `date` is
	compared with the part of "dateCreated" before the first space.
	"""
	matching_ids = []
	for candidate_id in article_ids_by_search_term[search_term]:
		created_on = articles[candidate_id]["dateCreated"].split(" ")[0]
		if created_on == date:
			matching_ids.append(candidate_id)
	print(f"Search term '{search_term}' appears in {len(matching_ids)} articles on {date}")
	for matching_id in matching_ids:
		title = articles[matching_id]['title']
		print(f"- {matching_id}: {title}")

# List the articles matching a single search term on a specific date
print_articles_for_search_term_date("coronavirus", "2022-03-04")

In [None]:
def load_file(filepath: str) -> str:
	"""Read the text file at `filepath` and return its entire content as one string."""
	with open(filepath, mode="r") as handle:
		contents = handle.read()
	return contents

In [None]:
# Inspect a single article: print its text followed by its tokens
article_id = "0f8b1479-04b7-467c-ad94-ed843b90505f"

article_text = load_file(os.path.join(path_text_root, f"{article_id}.txt"))
# article_text = article_text.replace("\n\n", "\n")
# Flatten the text to a single line so that it can be re-wrapped below
article_text = article_text.replace("\n", " ")

# NOTE: despite its name this is a limit in characters, not words -
# it is passed as the `characters_per_line` argument of break_up_title
words_per_line = 50
# break_up_title joins the lines with "<br>"; turn those into real newlines for the console
print("Article text:")
print(break_up_title(article_text, words_per_line).replace("<br>", "\n"))
print("\n\n\n")
print("Article tokens:")
print(break_up_title(" ".join(articles_tokens[article_id]), words_per_line).replace("<br>", "\n"))