In [None]:
import os
import re
import json
from bs4 import BeautifulSoup as bs

In [None]:
# Locations of the scraped CNET data, relative to the working directory.
path_index = "data/scraped/cnet/index_articles.json"  # JSON article index
path_data_root = "data/scraped/cnet/articles"  # root folder for article data
path_html = os.path.join(path_data_root, "html")  # folder with the downloaded html files

In [None]:
# Load the article index. The context manager guarantees the file handle is
# closed, which a bare open() passed straight to json.load() does not.
# NOTE(review): assumes the index file is UTF-8 encoded — confirm against the scraper.
with open(path_index, encoding="utf-8") as f:
	index = json.load(f)
print(f"Got {len(index['articles'])} articles in index")

In [None]:
# Print number of downloaded html files.
# os.listdir() returns entries in arbitrary order; sorting makes the listing
# (and the sample picked from it via html_files[0]) reproducible across runs.
html_files = sorted(os.listdir(path_html))
print(f"Got {len(html_files)} html files")

In [None]:
# Use the first listed file as the sample article
# (example id: c5063326-855f-4179-a640-84e53d0204a6).
sample_filename = html_files[0]
path_sample_html = os.path.join(path_html, sample_filename)
print(f"Sample html file path: {path_sample_html}")

# The article id is the part of the file name before the first dot.
article_id = sample_filename.partition(".")[0]
print(f"Article id: '{article_id}'")

# Look up the index entry and drop its "image" key when present
# (this modifies the entry inside `index` in place).
article = index["articles"][article_id]
article.pop("image", None)

In [None]:
# Build the public URL of the sample article from its slug.
article_slug = article["slug"]
article_url = f"https://www.cnet.com/news/{article_slug}"
print(f"Visit article: {article_url}")

In [None]:
# Pretty-print the index entry of the sample article as indented JSON.
article_json = json.dumps(article, indent=2)
print(f"Article (without image key): {article_json}")

In [None]:
# Read the sample article's raw HTML into memory.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the scraper saved the pages as UTF-8 — confirm.
with open(path_sample_html, "r", encoding="utf-8") as f:
	article_data = f.read()
print(f"Article data length: {len(article_data)}")

In [None]:
# print(article_data)

In [None]:
# Parse the raw HTML using the "html.parser" backend.
soup = bs(markup=article_data, features="html.parser")

In [None]:
def remove_tags(soup, tagname):
	"""Detach every ``tagname`` element from ``soup`` in place.

	Args:
		soup: Parsed document that is searched for matching elements.
		tagname: Name of the tag to strip, e.g. ``"script"``.

	Returns:
		None. ``soup`` itself is modified.
	"""
	matches = soup.find_all([tagname])
	for node in matches:
		node.extract()

# Strip script, style, SVG-related and figure/picture elements from the
# parsed document, one tag name at a time.
for unwanted_tag in ("script", "style", "symbol", "svg", "path", "figure", "picture"):
	remove_tags(soup, unwanted_tag)

remaining_length = len(str(soup))
print("Article length after removing tags: ", remaining_length)

In [None]:
# Print formatted html
# print(soup.prettify())

In [None]:
# Compare the page <title> (with the " - CNET" text removed) against the
# title stored in the index entry.
title_tag = soup.find("title")
title = title_tag.text.replace(" - CNET", "").strip()
print(
	f"Title from article: '{title}'",
	f"Title from index:   '{article['title']}'",
	sep="\n",
)

In [None]:
# Locate the article wrapper: the div whose id is "page-article-{article_id}".
article_div_id = f"page-article-{article_id}"
article_div = soup.find("div", attrs={"id": article_div_id})
# print(article_div.text)

In [None]:
# Get the div with class="c-pageArticle_body" inside the article wrapper.
article_body_div = article_div.find("div", class_="c-pageArticle_body")
# print(article_body_div.text)

In [None]:
def _has_shortcode_class(css_class):
	"""Return a truthy value when ``css_class`` contains "c-ShortcodeContent"."""
	return css_class and "c-ShortcodeContent" in css_class


# Get the first div in the article body whose class contains "c-ShortcodeContent".
article_content_div = article_body_div.find("div", {"class": _has_shortcode_class})
# print(article_content_div.text)

In [None]:
# Collect every <p> tag under the content div, at any nesting depth
# (find_all searches recursively by default).
# NOTE(review): pass recursive=False if only direct children are intended.
paragraphs = article_content_div.find_all("p")
paragraph_count = len(paragraphs)
print(f"Got {paragraph_count} paragraphs")

In [None]:
def process_paragraph(p):
	"""Return the text of ``p`` with whitespace normalised.

	Leading and trailing whitespace is dropped and every internal run of
	whitespace (including newlines and non-breaking spaces) is collapsed
	into a single space.

	Args:
		p: Object exposing the paragraph text through a ``text`` attribute.

	Returns:
		The cleaned text as a single-line string.
	"""
	# str.split() with no argument splits on runs of whitespace and discards
	# empty leading/trailing pieces, so re-joining with one space normalises
	# the text in a single pass.
	words = p.text.split()
	return " ".join(words)


# Clean every paragraph and print them separated by blank lines.
paragraphs_processed = [process_paragraph(paragraph) for paragraph in paragraphs]
print(*paragraphs_processed, sep="\n\n")

In [None]:
# Count whitespace-separated words across all cleaned paragraphs.
word_count = sum(len(text.split()) for text in paragraphs_processed)
print(f"Word count: {word_count}")