The goal of this milestone is for you to scrape data from an actual website and start to analyze the results. The project should be completed individually in Python, without the assistance of any artificial intelligence.

Web Scraper - Take all of the text from the top 20 articles coming from one of the following web news sources:

espn.com
cnn.com
goodblacknews.org
huffingtonpost.com
ign.com
theonion.com

Statistics - Print a list of the titles of the web pages you are pulling text from and then print the mean and median number of words for all 20 articles.

Visualize - Create a Word Cloud using your list of most common words that shows the top 50 (or up to 200) words and a bar chart to show the relative frequencies of the top 15 most frequent words. (Note: these should be words that the average viewer of the website would see, not code from the html)

In [1]:
# This program uses the first four variables to scrape article text from randomly selected articles linked on the main page
# IDS Project 1 Milestone 2
# Liam Zalubas
# help coming from https://lxml.de/parsing.html#parsers for lxml documentation
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import requests
import string
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download('stopwords')

# Configuration for the whole script.
# articlesToUse starts at 0 so the prompt later in the script asks the user for
# a count; set it to a positive number here to skip that prompt.
articlesToUse:int = 0
wordsForCloud:int = 50
wordsForGraph:int = 15
base_url = "https://www.cnn.com"

# Download the front page. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() stops us from scraping an HTTP
# error page as if it were the real front page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

# Keep a copy of the raw bytes on disk so the page can be inspected later.
with open("main_page.html", "wb") as htmlFile:
    htmlFile.write(response.content)

# Read the saved page back as text. The explicit utf-8 encoding matters: without
# it the platform default encoding is used and decoding can fail.
# The "with" block guarantees the file handle is closed again.
with open("main_page.html", "r", encoding="utf-8") as fileWrap:
    # the whole page as one single string
    fileText: str = fileWrap.read()

# turning the downloaded file into a BS4 object
bsText = bs(fileText, "html.parser")

# Collect every link on the main page that looks like an article.
# Pattern: site-relative paths that start with "/" and end with ".html".
articles = []
# The same story is usually linked several times (headline, image, teaser);
# remember what we already stored so no article can be sampled twice.
seenLinks = set()

for link in bsText.find_all('a'):
    currentLink = link.get('href')
    # <a> tags without an href give None; skip them instead of relying on an exception
    if not isinstance(currentLink, str):
        continue
    # "//host/path" is protocol-relative, not a path on this site; prefixing
    # base_url to it would produce a broken URL
    if currentLink.startswith('//'):
        continue
    if currentLink.startswith('/') and currentLink.endswith('.html'):
        fullLink = base_url + currentLink
        if fullLink not in seenLinks:
            seenLinks.add(fullLink)
            articles.append(fullLink)

# Ask how many articles to process until the answer is between 1 and the number
# of links found. With zero links no answer can ever be valid, so stop here
# instead of prompting forever.
if not articles:
    raise SystemExit("No article links were found on " + base_url)

while (articlesToUse < 1 or articlesToUse > len(articles)):
    try:
        articlesToUse = int(input("How many articles do you want to process?\n"))
    except ValueError:
        # non-numeric input: explain and ask again rather than crashing
        print("Please enter a whole number between 1 and", len(articles))

# Shared state for the analysis of the randomly picked articles.
# word -> how many times it was seen across all processed articles
wordsList = dict()
# page titles, in the order the articles are processed
titleArray = list()
# one word counter per article, used later for the statistics
articleWordCounts = [0] * articlesToUse
# translation table that deletes every ASCII punctuation character
punc = {ord(mark): None for mark in string.punctuation}
# NLTK's English stop words plus two extra words we never want to count
stop_words = set(stopwords.words("english")) | {"said", "the"}
# typographic symbols that string.punctuation does not cover
unwantedThings = {'“', '”', '–', '’s', '—'}

# method to scrub words and pass them to the adder method
def cleanWord(thisWord:str, articleNum:int):
    """Normalize one raw token and count the word(s) it contains.

    The token is dropped entirely when it contains any digit. Otherwise it is
    casefolded, stripped of ASCII punctuation (via ``punc``) and every entry
    of ``unwantedThings`` is replaced by a space. Each word that remains is
    passed to ``incrementWord``.

    Args:
        thisWord: the raw token taken from the article text.
        articleNum: index of the article the token came from.

    Returns:
        None. The counting happens as a side effect of ``incrementWord``.
    """
    # remove all numbers: any digit 0-9 disqualifies the whole token
    if any(digit in thisWord for digit in "0123456789"):
        return
    # make it all lowercase and remove most punctuation
    cleanedWord = thisWord.casefold().translate(punc)
    # manually-identified punctuation outliers become spaces so that
    # something like "well—known" turns into two separate words
    for symbol in unwantedThings:
        cleanedWord = cleanedWord.replace(symbol, ' ')
    # split() drops empty pieces, so an empty result counts nothing and a
    # multi-word result is counted once per real word (never per character
    # and never as the joined string)
    for piece in cleanedWord.split():
        incrementWord(piece, articleNum)

# this is a method to send valid words into wordsList or increment their count
def incrementWord(thisWord: str, articleNum:int):
    """Record one occurrence of ``thisWord`` for article ``articleNum``.

    Bumps that article's entry in ``articleWordCounts`` and the word's
    frequency in ``wordsList``; a word seen for the first time starts at 1.
    """
    # the article total goes first, exactly one per counted word
    articleWordCounts[articleNum] += 1
    # a missing key counts as zero previous sightings
    wordsList[thisWord] = wordsList.get(thisWord, 0) + 1

# Randomize the links within the list, then use the first {articlesToUse:int} articles.
np.random.shuffle(articles)

for i, articleUrl in enumerate(articles[:articlesToUse]):
    # Turn the entire page into a BS4 object; the timeout keeps one stalled
    # download from hanging the whole run.
    thisArticle = bs(requests.get(articleUrl, timeout=30).text, "html.parser")

    # Print the name of the article we're about to check. A page without a
    # <title> tag is identified by its URL instead of crashing.
    titleTag = thisArticle.title
    pageTitle = titleTag.text if titleTag is not None else articleUrl
    print(pageTitle)
    titleArray.append(pageTitle.replace(" | CNN", ""))

    # Narrow the text down to the article itself; text coming from different
    # tags is separated by a newline so words never get glued together.
    articleTag = thisArticle.article
    if articleTag is None:
        # Some pages have no <article> element. Keep the title and the
        # per-article counter aligned by recording this one with 0 words.
        print("No <article> element found, counting 0 words for:", articleUrl)
        textBlocks = []
    else:
        textBlocks = articleTag.get_text(strip=True, separator='\n').splitlines()

    # Break every text block into word tokens and process them.
    for textBlock in textBlocks:
        for token in textBlock.split():
            # Compare against the stop words in normalized form, otherwise
            # "The" or "the," would slip past the lowercase stop-word list.
            if token.casefold().strip(string.punctuation) in stop_words:
                continue
            cleanWord(token, i)

# Sort the word counts in descending order of frequency.
sortedWordsList = sorted(wordsList.items(), key=lambda item: item[1], reverse=True)

# Print the mean, the per-article counts and the median of the word counts.
print("Mean word count:",np.average(articleWordCounts))
# zip pairs each count with its title without indexing either list by position
for wordCount, articleTitle in zip(articleWordCounts, titleArray):
    print(wordCount,"words in article:",articleTitle)
# np.median sorts the values first (and averages the two middle values for an
# even number of articles); taking the middle element of the unsorted list
# would just print an arbitrary article's count.
print("Median word count:",np.median(articleWordCounts))

# Too many words to fit on the word cloud: keep only the wordsForCloud most
# frequent entries (the list is already sorted in descending order).
del sortedWordsList[wordsForCloud:]

if sortedWordsList:
    # Build the cloud from the truncated list. Passing the full wordsList here
    # would ignore the cut above and show up to WordCloud's own default limit.
    wordcloud = WordCloud()
    wordcloud.generate_from_frequencies(dict(sortedWordsList))

    # Display the word cloud.
    plt.figure(figsize=(10,10))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title("Word Cloud")
    plt.show()
else:
    # generate_from_frequencies raises on an empty table, so say why we skip it
    print("No words were collected, skipping the word cloud.")

# Trim the sorted list further, down to the number of bars we want to draw.
del sortedWordsList[wordsForGraph:]

# Show the (word, count) pairs that will appear in the chart.
for pair in sortedWordsList:
    print(pair)

# Separate the labels from the bar heights, then draw the bar chart.
barLabels = [pair[0] for pair in sortedWordsList]
barHeights = [pair[1] for pair in sortedWordsList]
plt.figure(figsize=(20, 3))
plt.bar(barLabels, barHeights)
plt.xlabel("Word")
plt.ylabel("Frequency")
plt.title("Word Frequency Chart")
plt.show()


ModuleNotFoundError: No module named 'bs4'