# Webscraping
All of the web scraping takes place in this code block, using Newspaper3k. The scraper errs on the side of scraping less of the article, to avoid picking up extra text such as ads and previews of other articles. It also filters out social media platforms, articles that contain repeated phrases (which happens when a site blocks web scraping and we keep getting the same error message back), and articles that don't contain enough words.

In [1]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import pandas as pd
import requests
from newspaper import Article
import numpy as np

In [2]:
# Finds phrases that are repeated and returns how often the most frequent one
# occurs. This is used to avoid hangups on scraping, such as a webpage that
# blocks web scraping and returns the same message over and over.
def most_repeated_phrase_count(text):
    """Return the occurrence count of the most frequent phrase in *text*.

    A "phrase" is whatever lies between two periods (``.``), with leading and
    trailing whitespace removed; chunks that are empty after stripping are
    ignored. No other sentence terminator (``!``, ``?``, newlines) is treated
    as a separator.

    Args:
        text: The string to analyse.

    Returns:
        The number of times the most common phrase appears, or 0 when *text*
        contains no non-empty phrase.
    """
    # Imported locally so the function works no matter which notebook cells
    # have been executed before it is called.
    from collections import Counter

    stripped_chunks = (chunk.strip() for chunk in text.split("."))
    phrase_counts = Counter(phrase for phrase in stripped_chunks if phrase)

    if not phrase_counts:
        return 0

    # most_common(1) yields a single (phrase, count) pair; only the count is
    # of interest to callers.
    return phrase_counts.most_common(1)[0][1]

# If the article doesn't contain enough words, or keeps repeating the same
# phrase, we get rid of the article
def filter_scrape_data(text):
    """Return True when *text* should be kept, False when it should be dropped.

    The text is dropped when its approximate word count is below the
    module-level ``wordCountFilter`` threshold, or when its most frequent
    phrase (see ``most_repeated_phrase_count``) occurs 6 or more times.
    When the text is dropped, both measurements are printed.

    Args:
        text: The scraped article text as a string.

    Returns:
        False if the text fails either check, True otherwise.
    """
    # Approximate word count: number of space characters plus one. Newlines
    # and runs of spaces are not normalised, so this is only an estimate.
    word_count = text.count(' ') + 1

    # Computed once and reused for both the decision and the diagnostics.
    repeated_count = most_repeated_phrase_count(text)

    if word_count < wordCountFilter or repeated_count >= 6:
        print("word count " , word_count)
        print("repeated phrases", repeated_count)
        return False
    return True

# Removes articles that are hosted on a social media platform
def filter_social(url):
    """Return False for URLs on social platforms that can't be scraped.

    The decision is made on the URL's hostname, so it is independent of the
    scheme (``http``/``https``), letter case, port, and subdomain
    (``www.``, ``m.``, ...). A host matches when it equals one of the listed
    domains or is a subdomain of one.

    Args:
        url: The article URL as a string.

    Returns:
        False when the URL's host belongs to a listed social platform,
        True otherwise (including when no hostname can be extracted).
    """
    # Imported locally so the function does not rely on other notebook cells.
    from urllib.parse import urlsplit

    social_domains = ("youtube.com", "youtu.be", "facebook.com",
                      "twitter.com", "gettr.com")

    try:
        host = urlsplit(url).hostname or ""
    except ValueError:
        # urlsplit rejects some malformed network locations; such a URL is
        # not recognisably social, so it is left for the scraper to handle.
        return True

    host = host.lower()
    return not any(
        host == domain or host.endswith("." + domain)
        for domain in social_domains
    )

# Scraping takes place in this code block
def scrapeData(url):
    """Download and clean the article at *url*, or return an error marker.

    Steps: reject social-media URLs via ``filter_social``, download and parse
    the page with ``Article``, lowercase the text, reject it via
    ``filter_scrape_data``, then tidy the whitespace.

    Args:
        url: The article URL.

    Returns:
        The cleaned, lowercased article text on success, otherwise one of:
        ``"PARERROR: SocialError"`` (URL rejected by ``filter_social``),
        ``"PARERROR: WebBlockerError"`` (text rejected by
        ``filter_scrape_data``), or ``"PARERROR: ErrorCouldntParse"`` (any
        exception raised while scraping).
    """
    try:
        if not filter_social(url):
            return "PARERROR: SocialError"

        article = Article(url)
        article.download()
        article.parse()

        page_text = article.text.lower()

        if not filter_scrape_data(page_text):
            print(url)
            return "PARERROR: WebBlockerError"

        # NOTE(review): this deletes every pair of spaces outright instead of
        # collapsing it to a single space, so "a  b" becomes "ab". Kept as-is
        # to preserve existing output; confirm whether " " was intended.
        page_text = page_text.strip().replace("  ", "")

        # Drop lines that are empty once their CR/LF characters are removed,
        # keeping the line endings of the remaining lines.
        kept_lines = [
            line for line in page_text.splitlines(True) if line.strip("\r\n")
        ]
        return "".join(kept_lines)
    except Exception:
        # Deliberately broad: any scraping failure becomes an error marker.
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scraping run can still be stopped.
        print("Error in scrapeData")
        print(url)
        return "PARERROR: ErrorCouldntParse"