Overall goal:

Train a model to correctly classify an article as Left Leaning, Right Leaning, or Centered based on its headline, sample text, and publisher.

Collect data from articles found on Ground News, including left, right, and center labels to be used for training. 

In [1]:
# Import libs
import csv
import datetime
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# define url
url = "http://ground.news"

# Seconds to wait for the server before giving up; without a timeout,
# requests.get() can block indefinitely on an unresponsive host.
request_timeout = 30

# Get user agent from 'https://httpbin.org/get'
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5",
}

# collect html as soup
page = requests.get(url, headers=headers, timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce empty result sets for every selector below.
page.raise_for_status()
html = page.content
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment.
soup = BeautifulSoup(html, "html.parser")

# extract article information using css selectors for precision
stories = soup.select(
    "div.w-full.flex.justify-between.gap-1 > h4.text-22.leading-10.line-clamp-3"
)
topics = soup.select(
    "div.flex.flex-col.gap-8px.justify-center > span.text-12.leading-6"
)
locations = soup.select(
    "div.flex.flex-col.gap-8px.justify-center > span.text-12.leading-6 > span"
)

# Each entry wraps the left/center/right coverage bars of one story; the
# individual percentages are read out of these entries by extract_coverages().
coverage_splits = soup.select("div.flex.items-center.gap-1.false > div > div")

# Summary text per story, e.g. "79% Center coverage: 14 sources"
maj_cov_n_sources = soup.select(
    "div.flex.items-center.gap-1.false > div.text-12.leading-6 > span"
)

# Anchors whose href is the site-relative path of each story page
link_results = soup.select("a.absolute.left-0.right-0.top-0.bottom-0")

# Scrape date, stamped onto every article row later on
today = datetime.date.today()


In [3]:
# Convert a bs4.element.ResultSet (any iterable of tags) into a list of strings
def extract_text(result_set):
    """Return the ``get_text()`` value of every tag in *result_set*, in order."""
    return [element.get_text() for element in result_set]


In [4]:
# Turn each scraped result set into a plain list of strings.
# coverage_sources_str keeps the full summary text (for example
# "79% Center coverage: 14 sources"); it is split into separate columns
# in the data-cleaning cell.
stories_str, topics_str, locations_str, coverage_sources_str = (
    extract_text(result_set)
    for result_set in (stories, topics, locations, maj_cov_n_sources)
)


In [5]:
# function to extract each href link from link_results
def extract_link(link_results, base_url=None):
    """Return the absolute URL for every anchor in *link_results*.

    Args:
        link_results: Iterable of anchor tags (anything supporting
            ``item["href"]``).
        base_url: Site root that relative hrefs are resolved against.
            Defaults to the module-level ``url``.

    Returns:
        A list of absolute URLs, in the same order as *link_results*.

    Raises:
        KeyError: If an anchor has no ``href`` attribute.
    """
    if base_url is None:
        base_url = url

    # urljoin resolves "/path" and "path" against the site root and leaves
    # already-absolute hrefs untouched, where plain string concatenation
    # would produce a malformed URL.
    return [urljoin(base_url, link["href"]) for link in link_results]


In [6]:
# extract links for each article
story_links = extract_link(link_results)

# Display the links. NOTE: the same story URL can appear more than once
# (see the repeated entries in the output below).
story_links


['http://ground.news/article/salman-rushdie-articulate-after-stabbing-attack-responding-to-investigators-report_80a45e',
 'http://ground.news/article/live-news-updates-amazon-accuses-ftc-of-harassing-executives-bezos-and-jassy',
 'http://ground.news/article/us-forgives-39-bln-in-federal-loans-for-itt-tech-students_13b0fd',
 'http://ground.news/article/first-lady-jill-biden-tests-positive-for-covid-19_e1caea',
 'http://ground.news/article/random-text-message-to-flagler-county-commissioner-ends-with-suspected-drug-dealer-arrest_06e4a9',
 'http://ground.news/article/cnbc-chairman-mark-hoffman-to-step-down-in-september',
 'http://ground.news/article/nba-no-games-on-election-day_dc434a',
 'http://ground.news/article/4moms-recalls-some-mamaroo-swings-rockaroo-rockers-over-strangulation-hazard',
 'http://ground.news/article/nba-no-games-on-election-day_dc434a',
 'http://ground.news/article/4moms-recalls-some-mamaroo-swings-rockaroo-rockers-over-strangulation-hazard',
 'http://ground.news/arti

In [7]:
# Iterate through coverage_splits, extracting the left, center, and right
# coverage percentages for each story; a side whose bar is absent is
# recorded as "0%".

# Exact class-attribute strings that identify each side's coverage bar
_LEFT_BAR_CLASS = "text-light-primary bg-secondary-left text-light-primary leading-none flex items-center"
_CENTER_BAR_CLASS = "text-dark-primary bg-secondary-neutral leading-none flex items-center"
_RIGHT_BAR_CLASS = "text-light-primary bg-secondary-right text-light-primary leading-none flex items-center"

# First percentage found in a bar's inline style, e.g. "14%" or "100%"
_PERCENT_RE = re.compile(r"\d+(?:\.\d+)?%")


def _bar_percentage(split, bar_class):
    """Return the percentage shown by the *bar_class* bar inside *split*.

    Returns "0%" when the bar is absent. The value is parsed out of the bar's
    inline style rather than taken from a fixed character slice, so one- and
    three-digit percentages ("7%", "100%") come out intact.
    """
    bar = split.find(class_=bar_class)
    if bar is None:
        return "0%"

    style = bar["style"]
    match = _PERCENT_RE.search(style)
    # Fall back to the historical fixed slice if the style holds no percentage.
    return match.group(0) if match else style[6:9]


def extract_coverages(splits=None):
    """Extract left, center, and right coverage percentages per story.

    Args:
        splits: Iterable of coverage wrappers, each exposing
            ``find(class_=...)``. Defaults to the module-level
            ``coverage_splits``.

    Returns:
        A ``(left, center, right)`` tuple of parallel lists of percentage
        strings, one entry per element of *splits*.
    """
    if splits is None:
        splits = coverage_splits

    left = []
    center = []
    right = []
    for split in splits:
        left.append(_bar_percentage(split, _LEFT_BAR_CLASS))
        center.append(_bar_percentage(split, _CENTER_BAR_CLASS))
        right.append(_bar_percentage(split, _RIGHT_BAR_CLASS))

    return left, center, right


In [8]:
# call function to extract coverage percentages from each political perspective
# (three parallel lists, one entry per element of coverage_splits)
left, center, right = extract_coverages()


In [9]:
# Assemble the story-level frame in one step; the key order of the dict
# fixes the column order.
df = pd.DataFrame(
    {
        "stories": stories_str,
        "topics": topics_str,
        "locations": locations_str,
        "left": left,
        "center": center,
        "right": right,
        "coverage_sources": coverage_sources_str,
        "story_links": story_links,
    }
)

# No date column at this level: scrape_story() stamps each article row
# with `today`.


In [10]:
# Display the raw story-level frame (one row per front-page story)
df


Unnamed: 0,stories,topics,locations,left,center,right,coverage_sources,story_links
0,Salman Rushdie is awake and 'articulate' after...,Books · New York,· New York,14%,79%,7%,79% Center coverage: 14 sources,http://ground.news/article/salman-rushdie-arti...
1,Amazon Accuses Regulators of Harassing Jeff Bezos,Jeff Bezos · Seattle,· Seattle,10%,60%,30%,60% Center coverage: 10 sources,http://ground.news/article/live-news-updates-a...
2,Education Dept. discharges $3.9 billion of stu...,Education · Washington,· Washington,16%,79%,5%,79% Center coverage: 44 sources,http://ground.news/article/us-forgives-39-bln-...
3,First lady Jill Biden tests positive for COVID-19,White House · Washington,· Washington,28%,51%,21%,51% Center coverage: 158 sources,http://ground.news/article/first-lady-jill-bid...
4,Random text message to Flagler County Commissi...,Cocaine · Flagler County,· Flagler County,0%,67%,33%,67% Center coverage: 6 sources,http://ground.news/article/random-text-message...
5,CNBC Chairman Mark Hoffman to step down,Business · New York,· New York,39%,44%,17%,44% Center coverage: 18 sources,http://ground.news/article/cnbc-chairman-mark-...
6,NBA: No games on Election Day,US & Canada · Washington,· Washington,34%,60%,6%,60% Center coverage: 35 sources,http://ground.news/article/nba-no-games-on-ele...
7,2 million infant swings recalled over possible...,Business · New York,· New York,25%,65%,10%,65% Center coverage: 72 sources,http://ground.news/article/4moms-recalls-some-...
8,NBA: No games on Election Day,US & Canada · Washington,· Washington,34%,60%,6%,60% Center coverage: 35 sources,http://ground.news/article/nba-no-games-on-ele...
9,2 million infant swings recalled over possible...,Business · New York,· New York,25%,65%,10%,65% Center coverage: 72 sources,http://ground.news/article/4moms-recalls-some-...


In [11]:
# COLLECT ARTICLE TEXT FROM EACH LINK

# Define a new BeautifulSoup-based function to run on each article's link


In [12]:
# START CLEANING DATA
# coverage_sources looks like "79% Center coverage: 14 sources": the second
# word before the colon is the majority lean, the first word after it is the
# source count. locations arrive as "· New York" and topics as
# "Books · New York".
df = df.assign(
    majority_coverage=lambda frame: (
        frame["coverage_sources"]
        .str.split(":")
        .str[0]
        .str.strip()
        .str.split(" ")
        .str[1]
    ),
    n_sources=lambda frame: (
        frame["coverage_sources"]
        .str.split(":")
        .str[1]
        .str.strip()
        .str.split(" ")
        .str[0]
    ),
    locations=lambda frame: frame["locations"].str.replace("· ", ""),
    topics=lambda frame: frame["topics"].str.split("·").str[0].str.strip(),
).drop(columns="coverage_sources")


In [13]:
# Display the cleaned story-level frame
df


Unnamed: 0,stories,topics,locations,left,center,right,story_links,majority_coverage,n_sources
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14
1,Amazon Accuses Regulators of Harassing Jeff Bezos,Jeff Bezos,Seattle,10%,60%,30%,http://ground.news/article/live-news-updates-a...,Center,10
2,Education Dept. discharges $3.9 billion of stu...,Education,Washington,16%,79%,5%,http://ground.news/article/us-forgives-39-bln-...,Center,44
3,First lady Jill Biden tests positive for COVID-19,White House,Washington,28%,51%,21%,http://ground.news/article/first-lady-jill-bid...,Center,158
4,Random text message to Flagler County Commissi...,Cocaine,Flagler County,0%,67%,33%,http://ground.news/article/random-text-message...,Center,6
5,CNBC Chairman Mark Hoffman to step down,Business,New York,39%,44%,17%,http://ground.news/article/cnbc-chairman-mark-...,Center,18
6,NBA: No games on Election Day,US & Canada,Washington,34%,60%,6%,http://ground.news/article/nba-no-games-on-ele...,Center,35
7,2 million infant swings recalled over possible...,Business,New York,25%,65%,10%,http://ground.news/article/4moms-recalls-some-...,Center,72
8,NBA: No games on Election Day,US & Canada,Washington,34%,60%,6%,http://ground.news/article/nba-no-games-on-ele...,Center,35
9,2 million infant swings recalled over possible...,Business,New York,25%,65%,10%,http://ground.news/article/4moms-recalls-some-...,Center,72


Each story has multiple articles from which I can scrape the article title, source, sample text, and political lean.

In [16]:
import re

# Lean labels paired with the pattern that finds the matching badge inside an
# article's source/lean wrapper. Checked in this order; the first hit wins.
_LEAN_PATTERNS = (
    ("left", re.compile("secondary-left")),
    ("center", re.compile("secondary-neutral")),
    ("right", re.compile("secondary-right")),
)


def _classify_lean(wrapper):
    """Return "left", "center" or "right" for *wrapper*, or the string 'None'."""
    for label, pattern in _LEAN_PATTERNS:
        if wrapper.find(class_=pattern):
            return label
    return "None"


def _item_or(items, index, default=None):
    """Return ``items[index]``, or *default* when *index* is past the end."""
    return items[index] if index < len(items) else default


# create function to iterate through all of the articles found for each story
def scrape_story(df, *, timeout=30):
    """Scrape every article listed on each story page in *df*.

    Args:
        df: Story-level frame; each row must have a ``story_links`` URL.
        timeout: Seconds to wait for each story page before
            ``requests`` raises a timeout error.

    Returns:
        A new DataFrame with one row per article. Each row repeats the
        story's columns (and keeps the story's index label) and adds
        ``title``, ``source``, ``sample``, ``lean`` and ``date``. An empty
        DataFrame is returned when no articles were found.
    """
    article_rows = []

    # iterate over each story
    for _, story in df.iterrows():
        # fetch and parse the story page; the parser is named explicitly so
        # results do not depend on which optional parsers are installed
        page = requests.get(story["story_links"], headers=headers, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        # scrape the desired data and reduce it to plain strings
        article_titles_str = extract_text(soup.select("h4.text-22.leading-11"))
        article_sources_str = extract_text(
            soup.select(
                "div.flex.gap-8px.items-center.text-14.flex-wrap > a > div > span"
            )
        )
        text_samples_str = extract_text(
            soup.select("p.font-normal.text-18.leading-9.break-words")
        )

        # one lean label per source/lean wrapper
        leans = [
            _classify_lean(wrapper)
            for wrapper in soup.select(
                "div.flex.gap-8px.items-center.text-14.flex-wrap"
            )
        ]

        # Titles come back duplicated, so only every other one is used.
        # Sources and leans are indexed in step with the titles, text samples
        # at half that index. A missing entry becomes a placeholder instead
        # of raising IndexError and discarding everything scraped so far.
        for position in range(0, len(article_titles_str), 2):
            # one-row frame carrying the story's data, labelled with its index
            article = pd.DataFrame(story).transpose()

            article["title"] = article_titles_str[position]
            article["source"] = _item_or(article_sources_str, position)
            article["sample"] = _item_or(text_samples_str, position // 2)
            article["lean"] = _item_or(leans, position, "None")
            article["date"] = today

            article_rows.append(article)

    # Concatenate once at the end; growing a frame inside the loop copies
    # every previously collected row on each iteration.
    if not article_rows:
        return pd.DataFrame()
    return pd.concat(article_rows)

In [17]:
# Scrape every story in df; the result has one row per article, with the
# story's columns repeated alongside title, source, sample, lean and date
article_df = scrape_story(df)

# Display the article-level frame
article_df

Unnamed: 0,stories,topics,locations,left,center,right,story_links,majority_coverage,n_sources,title,source,sample,lean,date
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14,Salman Rushdie is awake and 'articulate' after...,CTV News,Award-winning author Salman Rushdie is awake a...,center,2022-08-16
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14,Salman Rushdie is awake and 'articulate' after...,CNN,Award-winning author Salman Rushdie is awake a...,left,2022-08-16
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14,Salman Rushdie is awake and 'articulate' after...,KVIA,Bianna Golodryga speaks to president of the Co...,center,2022-08-16
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14,Salman Rushdie is awake and 'articulate' after...,KRDO,"By Aya Elamroussi and Mark Morales, CNNAward-w...",center,2022-08-16
0,Salman Rushdie is awake and 'articulate' after...,Books,New York,14%,79%,7%,http://ground.news/article/salman-rushdie-arti...,Center,14,Salman Rushdie is awake and 'articulate' after...,KIFI,"By Aya Elamroussi and Mark Morales, CNN Award-...",center,2022-08-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,Six people wounded in shooting outside Tenness...,Gun Violence,Memphis,37%,42%,21%,http://ground.news/article/six-people-wounded-...,Center,19,Memphis hospital locked down after shooting ne...,The Independent,Shooting took place near emergency room of the...,left,2022-08-16
15,Six people wounded in shooting outside Tenness...,Gun Violence,Memphis,37%,42%,21%,http://ground.news/article/six-people-wounded-...,Center,19,"Memphis hospital locks down, treats shooting v...",USA Today,A hospital was on lockdown early Tuesday while...,left,2022-08-16
15,Six people wounded in shooting outside Tenness...,Gun Violence,Memphis,37%,42%,21%,http://ground.news/article/six-people-wounded-...,Center,19,6 shot outside Memphis hospital,abc News,Six people were shot outside a Memphis hospita...,left,2022-08-16
15,Six people wounded in shooting outside Tenness...,Gun Violence,Memphis,37%,42%,21%,http://ground.news/article/six-people-wounded-...,Center,19,"6 people shot near a hospital in Memphis, Tenn...",CNN,Six people were shot early Tuesday near Method...,left,2022-08-16


All the code is here. Next steps: formalize it into functions, and add the ability to fetch the data automatically and write it to a CSV file.