## Step-by-step web scraping of "Media Bias" data from https://www.allsides.com
### Tools used: Python, Requests, Beautiful Soup

In [1]:
import requests
from bs4 import BeautifulSoup

Request the first page

In [2]:
# Download the first page of the AllSides media-bias ratings table.
url = 'https://www.allsides.com/media-bias/media-bias-ratings'
# requests applies no timeout by default, so set one explicitly to keep the
# cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page further down.
r.raise_for_status()
# Peek at the first 100 bytes to confirm that HTML came back.
print(r.content[:100])

b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte'


Create BeautifulSoup object

In [4]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=r.content, features='html.parser')
# Bare last expression: the notebook displays the parsed document.
soup

<!DOCTYPE html>

<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html dir="ltr" lang="en" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product# content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ rdfs: http://www.w3.org/2000/01/rdf-schema# sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema#"><!--<![endif]-->
<head profile="http://www.w3.org/1999/xhtml/vocab">
<meta charset="utf-8"/>
<me

Finding elements and data from Soup object

In [12]:
# Every table row is a <tr> nested inside a <tbody>, so one CSS selector
# collects all of them.
rows = soup.select(selector='tbody tr')
rows

[<tr class="odd views-row-first">
 <td class="views-field views-field-title source-title">
 <a href="/news-source/abc-news-media-bias">ABC News</a> </td>
 <td class="views-field views-field-field-bias-image">
 <a href="/media-bias/left-center"><img alt="Political News Media Bias Rating: Lean Left" height="24" src="https://www.allsides.com/sites/default/files/styles/bias144x24/public/bias-leaning-left.png?itok=mtG3ToEN" title="Political News Media Bias Rating: Lean Left" typeof="foaf:Image" width="144"/></a> </td>
 <td class="views-field views-field-nothing-1 what-do-you-think">
 <div class="agree-disagree-widget"><div class="rate-widget-4 rate-widget clear-block rate-average rate-widget-yesno rate-f13adc4a6caee57d894f9d7ef192d7f3 rate-node-76-4-1" id="rate-node-76-4-1">
 <div class="item-list"><ul><li class="first"><a class="rate-button rate-btn" href="/media-bias/media-bias-ratings?rate=zmyDLixxBj_ytZsb_p9D32b0gPWtDEaeOSu_YIdkTIc" id="rate-button-3" rel="nofollow" title="agree">agree<

News source name

In [31]:
# Explore the markup using the first table row only.
row = rows[0]
# The source name is the text of the title cell, minus surrounding whitespace.
title_cell = row.select_one('.source-title')
name = title_cell.text.strip()
print(name)

ABC News


News source page link

In [32]:
# The link inside the title cell is site-relative (e.g. "/news-source/..."),
# so prefix it with the site root to get an absolute URL.
relative_href = row.select_one('.source-title a')['href']
allsides_page = 'https://www.allsides.com' + relative_href
print(allsides_page)

https://www.allsides.com/news-source/abc-news-media-bias


Bias rating

In [33]:
# The bias image links to a path such as "/media-bias/left-center";
# the rating is the last path segment.
bias_href = row.select_one('.views-field-field-bias-image a')['href']
bias = bias_href.rsplit('/', 1)[-1]
print(bias)

left-center


Community Feedback Data

In [34]:
# Community feedback: how many visitors agree / disagree with the bias rating.
agree = int(row.select_one('.agree').text)
disagree = int(row.select_one('.disagree').text)

# A source with no "disagree" votes would make the division raise
# ZeroDivisionError; treat the ratio as undefined (None) in that case.
agree_ratio = agree / disagree if disagree else None

ratio_text = f"{agree_ratio:.2f}" if agree_ratio is not None else "n/a"
print(f"Agree: {agree}, Disagree: {disagree}, Ratio {ratio_text}")

Agree: 10374, Disagree: 7715, Ratio 1.34


In [4]:
def get_agreeance_text(ratio):
    """Translate an agree/disagree vote ratio into a short verbal label.

    A ratio of exactly 1 is "neutral". Ratios above 1 map to increasingly
    strong "agrees" labels, ratios below 1 to increasingly strong
    "disagrees" labels. Returns None when the ratio matches no band
    (which only happens for NaN).
    """
    if ratio == 1:
        return "neutral"

    # Exclusive lower bounds, checked from the highest band downwards; the
    # first bound the ratio exceeds decides the label.
    lower_bounds = (
        (3, "absolutely agrees"),
        (2, "strongly agrees"),
        (1.5, "agrees"),
        (1, "somewhat agrees"),
        (0.67, "somewhat disagrees"),
        (0.5, "disagrees"),
        (0.33, "strongly disagrees"),
    )
    for bound, label in lower_bounds:
        if ratio > bound:
            return label

    if ratio <= 0.33:
        return "absolutely disagrees"
    return None


print(get_agreeance_text(2.5))

strongly agrees


So far we have explored data extraction for a single row; now let's loop over every row and pull all the data.

In [36]:
# Build one record (dict) per table row of the page parsed above.
data = []

for row in rows:
    d = dict()

    d['name'] = row.select_one('.source-title').text.strip()
    d['allsides_page'] = 'https://www.allsides.com' + row.select_one('.source-title a')['href']
    d['bias'] = row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1]
    d['agree'] = int(row.select_one('.agree').text)
    d['disagree'] = int(row.select_one('.disagree').text)

    # A source with zero "disagree" votes must not abort the whole loop with
    # ZeroDivisionError: record the ratio and its label as undefined instead.
    if d['disagree']:
        d['agree_ratio'] = d['agree'] / d['disagree']
        d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])
    else:
        d['agree_ratio'] = None
        d['agreeance_text'] = None

    data.append(d)

# Spot-check a single record.
print(data[3])

{'name': 'American Spectator', 'allsides_page': 'https://www.allsides.com/news-source/american-spectator', 'bias': 'right', 'agree': 5324, 'disagree': 2255, 'agree_ratio': 2.3609756097560974, 'agreeance_text': 'strongly agrees'}


## Putting it all together
Requesting and parsing multiple pages

In [2]:
# The ratings table spans three pages: the landing page plus ?page=1 and ?page=2.
first_page = 'https://www.allsides.com/media-bias/media-bias-ratings'
pages = [first_page] + [f'{first_page}?page={n}' for n in (1, 2)]

Our loop will:
* request a page
* parse the page
* wait ten seconds (according to AllSides' robots.txt we need to make sure we wait ten seconds before each request)
* repeat for next page

In [5]:
from time import sleep

# Scrape every ratings page into one flat list of records (one dict per source).
data = []

for page in pages:
    # requests applies no timeout by default; set one so a stalled connection
    # cannot hang the loop, and fail loudly on an HTTP error instead of
    # parsing an error page.
    r = requests.get(page, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    rows = soup.select('tbody tr')

    for row in rows:
        d = dict()

        d['name'] = row.select_one('.source-title').text.strip()
        d['allsides_page'] = 'https://www.allsides.com' + row.select_one('.source-title a')['href']
        d['bias'] = row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1]
        d['agree'] = int(row.select_one('.agree').text)
        d['disagree'] = int(row.select_one('.disagree').text)
        d['total_votes'] = d['agree'] + d['disagree']

        # A source with zero "disagree" votes must not abort the scrape with
        # ZeroDivisionError: record the ratio and its label as undefined.
        if d['disagree']:
            d['agree_ratio'] = d['agree'] / d['disagree']
            d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])
        else:
            d['agree_ratio'] = None
            d['agreeance_text'] = None

        data.append(d)

    # Pause ten seconds between requests to be polite to the site.
    sleep(10)

print("Total records fetched =", len(data))

Total records fetched = 73


Fetch the website URL of each news source

In [6]:
# tqdm.auto picks the notebook widget or the console bar automatically and
# replaces the deprecated tqdm_notebook alias.
from tqdm.auto import tqdm

# Visit each source's AllSides page and, when the page has an element with
# class "www" carrying an href, store that href under the 'website' key.
for d in tqdm(data):
    try:
        # Explicit timeout: requests would otherwise wait indefinitely.
        r = requests.get(d['allsides_page'], timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and move on to the next source
        # rather than losing the progress made so far.
        tqdm.write(f"Skipping {d['allsides_page']}: {exc}")
    else:
        soup = BeautifulSoup(r.content, 'html.parser')
        website_link = soup.select_one('.www')
        # Not every page has the link; leave 'website' unset in that case.
        if website_link is not None and website_link.has_attr('href'):
            d['website'] = website_link['href']

    # Pause ten seconds between requests to be polite to the site.
    sleep(10)

print("Total records fetched =", len(data))

HBox(children=(IntProgress(value=0, max=73), HTML(value='')))


Total records fetched = 73


Save the data

In [7]:
import json

# Persist the scraped records as a JSON array on disk.
with open('allsides.json', 'w') as out_file:
    json.dump(data, out_file)


# Read the file back to check that all records were saved.
with open('allsides.json', 'r') as in_file:
    data = json.load(in_file)
print("Total records fetched =", len(data))

Total records fetched = 73
