## Lab 3: Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.allsides.com/media-bias/media-bias-ratings'

# Always bound the wait: without a timeout, requests can block the kernel
# indefinitely when the server never answers.
r = requests.get(url, timeout=30)

# Surface blocked/failed responses instead of silently parsing an error page.
# This only warns (rather than raising) so the notebook can still continue
# with the saved-HTML alternative loaded in the following cell.
if not r.ok:
    print(f"Warning: GET {url} returned HTTP {r.status_code}; "
          "the parsed page may not contain the ratings table.")

soup = BeautifulSoup(r.content, 'html.parser')

In [4]:
# Other option: load the page from a saved HTML file instead of the network.
# The context manager closes the file handle as soon as the contents are read
# (the handle was previously left open for the lifetime of the kernel).
with open("lab-3-data.html", "r") as htmlfile:
    # Reading the file
    index = htmlfile.read()

# Creating a BeautifulSoup object and specifying the parser
soup = BeautifulSoup(index, 'html.parser')

In [5]:
# Collect every row found inside a table body of the parsed page.
rows = soup.select('tbody tr')

# Use the first row as a sample while working out the selectors.
row = rows[0]

# The source name is the whitespace-trimmed text of the `.source-title` element.
title_cell = row.select_one('.source-title')
name = title_cell.get_text().strip()

print(name)

ABC News (Online)


In [6]:
# The title link carries a site-relative href; prefix it with the host to
# obtain an absolute URL.
source_link = row.select_one('.source-title a')
allsides_page = f"https://allsides.com{source_link['href']}"

print(allsides_page)

https://allsides.com/news-source/abc-news-media-bias


In [7]:
# The bias label is the last path segment of the bias image link's href.
bias_href = row.select_one('.views-field-field-bias-image a')['href']
bias = bias_href.rsplit('/', 1)[-1]

print(bias)

left-center


In [8]:
# Community vote counts are stored as text inside the `.agree` / `.disagree`
# elements of the row; convert them to integers before dividing.
agree_cell = row.select_one('.agree')
agree = int(agree_cell.text)

disagree_cell = row.select_one('.disagree')
disagree = int(disagree_cell.text)

# Ratio of agreeing to disagreeing votes for this source.
agree_ratio = agree / disagree

print(f"Agree: {agree}, Disagree: {disagree}, Ratio {agree_ratio:.2f}")

Agree: 46837, Disagree: 23302, Ratio 2.01


In [9]:
# Probe the row for an element with class `community-feedback-rating-page`.
# `select_one` yields None when nothing matches the selector.
print(row.select_one('.community-feedback-rating-page'))

None


In [10]:
def get_agreeance_text(ratio):
    """Translate an agree/disagree vote ratio into a descriptive label.

    Labels by range of ``ratio``:

    * ``> 3``          -> "absolutely agrees"
    * ``(2, 3]``       -> "strongly agrees"
    * ``(1.5, 2]``     -> "agrees"
    * ``(1, 1.5]``     -> "somewhat agrees"
    * ``== 1``         -> "neutral"
    * ``(0.67, 1)``    -> "somewhat disagrees"
    * ``(0.5, 0.67]``  -> "disagrees"
    * ``(0.33, 0.5]``  -> "strongly disagrees"
    * ``<= 0.33``      -> "absolutely disagrees"

    Returns None when ``ratio`` compares neither equal to, greater than,
    nor less than 1 (e.g. NaN).
    """
    if ratio == 1:
        return "neutral"

    if ratio > 1:
        # Walk the exclusive lower bounds from highest to lowest.
        agree_bands = (
            (3, "absolutely agrees"),
            (2, "strongly agrees"),
            (1.5, "agrees"),
        )
        for lower_bound, label in agree_bands:
            if ratio > lower_bound:
                return label
        return "somewhat agrees"

    if ratio < 1:
        # Same scheme for the disagreeing side of 1.
        disagree_bands = (
            (0.67, "somewhat disagrees"),
            (0.5, "disagrees"),
            (0.33, "strongly disagrees"),
        )
        for lower_bound, label in disagree_bands:
            if ratio > lower_bound:
                return label
        return "absolutely disagrees"

    # Unordered values such as NaN fall through every comparison.
    return None

print(get_agreeance_text(1))

neutral


In [11]:
# Parse every table row into a dict and collect the dicts in `data`.
data = []

for row in rows:
    agree_votes = int(row.select_one('.agree').text)
    disagree_votes = int(row.select_one('.disagree').text)

    # Guard the division: a source with zero "disagree" votes would otherwise
    # raise ZeroDivisionError and abort the whole loop.
    #   agree > 0, disagree == 0 -> inf (labelled "absolutely agrees")
    #   agree == 0, disagree == 0 -> NaN (get_agreeance_text returns None)
    if disagree_votes:
        vote_ratio = agree_votes / disagree_votes
    elif agree_votes:
        vote_ratio = float('inf')
    else:
        vote_ratio = float('nan')

    d = {
        'name': row.select_one('.source-title').text.strip(),
        'allsides_page': 'https://www.allsides.com' + row.select_one('.source-title a')['href'],
        # Last path segment of the bias image link, e.g. "left-center".
        'bias': row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1],
        'agree': agree_votes,
        'disagree': disagree_votes,
        'agree_ratio': vote_ratio,
        'agreeance_text': get_agreeance_text(vote_ratio),
    }

    data.append(d)

In [12]:
# Spot-check the first parsed record to confirm every field was extracted.
print(data[0])

{'name': 'ABC News (Online)', 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias', 'bias': 'left-center', 'agree': 46837, 'disagree': 23302, 'agree_ratio': 2.009999141704575, 'agreeance_text': 'strongly agrees'}


This page requires waiting 10 seconds between requests, hence the `sleep(10)` call.

In [None]:
# TODO add html for pages and parse through alternative way
pages = [
    'https://www.allsides.com/media-bias/media-bias-ratings',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=1',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=2'
]

from time import sleep

# Seconds to pause between consecutive requests to the site.
REQUEST_DELAY_SECONDS = 10

# Upper bound (seconds) on how long a single request may block.
REQUEST_TIMEOUT_SECONDS = 30

data = []

for page_number, page in enumerate(pages):
    # Pause *between* requests only: nothing follows the last page, so a
    # trailing sleep would just waste time.
    if page_number > 0:
        sleep(REQUEST_DELAY_SECONDS)

    r = requests.get(page, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error; otherwise a blocked or failed response is
    # parsed as if it were the ratings page and `data` silently stays empty.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    rows = soup.select('tbody tr')

    for row in rows:
        agree_votes = int(row.select_one('.agree').text)
        disagree_votes = int(row.select_one('.disagree').text)

        # Guard the division: a source with zero "disagree" votes would
        # otherwise raise ZeroDivisionError and discard the whole scrape.
        #   agree > 0, disagree == 0 -> inf (labelled "absolutely agrees")
        #   agree == 0, disagree == 0 -> NaN (get_agreeance_text returns None)
        if disagree_votes:
            vote_ratio = agree_votes / disagree_votes
        elif agree_votes:
            vote_ratio = float('inf')
        else:
            vote_ratio = float('nan')

        d = {
            'name': row.select_one('.source-title').text.strip(),
            'allsides_page': 'https://www.allsides.com' + row.select_one('.source-title a')['href'],
            # Last path segment of the bias image link.
            'bias': row.select_one('.views-field-field-bias-image a')['href'].split('/')[-1],
            'agree': agree_votes,
            'disagree': disagree_votes,
            'agree_ratio': vote_ratio,
            'agreeance_text': get_agreeance_text(vote_ratio),
        }

        data.append(d)

In [None]:
import pandas as pd

# Build the frame from the list of record dicts and key it by source name in
# one chained expression.
df = pd.DataFrame(data).set_index('name')

df.head()

In [17]:
# Display the full list of scraped records (one dict per table row).
data

[]

### Exercises

In [None]:
import matplotlib.pyplot as plt

# Total community feedback per source.
df['total_votes'] = df['agree'] + df['disagree']

df3 = df.copy()

fig = plt.figure(figsize=(15,15))

biases = df3['bias'].unique()

# Subplot grid: two columns and at least three rows (the original layout),
# growing by extra rows when there are more than six bias categories so that
# add_subplot never receives an out-of-range index.
n_cols = 2
n_rows = max(3, -(-len(biases) // n_cols))  # ceil(len(biases) / n_cols)

for i, bias in enumerate(biases):
    # First ten sources with this bias, ordered alphabetically by name.
    # sort_index() returns a new frame; sorting the slice in place can
    # trigger pandas' SettingWithCopyWarning.
    temp_df = df3[df3['bias'] == bias].iloc[:10].sort_index()

    max_votes = temp_df['total_votes'].max()

    ax = fig.add_subplot(n_rows, n_cols, i + 1)

    # Stacked bars: agree votes at the bottom, disagree votes on top.
    ax.bar(temp_df.index, temp_df['agree'], color='#5DAF83')
    ax.bar(temp_df.index, temp_df['disagree'], bottom=temp_df['agree'], color='#AF3B3B')

    # Annotate each bar with its agree ratio slightly above the bar top.
    # NOTE(review): relies on get_xticks() returning one position per bar for
    # a categorical x axis - confirm if the index can contain duplicates.
    for x, y, ratio in zip(ax.get_xticks(), temp_df['total_votes'], temp_df['agree_ratio']):
        ax.text(x, y + (0.02 * max_votes), f"{ratio:.2f}", ha='center')

    ax.set_ylabel('Total feedback')
    ax.set_title(bias.title())

    # Leave headroom above the tallest bar for the ratio labels.
    ax.set_ylim(0, max_votes + (0.12 * max_votes))

    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')