In [78]:
from queue import Queue
from urllib.parse import urldefrag, urljoin

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

In [79]:
class Crawler:
    def __init__(self, seed_url: str):
        self.q = Queue()
        self.q.put(seed_url)
        self.visited_urls = set()
        self.error_urls = set()
        self.visited_pages = {}
        self.seed_url = seed_url

    def crawl(self, num_pages: int|None=50):
        num_crawled = 0
        while not self.q.empty() and num_crawled < num_pages:
            url = self.q.get()
            try:
                resp = get(url).content
                self.visited_urls.add(url)
                page = BeautifulSoup(resp, "html.parser")
                self.visited_pages[url] = page
                print(f"visited: {url}")
                
                # adding links to the queue
                links = {a['href'] for a in page.find_all('a')}
                for link in links:
                    if not link.startswith('http') and not link.startswith('#'):
                        link = f'{url}{link}'
                        if link not in self.visited_urls:
                            self.q.put(link)
                num_crawled += 1
            except:
                self.error_urls.add(url)
                continue
    
    def reset_seed(self, url: str|None):
        self.q = Queue()
        if url is not None:
            self.seed_url = url
            self.q.put(url)

In [80]:
# Crawl at most ten pages, starting from the New York Magazine home page.
c = Crawler(seed_url='https://nymag.com')
c.crawl(num_pages=10)

visited: https://nymag.com
visited: https://nymag.com//nymag.com/intelligencer/2023/02/tim-scott-2024-iowa-pitch-gop-dominance.html
visited: https://nymag.com//www.vulture.com/2023/02/selena-gomez-social-media-break-tiktok-drama-hailey-bieber-kylie-jenner.html
visited: https://nymag.com//thecut.com/culture
visited: https://nymag.com//www.vulture.com/article/the-real-housewives-of-miami-season-5-episode-15-recap-lines-in-the-sand.html
visited: https://nymag.com//www.curbed.com/2023/02/elon-musk-tesla-master-plan-investor-day.html
visited: https://nymag.com//www.grubstreet.com/article/how-much-to-tip-new-etiquette-rules.html
visited: https://nymag.com//www.thecut.com/2023/02/harvey-weinstein-l-a-trial-what-to-know.html
visited: https://nymag.com//www.thecut.com/2023/02/kylie-jenner-postpartum-depression.html
visited: https://nymag.com//www.vulture.com/article/jerry-saltz-moma-refik-anadol-unsupervised.html


In [81]:
# Re-seed the same crawler with Wikipedia's main page and crawl at most ten pages.
c.reset_seed(url='https://en.wikipedia.org/wiki/Main_Page')
c.crawl(num_pages=10)

visited: https://en.wikipedia.org/wiki/Main_Page
visited: https://en.wikipedia.org/wiki/Main_Page/wiki/Wikipedia:Today%27s_featured_list/February_2023


In [82]:
def alt_tag_percentage(bso) -> float:
    """Return the share of ``<img>`` tags in ``bso`` that have an ``alt`` attribute.

    The result is a fraction between 0 and 1 (not scaled to 0-100).
    A document without any ``<img>`` tag yields 1.

    Args:
        bso: Parsed document exposing ``find_all``; its tags must expose
            ``has_attr``.
    """
    images = bso.find_all('img')
    total = len(images)
    if total == 0:
        return 1
    # Only the presence of the attribute is checked, not its content.
    described = sum(1 for image in images if image.has_attr('alt'))
    return described / total

In [83]:
def aria_label_links_percentage(bso) -> float:
    """Return the share of ``<a>`` tags in ``bso`` that have an ``aria-label`` attribute.

    The result is a fraction between 0 and 1 (not scaled to 0-100).
    A document without any ``<a>`` tag yields 1.

    Args:
        bso: Parsed document exposing ``find_all``; its tags must expose
            ``has_attr``.
    """
    anchors = bso.find_all('a')
    total = len(anchors)
    if total == 0:
        return 1
    # Only the presence of the attribute is checked, not its content.
    labelled = sum(1 for anchor in anchors if anchor.has_attr('aria-label'))
    return labelled / total

In [84]:
# Compute both accessibility metrics for every crawled page.
# The rows are collected first and the frame is built once, instead of
# enlarging an empty DataFrame cell by cell through ``.loc``.
records = [
    {
        'url': url,
        'alt_text_percentage': alt_tag_percentage(page),
        'aria_label_percentage': aria_label_links_percentage(page),
    }
    for url, page in c.visited_pages.items()
]
# Explicit columns keep the schema stable even when nothing was crawled.
results = pd.DataFrame(
    records,
    columns=['url', 'alt_text_percentage', 'aria_label_percentage'],
)
results

Unnamed: 0,url,alt_text_percentage,aria_label_percentage
0,https://nymag.com,1.0,0.003058
1,https://nymag.com//nymag.com/intelligencer/202...,0.333333,0.0
2,https://nymag.com//www.vulture.com/2023/02/sel...,0.333333,0.0
3,https://nymag.com//thecut.com/culture,0.333333,0.0
4,https://nymag.com//www.vulture.com/article/the...,0.333333,0.0
5,https://nymag.com//www.curbed.com/2023/02/elon...,0.333333,0.0
6,https://nymag.com//www.grubstreet.com/article/...,0.333333,0.0
7,https://nymag.com//www.thecut.com/2023/02/harv...,0.333333,0.0
8,https://nymag.com//www.thecut.com/2023/02/kyli...,0.333333,0.0
9,https://nymag.com//www.vulture.com/article/jer...,0.333333,0.0


In [85]:
# Save the per-page metrics to a CSV file in the working directory
# (the row index is written as the first column).
results.to_csv(path_or_buf='accessibility.csv')