In [2]:
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_images(url):
    """
    Returns all image URLs on a single `url`
    """
    soup = bs(requests.get(url).content, "html.parser")
    
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src")
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)
        
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        
        # finally, if the url is valid
        if is_valid(img_url):
            urls.append(img_url)
    return urls

def download(url, pathname):
    """
    Downloads a file given an URL and puts it in the folder `pathname`
    """
    # if path doesn't exist, make that path dir
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    # download the body of response by chunk, not immediately
    response = requests.get(url, stream=True)
    # get the total file size
    file_size = int(response.headers.get("Content-Length", 0))
    # get the file name
    filename = os.path.join(pathname, url.split("/")[-1])
    # progress bar, changing the unit to bytes instead of iteration (default by tqdm)
    progress = tqdm(response.iter_content(1024), f"Downloading {filename}", total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, "wb") as f:
        for data in progress.iterable:
            # write data read to the file
            f.write(data)
            # update the progress bar manually
            progress.update(len(data))

"""
 "main" is the function that is actually scraping and downloading the images off a specified website
 input: a url and a directory path
 output/result: images are downloaded into specific folder
 everything above is used in main
"""           
def main(url, path):
    # get all images
    imgs = get_all_images(url)
    for img in imgs:
        # for each image, download it
        download(img, path)

"""
example of how to run main to download images      
main("https://www.csulb.edu/college-of-education", "C:\\Users\\hanse\\Documents\\Github\\Algorithmic-Bias\\Web-Scraping\\image-folder")
"""

'\nexample of how to run main to download images      \nmain("https://www.csulb.edu/college-of-education", "C:\\Users\\hanse\\Documents\\Github\\Algorithmic-Bias\\Web-Scraping\\image-folder")\n'

In [3]:
from bs4 import BeautifulSoup
import requests

def get_embedded_url(url):
    """
    extracts all links embedded in 'url' that contain "college-of-education"
    stores the links in the list 'observed_links'
    """
    page = requests.get(url)    
    data = page.text
    soup = BeautifulSoup(data)

    # this is creating the list of urls that are embedded in college of ed homepage that we want to look into
    observed_links = []
    for link in soup.find_all('a'):
        if 'college-of-education' in link.get('href'):
            if 'https://www.csulb.edu' in link.get('href'):
                observed_links.append(link.get('href'))
            elif link.get('href')[:1] == '/':
                https_concat = 'https://www.csulb.edu'+(link.get('href'))
                observed_links.append(https_concat)
    observed_links = list(set(observed_links))
    return observed_links

In [4]:
observed_links = get_embedded_url("https://www.csulb.edu/college-of-education")

In [5]:
# display the list of links generated above, what does it "physically" look like?
display(observed_links)

# display the number of links that were extracted using get_embedded_url
display(len(observed_links))

# count the number of "valid" or executable links in the observed
count = 0
for i in observed_links:
    if is_valid(i):
        count += 1
count

['https://www.csulb.edu/college-of-education/teacher-education',
 'https://www.csulb.edu/college-of-education/assessment-office',
 'https://www.csulb.edu/college-of-education/become-a-counselor-or-psychologist',
 'https://www.csulb.edu/college-of-education/education-specialist-credential-program-escp',
 'https://www.csulb.edu/college-of-education/community-clinic-for-counseling-and-educational-services',
 'https://www.csulb.edu/college-of-education/liberal-studies',
 'https://www.csulb.edu/college-of-education/dual-language-development',
 'https://www.csulb.edu/college-of-education/overview-of-scholarships-financial-aid',
 'https://www.csulb.edu/college-of-education/multiple-subject-credential-program-mscp',
 'https://www.csulb.edu/college-of-education/faculty-and-staff-celebrated-at-convocation',
 'https://www.csulb.edu/college-of-education/equity-education-and-social-justice',
 'https://www.csulb.edu/college-of-education/counseling-psychology',
 'https://www.csulb.edu/college-of-educ

80

80

In [13]:
# for every url in "observed_links" download the images contained on that page and store it in "scraped_images" folder
for i in range(len(observed_links)):
    main(observed_links[i], os.path.join(os.getcwd(), "scraped_images"))

Extracting images: 100%|██████████| 11/11 [00:00<?, ?it/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\lb.svg: 100%|██████████| 3.94k/3.94k [00:00<00:00, 2.02MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\logo-footer.svg: 100%|██████████| 7.84k/7.84k [00:00<00:00, 5.35MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\mobile-footer-logo.png: 100%|██████████| 10.7k/10.7k [00:00<00:00, 11.0MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\wordmark-black.png: 100%|██████████| 14.4k/14.4k [00:00<00:00, 7.41MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\wordmark-black.png: 100%|██████████| 14.4k/14.4k [00:00<00:00, 9.80MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_images\banner_ced_teacer-alc-betina-class-11-b.jpg: 100%|██████████| 133k/133k [00:00<00:00, 2.50MB/s]
Downloading e:\github\algorithmic_bias\env\web scraping\scraped_i

In [14]:
# count the files in the folder that was the target of my downloads
dir_path = "scraped_images"
count = 0
# Iterate directory
for path in os.listdir(dir_path):
    # check if current path is a file
    if os.path.isfile(os.path.join(dir_path, path)):
        count += 1
print('File count:', count)

File count: 111


In [15]:
# observed_links_lvl2 is the nested list described above
observed_links_lvl2 = [None]*len(observed_links)
for i in range(len(observed_links)):
    observed_links_lvl2[i] = get_embedded_url(observed_links[i])

In [16]:
# count the number of links in the nested
for i in range(len(observed_links_lvl2)):
    print(len(observed_links_lvl2[i]))
    
display(observed_links_lvl2[:2])

78
67
65
83
78
80
79
68
85
65
80
79
82
80
94
70
65
68
65
77
66
65
73
75
72
65
70
81
67
79
65
97
73
67
73
72
66
91
79
65
79
71
66
79
67
69
69
77
71
67
79
77
86
98
78
82
80
77
72
68
72
85
76
65
66
71
69
68
73
67
73
79
79
65
79
84
65
69
65
188


[['https://www.csulb.edu/college-of-education/teacher-education',
  'https://www.csulb.edu/college-of-education/assessment-office',
  'https://www.csulb.edu/college-of-education/education-specialist-credential-program-escp',
  'https://www.csulb.edu/college-of-education/teacher-education/programs-to-become-a-teacher',
  'https://www.csulb.edu/college-of-education/community-clinic-for-counseling-and-educational-services',
  'https://www.csulb.edu/college-of-education/liberal-studies',
  'https://www.csulb.edu/college-of-education/dual-language-development',
  'https://www.csulb.edu/college-of-education/overview-of-scholarships-financial-aid',
  'https://www.csulb.edu/college-of-education/multiple-subject-credential-program-mscp',
  'https://www.csulb.edu/college-of-education/equity-education-and-social-justice',
  'https://www.csulb.edu/college-of-education/counseling-psychology',
  'https://www.csulb.edu/college-of-education/school-psychology',
  'https://www.csulb.edu/college-of-educa

In [17]:
# flatten observed_links2
from itertools import chain
lvl2_flat = list(chain.from_iterable(observed_links_lvl2))

# keep only unique urls
lvl2_flat_unique = list(set(lvl2_flat))

In [18]:
# inspect size of lists created above
display(len(lvl2_flat))
display(len(lvl2_flat_unique))

# count the number of "valid" or executable links in the observed
count = 0
for i in lvl2_flat_unique:
    if is_valid(i):
        count += 1
count

6039

747

747

In [19]:
len(list(set([observed_links, lvl2_flat_unique])))

TypeError: unhashable type: 'list'

In [20]:
from collections import Counter
to_be_downloaded = list((Counter(lvl2_flat_unique)-Counter(observed_links)).elements())
display(len(to_be_downloaded))

667