## Scrape Instructor Images

Downloads images of instructors.

In [None]:
import pandas as pd
import requests
import os

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Directory (relative path) that downloaded instructor images are written into.
instructor_image_output_directory = "instructor-images"

In [None]:
def tqdm_threadpool_map(func, iterable, no_threads, iterable_length, *args):
    """Map a function over an iterable on a thread pool, with a progress bar.

    Parameters
    ----------
    func : function
        The callable applied to each element of the iterable.

    iterable : iterable
        The elements to process.

    no_threads : int
        The maximum number of worker threads in the pool.

    iterable_length : int
        The number of elements expected; used as the progress bar total.

    *args : iterables
        Extra iterables forwarded to ``executor.map`` after ``iterable``.

    Returns
    -------
    results : list
        The values produced by ``executor.map``, collected into a list.
    """
    with ThreadPoolExecutor(max_workers=no_threads) as pool:
        pending = pool.map(func, iterable, *args)
        progress = tqdm(pending, total=iterable_length, leave=False)
        # Drain the progress-wrapped iterator while the pool is still open.
        results = list(progress)
    return results

In [None]:
def download_image(url, output_path, timeout=30):
    """Download an image from a URL and save it to a file.

    The file is written only when the server answers with HTTP 200. Any
    other status code, or a request-level failure (connection error,
    timeout, malformed URL), is reported with a printed message and no file
    is written, so a single bad URL does not abort a batch of downloads.

    Parameters
    ----------
    url : str
        The URL to download the image from.

    output_path : str
        The path to save the image to.

    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get``. Without one, a
        stalled server would block the calling thread indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Treat request failures like a bad status: report and move on.
        print(f'Error downloading image from URL: {url} ({error})')
        return

    if response.status_code != 200:
        print(f'Error downloading image from URL: {url}')
        return

    with open(output_path, 'wb') as image:
        image.write(response.content)

    

In [None]:
def download_image_wrapper(args):
    """Call ``download_image`` with a single packed argument.

    Lets a one-argument map function drive ``download_image`` by unpacking
    the tuple into positional arguments.

    Parameters
    ----------
    args : tuple
        The positional arguments for ``download_image``, in order.
    """
    outcome = download_image(*args)
    return outcome

In [None]:
# Load the instructor table, discarding rows with no value in the image-source column.
data = pd.read_csv('instructor-content.csv').dropna(subset=['instructor_image_src'])


In [None]:
# Create the image output directory if it is missing. exist_ok=True avoids
# the check-then-create race and makes re-running this cell harmless.
os.makedirs(instructor_image_output_directory, exist_ok=True)

In [None]:
# Each instructor id is the final path segment of that row's page URL.
instructor_ids = [page_url.rsplit('/', 1)[-1] for page_url in data['instructor_page_url']]
# One .jpg target per row, named after the instructor id, inside the output directory.
output_paths = [
    os.path.join(instructor_image_output_directory, f'{image_id}.jpg')
    for image_id in instructor_ids
]

In [None]:
# Pair each row's image URL with that same row's output path. Calling
# .unique() on the URL column alone shortens it whenever two rows share an
# image URL, which shifts every later URL onto the wrong instructor's file.
# De-duplicating the (url, path) pairs keeps the alignment while still
# skipping repeated downloads of the same file.
iterable = list(dict.fromkeys(zip(data['instructor_image_src'], output_paths)))
# Use the real number of jobs so the progress bar total is accurate.
ret = tqdm_threadpool_map(download_image_wrapper, iterable, 16, len(iterable))


In [None]:
# Store each row's instructor id (the image file name without the .jpg
# extension) in a new column, then write the table to a new CSV.
output_csv_path = "instructor-content-with-image-names.csv"
data["instructor_image_name"] = instructor_ids
data.to_csv(output_csv_path, index=False)