## Coursera Instructor Scraper

Sends requests to Coursera and parses out instructor information.

In [128]:
import requests
import pandas as pd
import logging
import time
import os

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [129]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

output_log_file_name = f"{'coursera-instructors'}-{time.strftime('%Y%m%d-%H%M%S')}.log"
file_path = os.path.join(os.getcwd(), "logs", output_log_file_name)
fh = logging.FileHandler(file_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

In [130]:
all_instructor_sitemap_link = "https://www.coursera.org/sitemap~www~instructors.xml"
request = requests.get(all_instructor_sitemap_link)
soup = BeautifulSoup(request.content, "xml")
instructor_links = soup.find_all("loc")
instructor_pages = [link.text for link in instructor_links]
df = pd.DataFrame(instructor_pages, columns=["instructor_page"])


In [131]:
def parse_instructor_page(url):
    """Parse instructor page and return a dictionary of instructor info.
    
    Parameters
    ----------
    url : str
        Instructor page url.
        
    Returns
    -------
    dict
        Dictionary of instructor info. 
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.content, "html.parser")

    instructor_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > h2"
    instructor_subheading = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > p"
    instructor_external_link_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > ul > li > a"
    instructor_image_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-117.cds-129 > img"
    instructor_courses_wrapper_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-t-2.m-t-2.border-top.css-0.cds-64 > div > div"


    instructor_name = soup.select_one(instructor_selector)
    instructor_name = instructor_name.text if instructor_name else None

    instructor_subheading = soup.select_one(instructor_subheading)
    instructor_subheading = instructor_subheading.text if instructor_subheading else None

    instructor_external_link_href = soup.select_one(instructor_external_link_selector)
    instructor_external_link_href = instructor_external_link_href.get("href") if instructor_external_link_href else None

    instructor_external_link_text = soup.select_one(instructor_external_link_selector)
    instructor_external_link_text = instructor_external_link_text.text if instructor_external_link_text else None

    instructor_image_src = soup.select_one(instructor_image_selector)
    instructor_image_src = instructor_image_src.get("src") if instructor_image_src else None

    course_link_selector = "#instructors-course-card"
    course_name_selector = "#instructors-course-card > div > div._10ytpvl5 > h4 > div > div"
    instructor_courses = []
    for course in soup.select_one(instructor_courses_wrapper_selector).children:
        course_name = course.select_one(course_name_selector).text
        course_link = course.select_one(course_link_selector).get("href")
        course_slug = course_link.split("/")[-1]
        course_slug = course_slug.split("?")[0]
        instructor_courses.append({"course_name": course_name, "course_slug": course_slug})

    return {
        "instructor_name": instructor_name,
        "instructor_subheading": instructor_subheading,
        "instructor_external_link_href": instructor_external_link_href,
        "instructor_external_link_text": instructor_external_link_text,
        "instructor_image_src": instructor_image_src,
        "instructor_courses": instructor_courses,
        "url": url,
    }


In [132]:
def pivot_to_course(instructor_info):
    """Convert data to list where each element is a dictionary of info for each course.
    
    Parameters
    ----------
    instructor_info : dict
        Dictionary of instructor info.
        
    Returns
    -------
    list
        List of course info dictionaries.
    """
    instructor_courses = []
    for course in instructor_info["instructor_courses"]:
        instructor_courses.append(
            {
                "instructor_name": instructor_info["instructor_name"],
                "instructor_subheading": instructor_info["instructor_subheading"],
                "instructor_external_link_href": instructor_info["instructor_external_link_href"],
                "instructor_external_link_text": instructor_info["instructor_external_link_text"],
                "instructor_image_src": instructor_info["instructor_image_src"],
                "instructor_page_url": instructor_info["url"],
                "course_name": course["course_name"],
                "course_slug": course["course_slug"],
            }
        )
    return instructor_courses

In [133]:
def tqdm_threadpool_map(func, iterable, no_threads, iterable_length, *args):
    """A threadpool map function that shows a progress bar.

    Parameters
    ----------
    func : function
        The function to apply to each element of the iterable.

    iterable : iterable
        The iterable to apply the function to.

    no_threads : int
        The number of threads to use.

    iterable_length : int
        The length of the iterable.

    *args : list
        The list of arguments to pass to the function.

    Returns
    -------
    results : list
        The list of results from the function.
    """
    with ThreadPoolExecutor(max_workers=no_threads) as executor:
        results = list(tqdm(executor.map(func, iterable, *args), total=iterable_length))
    return results

In [134]:
def get_instructor_info(url):
    """Get instructor info from a url.

    Parameters
    ----------
    url : str
        The url to scrape.

    Returns
    -------
    instructor_info : list
        List of course info dictionaries.
    """
    try:
        instructor_info = parse_instructor_page(url)
        instructor_info = pivot_to_course(instructor_info)
        return instructor_info
    except Exception as e:
        logging.error(f"Error getting instructor info from {url}: {e}")
        return []


In [135]:
# # use multithreading to get all the insturctor info
# iterable = df["instructor_page"].tolist()
# no_threads = 10
# iterable_length = len(iterable)
# instructor_info = tqdm_threadpool_map(get_instructor_info, iterable, no_threads, iterable_length)

links = df["instructor_page"].tolist()
instructor_info = []
for link in tqdm(links):
    logging.info(f"Getting instructor info from {link}")
    info = get_instructor_info(link)
    instructor_info.extend(info)


    


  6%|▌         | 309/5466 [02:32<38:25,  2.24it/s]  ERROR:root:Error getting instructor info from https://www.coursera.org/instructor/~28762407: 'NoneType' object has no attribute 'children'
  9%|▊         | 466/5466 [03:53<36:31,  2.28it/s]  ERROR:root:Error getting instructor info from https://www.coursera.org/instructor/38926: 'NoneType' object has no attribute 'children'
 15%|█▍        | 793/5466 [06:20<33:40,  2.31it/s]  ERROR:root:Error getting instructor info from https://www.coursera.org/instructor/unity3d: 'NoneType' object has no attribute 'children'
 20%|█▉        | 1089/5466 [08:58<43:26,  1.68it/s]  ERROR:root:Error getting instructor info from https://www.coursera.org/instructor/~1702251: 'NoneType' object has no attribute 'children'
 48%|████▊     | 2637/5466 [23:02<25:03,  1.88it/s]  ERROR:root:Error getting instructor info from https://www.coursera.org/instructor/podolskii: 'NoneType' object has no attribute 'children'
100%|██████████| 5466/5466 [48:19<00:00,  1.89it/s

In [137]:
instructor_info = pd.DataFrame(instructor_info)
instructor_info.to_csv("instructor_info.csv", index=False)