## Coursera Course With Instructor Scraper

Sends requests to Coursera and parses out course information including instructor information.

In [None]:
import requests
import pandas as pd
import logging
import time
import os

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [None]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

output_log_file_name = f"{'coursera-instructors-with-courses'}-{time.strftime('%Y%m%d-%H%M%S')}.log"
file_path = os.path.join(os.getcwd(), "logs", output_log_file_name)
fh = logging.FileHandler(file_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

In [None]:
all_instructor_sitemap_link = "https://www.coursera.org/sitemap~www~instructors.xml"
request = requests.get(all_instructor_sitemap_link)
soup = BeautifulSoup(request.content, "xml")
instructor_links = soup.find_all("loc")
instructor_pages = [link.text for link in instructor_links]
df = pd.DataFrame(instructor_pages, columns=["instructor_page"])

In [None]:
def parse_instructor_page(url):
    """Parse instructor page and return a dictionary of instructor info.
    
    Parameters
    ----------
    url : str
        Instructor page url.
        
    Returns
    -------
    dict
        Dictionary of instructor info. 
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    request = requests.get(url, headers=headers)
    soup = BeautifulSoup(request.content, "html.parser")

    instructor_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > h2"
    instructor_subheading = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > p"
    instructor_external_link_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-119.cds-132 > ul > li > a"
    instructor_image_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-b-2.css-0.cds-64 > div.cds-63.grid-item.css-0.cds-65.cds-grid-item.cds-110.cds-117.cds-129 > img"
    instructor_courses_wrapper_selector = "#rendered-content > div > div > div:nth-child(2) > div.max-width-980.m-t-3.m-b-0.m-x-auto.css-8hlpsv > div.cds-63.p-t-2.m-t-2.border-top.css-0.cds-64 > div > div"


    instructor_name = soup.select_one(instructor_selector)
    instructor_name = instructor_name.text if instructor_name else None

    instructor_subheading = soup.select_one(instructor_subheading)
    instructor_subheading = instructor_subheading.text if instructor_subheading else None

    instructor_external_link_href = soup.select_one(instructor_external_link_selector)
    instructor_external_link_href = instructor_external_link_href.get("href") if instructor_external_link_href else None

    instructor_external_link_text = soup.select_one(instructor_external_link_selector)
    instructor_external_link_text = instructor_external_link_text.text if instructor_external_link_text else None

    instructor_image_src = soup.select_one(instructor_image_selector)
    instructor_image_src = instructor_image_src.get("src") if instructor_image_src else None

    course_link_selector = "#instructors-course-card"
    course_name_selector = "#instructors-course-card > div > div._10ytpvl5 > h4 > div > div"
    instructor_courses = []

    # This also includes "projects", which we treat the same as courses.
    for course in soup.select_one(instructor_courses_wrapper_selector).children:
        course_name = course.select_one(course_name_selector).text
        course_link = course.select_one(course_link_selector).get("href")
        course_slug = course_link.split("/")[-1]
        course_slug = course_slug.split("?")[0]
        instructor_courses.append({"course_name": course_name, "course_slug": course_slug})

    return {
        "instructor_name": instructor_name,
        "instructor_subheading": instructor_subheading,
        "instructor_external_link_href": instructor_external_link_href,
        "instructor_external_link_text": instructor_external_link_text,
        "instructor_image_src": instructor_image_src,
        "instructor_courses": instructor_courses,
        "url": url,
    }


In [None]:
def pivot_to_course(instructor_info):
    """Convert data to list where each element is a dictionary of info for each course.
    
    Parameters
    ----------
    instructor_info : dict
        Dictionary of instructor info.
        
    Returns
    -------
    list
        List of course info dictionaries.
    """
    instructor_courses = []
    for course in instructor_info["instructor_courses"]:
        instructor_courses.append(
            {
                "instructor_name": instructor_info["instructor_name"],
                "instructor_subheading": instructor_info["instructor_subheading"],
                "instructor_external_link_href": instructor_info["instructor_external_link_href"],
                "instructor_external_link_text": instructor_info["instructor_external_link_text"],
                "instructor_image_src": instructor_info["instructor_image_src"],
                "instructor_page_url": instructor_info["url"],
                # Again, this also includes "projects", which we treat the same as courses.
                "course_name": course["course_name"],
                "course_slug": course["course_slug"],
            }
        )
    return instructor_courses

In [None]:
def tqdm_threadpool_map(func, iterable, no_threads, iterable_length, *args):
    """A threadpool map function that shows a progress bar.

    Parameters
    ----------
    func : function
        The function to apply to each element of the iterable.

    iterable : iterable
        The iterable to apply the function to.

    no_threads : int
        The number of threads to use.

    iterable_length : int
        The length of the iterable.

    *args : list
        The list of arguments to pass to the function.

    Returns
    -------
    results : list
        The list of results from the function.
    """
    with ThreadPoolExecutor(max_workers=no_threads) as executor:
        results = list(tqdm(executor.map(func, iterable, *args), total=iterable_length))
    return results

In [None]:
def get_instructor_info(url):
    """Get instructor info from a url.

    Parameters
    ----------
    url : str
        The url to scrape.

    Returns
    -------
    instructor_info : list
        List of course info dictionaries.
    """
    try:
        instructor_info = parse_instructor_page(url)
        instructor_info = pivot_to_course(instructor_info)
        return instructor_info
    except Exception as e:
        logging.error(f"Error getting instructor info from {url}: {e}")
        return []


In [None]:
# # use multithreading to get all the insturctor info
# iterable = df["instructor_page"].tolist()
# no_threads = 10
# iterable_length = len(iterable)
# instructor_info = tqdm_threadpool_map(get_instructor_info, iterable, no_threads, iterable_length)

links = df["instructor_page"].tolist()
instructor_info = []
for link in tqdm(links):
    logging.info(f"Getting instructor info from {link}")
    info = get_instructor_info(link)
    instructor_info.extend(info)
  


In [None]:
# instructor_info = pd.read_csv("instructor-info.csv")
instructor_info = pd.DataFrame(instructor_info)

In [None]:
def get_course_info(course_slug):
    """Gets course information from a course link.
    
    Parameters
    ----------
    course_slug : str
        Link to a course on Coursera.
        
    Returns
    -------
    dict
        Course information.
    """
    
    course_info_link = f"https://www.coursera.org/api/courses.v1?q=slug&slug={course_slug}"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    }
    request = requests.get(course_info_link, headers=headers)
    if request.status_code != 200:
        raise Exception(f"Request to {course_info_link} was not successful.")
    return request.json()

In [None]:
def parse_course_name(course_info):
    """Parses course name from course information.
    
    Parameters
    ----------
    course_info : dict
        Course information.
        
    Returns
    -------
    str
        Course name.
    """
    
    return course_info["elements"][0]["name"]

In [None]:
def parse_course_id(course_info):
    """Parses course id from course information.
    
    Parameters
    ----------
    course_info : dict
        Course information.
        
    Returns
    -------
    str
        Course id.
    """
    
    return course_info["elements"][0]["id"]

In [None]:
def multithreaded_course_info(course_slugs):
    """Gets course information from a list of course links.

    Gets course information from a list of course links using multithreading. Displays progress bar.
    
    Parameters
    ----------
    course_slugs : list
        List of course slugs.
        
    Returns
    -------
    list
        List of course information.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        course_info = list(tqdm(executor.map(get_course_info, course_slugs), total=len(course_slugs)))
    
    return course_info

In [42]:
course_slugs = instructor_info["course_slug"].tolist()
course_info = multithreaded_course_info(course_slugs)

100%|██████████| 13944/13944 [15:37<00:00, 14.87it/s]


In [43]:
course_ids = [parse_course_id(info) for info in course_info]
course_names = [parse_course_name(info) for info in course_info]

In [44]:
instructor_info["course_id"] = course_ids
instructor_info["course_name"] = course_names

In [46]:
instructor_info.to_csv("instructor-content.csv", index=False)