## Coursera Course Scraper

Fetches the list of course URLs from Coursera's public sitemap, then queries Coursera's open public API endpoints to collect information about each course.

In [31]:
# Shell command: enable the `widgetsnbextension` notebook extension for Jupyter.
# NOTE(review): presumably needed for widget-based progress bars; the notebook
# imports plain `tqdm`, so confirm whether this cell is still required.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [32]:
import requests
import logging
import time
import os

import pandas as pd

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [33]:
# URL template for the course lookup endpoint; the `{}` placeholder is filled
# with a course slug via `str.format`.
course_endpoint_url = (
    "https://www.coursera.org/api/courses.v1"
    "?q=slug&slug={}"
)


In [34]:
# Logger for this notebook; records everything from DEBUG upwards.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# One timestamped log file per execution of this cell, stored under ./logs.
output_log_file_name = f"coursera-courses-{time.strftime('%Y%m%d-%H%M%S')}.log"
file_path = os.path.join(os.getcwd(), "logs", output_log_file_name)
# FileHandler does not create missing parent directories, so make sure the
# logs folder exists before opening the file.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Re-running this cell must not stack handlers on the same logger (every
# message would then be written to several files); detach and close any file
# handler left over from a previous execution.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

fh = logging.FileHandler(file_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

In [35]:
def get_all_courses(timeout=30):
    """Get all course links from coursera

    Downloads the public course sitemap and collects the text of every
    ``<loc>`` element found in it.

    Parameters
    ----------
    timeout : float or tuple, optional
        Timeout (in seconds) passed to ``requests.get``. Defaults to 30.

    Returns
    -------
    courses : list
        List of all course links. Empty when the sitemap could not be
        downloaded (network error, timeout or non-200 response).
    """
    sitemap_url = "https://www.coursera.org/sitemap~www~courses.xml"
    try:
        # Without an explicit timeout a stalled connection can block forever.
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report through the notebook's logger so the message reaches the
        # log file, and fail the same way as a bad status code does.
        logger.error("Failed to get sitemap: {}".format(exc))
        return []
    if response.status_code != 200:
        logger.error("Failed to get sitemap")
        return []
    soup = BeautifulSoup(response.text, "lxml")
    # Strip stray whitespace that may surround the URL inside <loc>.
    courses = [loc.text.strip() for loc in soup.find_all("loc")]
    return courses

In [36]:
def get_slug(url):
    """Get slug from course url

    The slug is the last non-empty segment of the URL path. Surrounding
    whitespace, trailing slashes, query strings and fragments are ignored,
    so ``https://www.coursera.org/learn/ml/?x=1`` yields ``"ml"``.

    Parameters
    ----------
    url : str
        Course url.

    Returns
    -------
    slug : str
        Course slug. Empty string when the url has no path segment.
    """
    # Imported locally so the function only relies on the standard library.
    from urllib.parse import urlsplit

    # Only the path matters; urlsplit separates it from query and fragment.
    path = urlsplit(url.strip()).path
    # Drop trailing slashes first, otherwise the last segment would be "".
    return path.rstrip("/").rsplit("/", 1)[-1]

In [37]:
def get_course_id(slug, timeout=30):
    """Get course id from course slug

    Queries the course endpoint for ``slug`` and reads the id of the first
    element in the JSON response. Every failure is logged and reported as
    ``None`` instead of raising, so a single bad slug cannot abort a bulk
    (thread-pool) run.

    Parameters
    ----------
    slug : str
        Course slug.

    timeout : float or tuple, optional
        Timeout (in seconds) passed to ``requests.get``. Defaults to 30.

    Returns
    -------
    course_id : str or None
        Course id, or None when the request fails, the response status is
        not 200, or the payload does not contain a course id.
    """
    try:
        # Without an explicit timeout a stalled connection can block a worker forever.
        response = requests.get(course_endpoint_url.format(slug), timeout=timeout)
    except requests.RequestException as exc:
        logger.error("Failed to get course id for slug: {} ({})".format(slug, exc))
        return None
    if response.status_code != 200:
        logger.error("Failed to get course id for slug: {}".format(slug))
        return None
    try:
        course_id = response.json()["elements"][0]["id"]
    except (ValueError, LookupError, TypeError) as exc:
        # ValueError: body is not valid JSON; LookupError: "elements" missing
        # or empty, or no "id" key; TypeError: payload has an unexpected shape.
        logger.error(
            "Failed to get course id for slug: {} (unexpected payload: {!r})".format(slug, exc)
        )
        return None
    return course_id

In [38]:
def tqdm_threadpool_map(func, iterable, no_threads, iterable_length, *args):
    """Map a function over an iterable on a thread pool, with a progress bar.

    Parameters
    ----------
    func : function
        Callable applied to the elements.

    iterable : iterable
        Source of the first argument for each call of ``func``.

    no_threads : int
        Maximum number of worker threads in the pool.

    iterable_length : int
        Number of items expected; used as the total of the progress bar.

    *args : iterables
        Extra iterables forwarded to ``executor.map``; their items are passed
        to ``func`` as additional positional arguments alongside the items of
        ``iterable``.

    Returns
    -------
    results : list
        Values returned by ``func``, in the order of the input.
    """
    with ThreadPoolExecutor(max_workers=no_threads) as pool:
        # executor.map yields results lazily in input order; wrapping it in
        # tqdm advances the bar as each result is consumed.
        pending = pool.map(func, iterable, *args)
        progress_bar = tqdm(pending, total=iterable_length)
        collected = [outcome for outcome in progress_bar]
    return collected

In [39]:
# Fetch every course link listed in the sitemap; `get_all_courses` returns an
# empty list when the download fails, in which case the count below is 0.
course_url = get_all_courses()
print("Total courses: ", len(course_url))

Total courses:  6968


In [40]:
# Derive one slug per course link, keeping the order of `course_url`.
course_slug = list(map(get_slug, course_url))

In [41]:
# Resolve every slug to its course id using a pool of 4 worker threads;
# the result is a list aligned with `course_slug`.
course_id = tqdm_threadpool_map(
    get_course_id,
    course_slug,
    4,                 # number of worker threads
    len(course_slug),  # total for the progress bar
)

  5%|▌         | 363/6968 [00:23<06:55, 15.91it/s] 