## Gather All Coursera Courses



In [32]:
import requests
import bs4 as bs
import json

import pandas as pd

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [33]:
# Get link to every course on Coursera assuming sitemap is complete and up-to-date.

all_course_sitemap_link = "https://www.coursera.org/sitemap~www~courses.xml"
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the notebook forever.
request = requests.get(all_course_sitemap_link, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty (or garbage) list of links.
request.raise_for_status()
soup = bs.BeautifulSoup(request.text, "xml")
# The text of every <loc> element in the sitemap is taken as a course link.
course_links = [link.text for link in soup.find_all("loc")]
df = pd.DataFrame(course_links, columns=["course_link"])

In [34]:
# Write the links to csv. Clean by hand if necessary.

# Remove repeated links, then order the rows by link before saving.
df = df.drop_duplicates().sort_values(by="course_link")
df.to_csv("course_links.csv", index=False)

In [35]:
def get_course_slug(course_link):
    """Gets the course slug from a course link.

    The slug is the last non-empty path segment of the link. The fragment
    (``#...``) and query string (``?...``) are removed before the path is
    split, so a ``/`` inside them is never mistaken for a path separator.
    Surrounding whitespace and trailing slashes are ignored.

    Parameters
    ----------
    course_link : str
        Link to a course on Coursera.

    Returns
    -------
    str
        Course slug.
    """
    # Cut the fragment first, then the query string, leaving only the path.
    without_fragment = course_link.strip().partition("#")[0]
    path = without_fragment.partition("?")[0]
    # A trailing "/" would otherwise produce an empty last segment.
    return path.rstrip("/").rsplit("/", 1)[-1]
    


In [36]:
def get_course_info(course_link):
    """Gets course information from a course link.

    The slug is extracted from the link with ``get_course_slug`` and looked
    up through the ``courses.v1`` API endpoint.

    Parameters
    ----------
    course_link : str
        Link to a course on Coursera.

    Returns
    -------
    dict
        Course information (the decoded JSON body of the API response).

    Raises
    ------
    Exception
        If the API responds with a status code other than 200.
    requests.exceptions.RequestException
        If the request itself fails, e.g. it times out.
    """

    course_slug = get_course_slug(course_link)
    course_info_link = f"https://www.coursera.org/api/courses.v1?q=slug&slug={course_slug}"
    # Send a browser-like user agent with the request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    }
    # requests applies no timeout by default; without one a single stalled
    # connection would block its worker thread indefinitely.
    response = requests.get(course_info_link, headers=headers, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Request to {course_info_link} was not successful.")
    return response.json()

In [37]:
def parse_course_name(course_info):
    """Extracts the name of the first course listed in the course information.

    Parameters
    ----------
    course_info : dict
        Course information; must hold a non-empty ``"elements"`` sequence.

    Returns
    -------
    str
        Value stored under ``"name"`` in the first element.
    """

    elements = course_info["elements"]
    first_course = elements[0]
    return first_course["name"]

In [38]:
def parse_course_id(course_info):
    """Extracts the id of the first course listed in the course information.

    Parameters
    ----------
    course_info : dict
        Course information; must hold a non-empty ``"elements"`` sequence.

    Returns
    -------
    str
        Value stored under ``"id"`` in the first element.
    """

    elements = course_info["elements"]
    first_course = elements[0]
    return first_course["id"]

In [39]:
def multithreaded_course_info(course_links, max_workers=10):
    """Gets course information from a list of course links.

    ``get_course_info`` is called for every link on a thread pool while a
    tqdm progress bar tracks completion. Results are returned in the same
    order as ``course_links``. If any call raises, the exception propagates
    out of this function and no results are returned.

    Parameters
    ----------
    course_links : list
        List of course links. Must support ``len()`` (used for the progress
        bar total).
    max_workers : int, optional
        Number of worker threads issuing requests concurrently. Defaults
        to 10.

    Returns
    -------
    list
        List of course information.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results lazily in input order; wrapping it in
        # tqdm advances the bar as each result is consumed.
        results = executor.map(get_course_info, course_links)
        course_info = list(tqdm(results, total=len(course_links)))

    return course_info

In [40]:
# Load the course links from "cleaned_course_links.csv" -- presumably the
# hand-cleaned copy of course_links.csv; confirm it exists before running.
df = pd.read_csv("cleaned_course_links.csv")
course_links = df["course_link"].tolist()
# Fetch the course information for every link on the thread pool.
course_info = multithreaded_course_info(course_links)


 14%|█▍        | 950/6568 [01:17<08:47, 10.65it/s]