# Coursera Web Scraping

##  Import libraries

**get instructor and number of hours**

In [1]:
from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint
import pandas as pd
import os
import json
import tqdm
import random
random.seed(42)

## Bring in Url List 

I obtained these URLs with the **Web Scraper** extension for Google Chrome, because I couldn't extract them from coursera.org via the `href` attribute. The export also contains some other data, but I only use the `link-href` column from this dataframe.

In [2]:
urls = pd.read_csv('coursera.csv', delimiter = ';')

In [3]:
urls.head(3)

Unnamed: 0,web-scraper-order,web-scraper-start-url,link,link-href
0,1639224705-494,https://www.coursera.org/search?query=free&pag...,Matrix Algebra for EngineersThe Hong Kong Univ...,https://www.coursera.org/learn/matrix-algebra-...
1,1639224662-348,https://www.coursera.org/search?query=free&pag...,Budgeting essentials and developmentFundação I...,https://www.coursera.org/learn/budgeting-essen...
2,1639224613-177,https://www.coursera.org/search?query=free&pag...,1.- El Cálculo - Modelo LinealTecnológico de M...,https://www.coursera.org/learn/calculo-1


In [4]:
url_list = list(urls['link-href'])   # to make the column iterable

In [5]:
type(url_list)  

list

In [6]:
len(url_list)

1515

## Web Scrape data from Coursera

## Parse the Raw Data

In [7]:
ROOT = './raw_data_txt'

In [8]:
# Parse every saved page under ROOT into a BeautifulSoup tree.
# soups[i] and paths[i] stay aligned; the order is whatever os.listdir returns.
soups, paths = [], []
for ix, path in enumerate(tqdm.tqdm(os.listdir(ROOT))):
    # The context manager closes each file as soon as it has been read; the
    # previous open(...).read() left every handle for the garbage collector.
    with open(os.path.join(ROOT, path), 'rb') as raw_file:
        raw_soup = raw_file.read()
    soup = BeautifulSoup(raw_soup, 'html.parser')
    soups.append(soup)
    paths.append(path)

100%|███████████████████████████████████████| 1515/1515 [01:32<00:00, 16.44it/s]


In [36]:
# NOTE(review): get_abouts is defined further down in this notebook (cell In [11]),
# so on a fresh "Restart Kernel -> Run All" this call raises NameError.
# Move this cell below the function definitions.
get_abouts(soups[1281])

## Functions

In [9]:
def get_course_name(soup):
    """Return the course title text of a parsed course page, or None.

    The banner ``<h1>`` is tried first. Only when it is absent is the long
    positional CSS path to an ``<h1>`` evaluated as a fallback.
    """
    banner_hits = soup.find_all('h1', {"class": "banner-title banner-title-without--subtitle m-b-0"})
    if banner_hits != []:
        return banner_hits[0].text

    fallback_selector = '#rendered-content > div > div > div.rc-PdpPage > div:nth-child(4) > div:nth-child(1) > div > div > div > div > div > div._157odstq._vz5kef > div > div:nth-child(1) > div._kfriz5q > h1'
    fallback_hits = soup.select(fallback_selector)
    if fallback_hits != []:
        return fallback_hits[0].text

    return None


In [10]:
def get_syllabus(soup):
    """Collect syllabus text fragments from a parsed course page.

    Returns a list built in two passes:
      1. the text of every ``<p>`` matched by
         ``div._wmgtrl9 > div > div > div > p``;
      2. for every ``div`` with class ``_wmgtrl9 m-y-2``, its first child:
         the child's ``.text`` when it has one, otherwise the child itself.

    The list is empty when nothing matches.
    """
    desc = [paragraph.text for paragraph in soup.select("div._wmgtrl9 > div > div > div > p")]

    for block in soup.find_all("div", {"class": "_wmgtrl9 m-y-2"}):
        if not block.contents:
            # An empty div has no first child. Previously this raised
            # IndexError from inside the except handler and aborted the run.
            continue
        first_child = block.contents[0]
        try:
            desc.append(first_child.text)
        except AttributeError:
            # The child has no `.text` attribute: keep the child itself.
            # Only this case is caught, so unrelated errors are not hidden.
            desc.append(first_child)
    return desc

In [11]:
def get_abouts(soup):
    """Return the text of the first ``div.content-inner`` on the page.

    Returns None when the page has no such element. Any other failure
    propagates, instead of being hidden by a bare ``except`` as before.
    """
    sections = soup.find_all("div", {"class": "content-inner"})
    return sections[0].text if sections else None

In [12]:
def get_no_of_ratings(soup):
    """Return the text of the first ``div._wmgtrl9`` if it looks like a rating count.

    The text is rejected (None is returned) when it is longer than 15
    characters or is exactly ``'Week'``. None is also returned when the page
    has no such div.
    """
    candidates = soup.find_all('div', {"class": "_wmgtrl9"})
    if not candidates:
        return None

    label = candidates[0].text
    acceptable = len(label) <= 15 and label != 'Week'
    return label if acceptable else None

In [37]:
get_no_of_ratings(soups[1281])

In [14]:
def get_rating(soup):
    """Return the rating text from a parsed course page, or None.

    Two CSS paths are tried in order; the second is evaluated only when the
    first matches nothing. The text of the first matching element wins.
    """
    selectors = (
        '#main > div._iul6hq > div > div > div > ul > li > a > div > span',
        '#main > div._iul6hq > div > div > div > div > span',
    )
    for selector in selectors:
        hits = soup.select(selector)
        if hits != []:
            return hits[0].text
    return None

In [15]:
def get_category(soup):
    """Return the text of the first link at ``div._exc94g9 > div:nth-child(2) > a``, or None."""
    links = soup.select('div._exc94g9 > div:nth-child(2) > a')
    if links == []:
        return None
    return links[0].text
    
    
def get_subcategory(soup):
    """Return the text of the first link at ``div._exc94g9 > div:nth-child(3) > a``.

    Returns None when nothing matches. Unlike the earlier bare ``except``,
    unrelated errors are no longer swallowed.
    """
    links = soup.select('div._exc94g9 > div:nth-child(3) > a')
    return links[0].text if links else None

In [16]:
def get_level(soup):
    """Return the course difficulty label, or None if none is found.

    The first five items of the "about" side panel are probed in order; the
    first whose text is one of the four known level labels is returned.
    """
    known_levels = ['Beginner Level', 'Intermediate Level', 'Advanced Level', 'Mixed Level']
    selector_template = '#main > div > div.rc-XdpSection.cdp-about > div > div > div > div._xliqh9g > div > div > div:nth-child({}) > div._1tu07i3a > div._16ni8zai.m-b-0'

    for position in range(1, 6):
        hits = soup.select(selector_template.format(position))
        if not hits:
            continue
        label = hits[0].text
        if label in known_levels:
            return label
    return None
            

In [17]:
def get_language(soup):
    """Return the text of the last item in the "about" side panel.

    The notebook treats this value as the course language. Returns None when
    the selector matches nothing. Unrelated errors now propagate instead of
    being swallowed by a bare ``except``.
    """
    hits = soup.select('#main > div > div.rc-XdpSection.cdp-about > div > div > div > div._xliqh9g > div > div > div:last-child > div._1tu07i3a > div._16ni8zai.m-b-0')
    return hits[0].text if hits else None

In [18]:
def get_no_of_students(soup):
    """Return the text of the first ``div._1fpiay2`` (e.g. "78,489 already enrolled").

    Returns None when the page has no such element. Unrelated errors now
    propagate instead of being swallowed by a bare ``except``.
    """
    hits = soup.find_all('div', {"class": "_1fpiay2"})
    return hits[0].text if hits else None

In [51]:
def get_instructor(soup):
    """Return the text of the first instructor ``<h3>`` in the instructors section.

    Returns None when the selector matches nothing. Unrelated errors now
    propagate instead of being swallowed by a bare ``except``.
    """
    hits = soup.select('''#main > div > div.rc-XdpSection.cdp-instructors > 
                                div > div > div:nth-child(2) > div > div > div > a > 
                               div > div._wtdnuob > h3''')
    return hits[0].text if hits else None

In [78]:
# One result list per scraped field; index i of every list refers to soups[i].
course_name, instructors, levels = [], [], []
categories, subcategories, noofstudents = [], [], []
ratings, noofrating, abouts = [], [], []
syllabuss, languages = [], []

# (target list, extractor) pairs, in the order the extractors are applied to each page.
field_extractors = [
    (course_name, get_course_name),
    (instructors, get_instructor),
    (syllabuss, get_syllabus),
    (abouts, get_abouts),
    (noofrating, get_no_of_ratings),
    (ratings, get_rating),
    (categories, get_category),
    (subcategories, get_subcategory),
    (levels, get_level),
    (languages, get_language),
    (noofstudents, get_no_of_students),
]

for ix, soup in enumerate(tqdm.tqdm(soups)):
    for target, extract in field_extractors:
        target.append(extract(soup))

100%|███████████████████████████████████████| 1515/1515 [03:26<00:00,  7.35it/s]


In [79]:
len(levels)

1515

In [80]:
len(instructors)

1515

In [81]:
len(abouts)

1515

In [82]:
len(categories)

1515

In [83]:
len(ratings)

1515

In [84]:
len(noofrating)

1515

In [85]:
len(syllabuss)

1515

In [86]:
len(course_name)

1515

In [87]:
len(languages)

1515

In [88]:
len(noofstudents)

1515

In [89]:
# Assemble the per-field lists into one frame, one row per scraped page.
scraped_columns = {
    'name': course_name,
    'instructor': instructors,
    'level': levels,
    'category': categories,
    'subcategory': subcategories,
    'no_of_students': noofstudents,
    'rating': ratings,
    'no_of_rating': noofrating,
    'about': abouts,
    'syllabus': syllabuss,
    'language': languages,
}
df = pd.DataFrame(scraped_columns)


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515 entries, 0 to 1514
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            1513 non-null   object
 1   instructor      1513 non-null   object
 2   level           981 non-null    object
 3   category        1513 non-null   object
 4   subcategory     1513 non-null   object
 5   no_of_students  1318 non-null   object
 6   rating          1364 non-null   object
 7   no_of_rating    1364 non-null   object
 8   about           1513 non-null   object
 9   syllabus        1515 non-null   object
 10  language        1513 non-null   object
dtypes: object(11)
memory usage: 130.3+ KB


In [91]:
df['level'].value_counts()   # empty cells will be filled by 'Mixed Level'

Beginner Level        691
Intermediate Level    261
Advanced Level         29
Name: level, dtype: int64

## Take only Courses in English

In [92]:
df_english = pd.DataFrame(df[df['language']=='English'], columns = df.columns)

In [93]:
df_english.reset_index(drop=True, inplace=True)

In [94]:
df_english.head()

Unnamed: 0,name,instructor,level,category,subcategory,no_of_students,rating,no_of_rating,about,syllabus,language
0,Meditation: A way to achieve your goals in you...,Duck-Joo Lee,Beginner Level,Arts and Humanities,Philosophy,"78,489 already enrolled",4.6stars,843 ratings,Do we truly think that we have lived for ourse...,[Self- reflection is the methodology of medita...,English
1,Introduction to the Arctic: Climate,"Paul Myers, Ph.D",,Physical Science and Engineering,Environmental Science and Sustainability,"13,439 already enrolled",4.6stars,420 ratings,"The University of Alberta, the University of T...",[This course is about the remarkable northern ...,English
2,Data Science in Stratified Healthcare and Prec...,Dr Areti Manataki,Intermediate Level,Data Science,Data Analysis,"16,421 already enrolled",4.6stars,241 ratings,An increasing volume of data is becoming avail...,[Join us this week to find out how the course ...,English
3,How to Get Skilled: Introduction to Individual...,Dr. Valeri Chukhlomin,Intermediate Level,Personal Development,Personal Development,"50,038 already enrolled",3.7stars,214 ratings,Do you want to gain a competitive edge on the ...,[The goal of the Module is to define individua...,English
4,Circular Economy - Sustainable Materials Manag...,Philip Peck,Beginner Level,Physical Science and Engineering,Environmental Science and Sustainability,"37,682 already enrolled",4.8stars,"1,136 ratings",This course looks at where important materials...,[This module explores sourcing of materials fr...,English


In [95]:
df_english.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            1004 non-null   object
 1   instructor      1004 non-null   object
 2   level           669 non-null    object
 3   category        1004 non-null   object
 4   subcategory     1004 non-null   object
 5   no_of_students  922 non-null    object
 6   rating          942 non-null    object
 7   no_of_rating    942 non-null    object
 8   about           1004 non-null   object
 9   syllabus        1004 non-null   object
 10  language        1004 non-null   object
dtypes: object(11)
memory usage: 86.4+ KB


In [96]:
df_english[df_english['no_of_students'].isnull()]

Unnamed: 0,name,instructor,level,category,subcategory,no_of_students,rating,no_of_rating,about,syllabus,language
8,Preparing for and Passing Technical Certificat...,Rachel Strong,Beginner Level,Personal Development,Personal Development,,4.8stars,21 ratings,Technology provides you with great opportuniti...,[Technology provides you with great opportunit...,English
16,Community Awareness: Police Brutality in the U...,Teach-Out Experts,Beginner Level,Social Sciences,Governance and Society,,,,"The tragic deaths of George Floyd, Breonna Tay...",[An introductory history to the roots of moder...,English
26,Patient Perspectives on Medications: Qualitati...,Susanne Kaae,Intermediate Level,Health,Patient Care,,,,This course teaches you how to explore the pat...,[This module starts with a presentation of the...,English
38,Formation COVID-19 pour personnels de santé,Matthew Strehlow,,Health,Patient Care,,4.5stars,11 ratings,La COVID-19 se propage rapidement à travers le...,"[Dans ce module, vous allez apprendre les sign...",English
64,Why Iowa? A Primer on Primaries and Caucuses T...,Faculty Experts,Beginner Level,Social Sciences,Governance and Society,,,,The United States presidential election proces...,"[, What is a Teach-Out?, Welcome to the Why Io...",English
...,...,...,...,...,...,...,...,...,...,...,...
953,Exploring Basic Income in a Changing Economy T...,Luke Shaefer,Beginner Level,Business,Finance,,,,The United States social safety net is a compl...,"[, , What is a Teach-Out?, Introduction: Luke ...",English
973,1C:Enterprise Junior Developer Course,Anton Shviykovskiy,Beginner Level,Computer Science,Software Development,,,,Completing this first level will enable you to...,[Acquaintance with the basic capabilities of t...,English
988,LGBTQ Pride: From Origins to Evolution Teach-Out,Spectrum Center,Beginner Level,Arts and Humanities,Philosophy,,4.7stars,46 ratings,"At its origin in the late 60s and early 70s, P...","[, What is a Teach-Out?, Welcome to the LGBTQ ...",English
992,Genetic Epidemiology Foundations,Todd Edwards,Intermediate Level,Health,Health Informatics,,,,This course is presented by the University of ...,"[Taught by Dr. Nancy Cox, Vanderbilt Universit...",English


In [97]:
df_english['syllabus'][67]

['Welcome to week 1! In lesson one, you will learn to recognize the six categories of engineering materials through examples from everyday life, and we’ll discuss how the structure of those materials leads to their properties. Lesson two explores how point defects explain solid state diffusion. We will illustrate crystallography – the atomic-scale arrangement of atoms that we can see with the electron microscope. We will also describe the Arrhenius Relationship, and apply it to the number of vacancies in a crystal. We’ll finish by discussing how point defects facilitate solid state diffusion, and applying the Arrhenius Relationship to solid state diffusion.',
 'Welcome to week 2! In lesson three we will discover how dislocations at the atomic-level structure of materials explain plastic (permanent) deformation. You will learn to define a linear defect and see how materials deform through dislocation motion. Lesson four compares stress versus strain, and introduces the “Big Four” mechan

In [98]:
# Sanity check: print any non-empty enrolment value that lacks the word 'enrolled'.
for enrolment in df_english['no_of_students']:
    if enrolment and 'enrolled' not in enrolment:
        print(enrolment)

In [99]:
# Sanity check: print any non-empty rating value that lacks the word 'stars'.
for rating_text in df_english['rating']:
    if rating_text and 'stars' not in rating_text:
        print(rating_text)

In [100]:
# Sanity check: print any non-empty rating-count value that lacks the word 'rating'.
for count_text in df_english['no_of_rating']:
    if count_text and 'rating' not in count_text:
        print(count_text)

In [103]:
df_english.to_csv('courses_in_english.csv')

In [104]:
def get_skills(soup):
    """Collect the skill tags listed in the "Skills" box of a course page.

    Returns three parallel lists ``(name, subcategory, skills)`` with one
    entry per skill found: the course name, the course subcategory and the
    skill text. All three lists are empty when the page lists no skills.
    """
    selector_template = '#main > div > div.rc-XdpSection.cdp-about > div > div > div > div._1b7vhsnq.m-t-2 > div.Skills.m-y-2.p-x-2.p-t-1.p-b-2.border-a.css-1rj0z6b > ul > li:nth-child({}) > span > span > span._ontdeqt'

    skills = []
    # CSS :nth-child() is 1-based, so the position 0 probed by the old
    # range(0, 10) could never match. Positions 1-9 give the same results.
    # NOTE(review): a 10th list item is still not examined - confirm whether
    # pages can list more than nine skills.
    for position in range(1, 10):
        # Evaluate each selector once instead of twice (test, then fetch).
        hits = soup.select(selector_template.format(position))
        if hits:
            skills.append(hits[0].text)

    if not skills:
        return [], [], []

    # Name and subcategory are the same for every skill of a page, so they
    # are scraped once rather than once per skill.
    course = get_course_name(soup)
    course_subcategory = get_subcategory(soup)
    return [course] * len(skills), [course_subcategory] * len(skills), skills
        

In [105]:
# Flatten the per-course skill lists into three parallel lists (one row per skill).
name_list = []
subcategory_list = []
skill_list = []
for soup in tqdm.tqdm(soups):
    # Call get_skills once per page and unpack it. The previous version called
    # it three times per page, tripling the scraping work for the same result.
    skill_names, skill_subcategories, skill_texts = get_skills(soup)
    name_list += skill_names
    subcategory_list += skill_subcategories
    skill_list += skill_texts


100%|███████████████████████████████████████| 1515/1515 [08:43<00:00,  2.89it/s]


In [106]:
len(name_list)

2014

In [107]:
len(subcategory_list)

2014

In [108]:
len(skill_list)

2014

In [109]:
df_tableau = pd.DataFrame({'name':name_list,'subcategory':subcategory_list,'skills':skill_list})

In [111]:
df_tableau['skills'].value_counts()

Management                     19
Communication                  19
History                        19
Leadership                     17
Planning                       17
                               ..
UiPath Orchestrator             1
Orchestrator user interface     1
Queues                          1
Mental Health Education         1
Causality                       1
Name: skills, Length: 1204, dtype: int64

In [112]:
df_tableau.to_csv('skills.csv')

# Just in case: rebuild the URL CSV (`new_urls.csv`) from the raw `coursera.csv` export

In [None]:
import csv

In [None]:
# Read the raw export as comma-delimited rows; data[0] is the header row.
# newline='' is what the csv module requires for file objects, so that
# newlines embedded in quoted fields are handled by the reader.
# NOTE(review): no encoding is given, so the platform default applies. Consider
# passing encoding='utf-8' here and when writing new_urls.csv.
data = []
with open('coursera.csv', 'r', newline='') as f:
    data.extend(csv.reader(f, delimiter=','))

In [None]:
# Skip the header row (data[0]). For every other row, take its first parsed
# field, split it on the two-character sequence `,"` and keep at most the
# first four pieces.
# NOTE(review): d[0] raises IndexError for an empty row - confirm the file has none.
fixed = [d[0].split(',"')[:4] for d in data[1:]]

In [None]:
# In every piece, drop everything from the first `";;;;` onward; pieces
# without that marker are left unchanged.
fixed = [[elm.split('";;;;')[0] for elm in f] for f in fixed]

In [None]:
# Write the repaired rows to new_urls.csv, using the ';'-separated names found
# in the first field of the original header row as column names.
with open('new_urls.csv', 'w', newline='') as file:
    keys = data[0][0].split(';')
    writer = csv.DictWriter(file, fieldnames=keys, delimiter=',')
    writer.writeheader()
    # zip() pairs column names with values and stops at the shorter of the two.
    writer.writerows(dict(zip(keys, values)) for values in fixed)