<a href="https://colab.research.google.com/github/TK-Problem/Coursera-scrapper/blob/master/coursera_review_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Load libraries
import requests # for making HTTP requests
from tqdm import tqdm # for monitoring progress
from bs4 import BeautifulSoup # for parsing raw HTML code and extracting useful data
import pandas as pd # for reading the .csv file
import csv # for writing .csv files

# mount Google Drive at /content/drive when the form checkbox is ticked
mount_gdrive = True # @param{type:"boolean"}
if mount_gdrive:
  # imported lazily: google.colab is only needed when mounting is requested
  from google.colab import drive
  drive.mount('/content/drive')

#### Option 1

![example](https://i0.wp.com/neptune.ai/wp-content/uploads/2022/10/colab-upload.png?ssl=1 "Placeholder fig. title")

Alternatively, load the file from Google Drive (the faster option).

#### Option 2

![example](https://i0.wp.com/neptune.ai/wp-content/uploads/2022/10/colab-file.png?ssl=1 "Placeholder fig. title")



In [None]:
#@title Read URLs

#@markdown Upload the `.csv` file into the environment manually before
#@markdown running this cell (see image above).

file_path = ''  #@param {type: "string"}

# names given to the columns of the header-less input file
_columns = ['URL', "Week_no", "Week_name", "Est_time_to_compelte", "Week_description"]

# read data
try:
  df = pd.read_csv(file_path, header=None)
  # select unique URLs (first column of the file)
  urls = df.iloc[:, 0].unique()
  # add column names
  df.columns = _columns

# file path/file not found
except FileNotFoundError:
  print("File not found.")
  # keep `df` defined (and empty) so later cells that read `df.URL` do not
  # fail with a NameError or silently reuse a table from an earlier run
  df = pd.DataFrame(columns=_columns)
  urls = []

# return status message
print(f"{len(urls)} courses found")

In [None]:
#@title Read course reviews

output_path = ''  #@param {type: "string"}

# fail fast: the output file is only opened after a whole course has been
# scraped, so an empty path would otherwise discard all of that work
if not output_path:
  raise ValueError("output_path is empty; set it to the .csv file reviews should be appended to.")

# iterate over course URLs; `urls` comes from the "Read URLs" cell and is an
# empty list when the input file was not found
for url in tqdm(urls):
  # make request
  _url_1 = f"https://www.coursera.org{url}"
  page = requests.get(_url_1)

  # convert to bs4 object for parsing html
  soup = BeautifulSoup(page.text, 'html.parser')

  # check for review page; find() returns None when the link is absent
  rew_link = soup.find("a", {"data-e2e": "reviews-page-link"})
  rew_button = rew_link.get('href', '') if rew_link is not None else ''

  # tmp. list to store all reviews of this course
  _data = []

  # visit all reviews
  if rew_button:
    # make request
    _url_2 = f"https://www.coursera.org{rew_button}"
    page = requests.get(_url_2)

    # convert to bs4 object for parsing html
    soup = BeautifulSoup(page.text, 'html.parser')

    # find the pagination bar; assume a single page when there is none
    nav = soup.find("nav", {"aria-label": "Pagination Controls"})
    if nav is None:
      _no_pages = 1
    else:
      nav_bar = nav.find_all("li")
      # the second-to-last entry holds the last available page number
      _no_pages = int(nav_bar[-2].text.strip())
    # NOTE(review): page 400 is deliberately skipped by the original code;
    # presumably that page cannot be fetched -- confirm against the site
    if _no_pages == 400: _no_pages = 399

    # page through THIS course's reviews (query string stripped so the
    # paging parameters can be appended cleanly)
    _reviews_base = _url_2.split('?', 1)[0]

    # iterate over pages
    for i in range(1, _no_pages+1):
      # generate link and visit website
      _url_3 = f"{_reviews_base}?sort=recent&page={i}"
      page = requests.get(_url_3)

      # convert to bs4 object for parsing html
      soup = BeautifulSoup(page.text, 'html.parser')

      # find all reviews
      reviews = soup.find_all("div", {"class": "cds-63 review review-text review-page-review m-b-2 css-0 cds-64"})

      for r in reviews:
        # extract review info; the first three characters of the name field
        # are dropped (presumably a leading "By " -- verify against the page)
        rev_name = r.find("p", {"class": "cds-33 reviewerName p-x-1s css-14d8ngk cds-35"}).text[3:]
        rev_date = r.find("p", {"class": "cds-33 dateOfReview p-x-1s css-14d8ngk cds-35"}).text
        rev_text = r.find("div", {"data-testid": "cml-viewer"}).text
        # count filled stars; spans without a <title> element are skipped
        stars = r.find("div", {"class": "_1mzojlvw"}).find_all("span")
        star_cnt = sum(
          1 for s in stars
          if s.title is not None and s.title.text == "Filled Star"
        )

        # add data
        _data.append([url, rev_name, rev_date, rev_text, star_cnt])
  else:
    # return message
    print(f"For {url} no reviews page was found.")

  # append all recorded reviews of this course to the .csv file
  with open(output_path, 'a', encoding='UTF8', newline='') as f:
    csv.writer(f).writerows(_data)