In [1]:
#IMPORT LIBRARIES
import os
import random
import re
import time
from urllib.parse import urljoin

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

## 1.Data Collection
### 1.1. Get the list of master's degree courses

In [None]:
#1.1- Reach main URL
base_url = 'https://www.findamasters.com/masters-degrees/msc-degrees/?PG=1'
# Selenium 4 receives the chromedriver path through a Service object rather
# than as a bare positional argument.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(base_url)
    driver.maximize_window()

    #WAIT FOR THE SIGN-UP POP-UP AND CLOSE IT
    try:
        WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH, "//*[@id='signupModal']/div/div/div/i"))).click()
    except Exception:
        # Best effort: the modal may simply not appear. `Exception` (instead of
        # a bare except) lets KeyboardInterrupt still stop the cell.
        print("No pop up")
    time.sleep(10)

    #MASTER COURSES URL
    masters_url = []
    base_url = 'https://www.findamasters.com/'
    base_url_page = 'https://www.findamasters.com/masters-degrees/msc-degrees/?PG='
    for n in range(1,401):

        url = base_url_page + str(n)
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            res = requests.get(url, timeout=30)
            res.raise_for_status()
        except requests.RequestException as e:
            print(f"Listing page {n} could not be fetched: {e}")
            res = None
        # Random pause between requests, kept even after a failed request.
        time.sleep(random.uniform(5, 10))
        if res is None:
            continue
        soup = BeautifulSoup(res.text, "lxml")

        masters = soup.find_all('a', {'class': 'courseLink text-dark', 'href':True})
        for master in masters:
            # urljoin avoids the doubled slash that plain concatenation yields
            # when the href is root-relative ("/masters-degrees/...").
            masters_url.append(urljoin(base_url, master['href']))
finally:
    # Always release the browser process, even if the crawl is interrupted.
    driver.quit()

In [None]:
#1.1- Sanity check: how many course URLs were collected (6000 expected)
collected_total = len(masters_url)
print(collected_total)

In [None]:
#1.1- Persist the collected URLs to a txt file, one URL per line
with open('urls.txt', 'w') as out_file:
    out_file.writelines(link + '\n' for link in masters_url)

### 1.2. Crawl master's degree pages

In [2]:
#1.2- Load the URLs stored in the txt file back into a list
with open('urls.txt', 'r') as url_file:
    # Each line holds one URL; strip the trailing newline and any padding
    urls = list(map(str.strip, url_file))

In [None]:
#1.2- Download the HTML of a URL and save it as a file in a folder
def download_and_save(url, i):
    """Fetch one course page with Chrome and store its HTML on disk.

    The page is written to ``folder_{i // 15}/{i % 15}.html`` (15 pages per
    folder), encoded as UTF-8.

    Args:
        url: Address of the page to download.
        i: Zero-based position of ``url`` in the URL list; determines the
            target folder and file name.

    Returns:
        None. Any failure is printed and swallowed so that one bad page does
        not stop the remaining downloads.
    """
    driver = None
    try:
        # Selenium 4 receives the chromedriver path through a Service object.
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.get(url)
        # Fixed pause before reading the page source.
        time.sleep(5)
        page_source = driver.page_source
        folder_name = f'folder_{i // 15}'
        # exist_ok avoids the check-then-create race when several threads
        # target the same folder at the same time.
        os.makedirs(folder_name, exist_ok=True)
        file_name = f'{i % 15}.html'
        with open(os.path.join(folder_name, file_name), 'w', encoding='utf-8') as f:
            f.write(page_source)
    except Exception as e:
        print(f"An error occurred while downloading {url}: {e}")
    finally:
        # Close the browser on every path so failed downloads do not leak
        # Chrome processes.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"An error occurred while closing the browser for {url}: {e}")

#### Because of the large number of pages we need to download, we use Python's concurrent.futures module, which runs tasks on multiple threads. This lets us execute the 'download_and_save' function on many URLs concurrently and reduces the total running time.

In [None]:
#1.2- Run 'download_and_save' for every URL on a pool of worker threads
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor() as pool:
    # Queue one task per URL, passing its position in the list as the index;
    # leaving the with-block waits for all queued tasks to finish.
    for position, page_url in enumerate(urls):
        pool.submit(download_and_save, page_url, position)

### 1.3 Parse downloaded pages

In [3]:
# Column layout of the course table, in output order.
column_names = [
    # identity
    "courseName", "universityName", "facultyName",
    # course details
    "isItFullTime", "description", "startDate", "fees",
    "modality", "duration",
    # location and delivery
    "city", "country", "administration",
    # source page
    "url",
]

In [4]:
#1.3- Start from an empty Data Frame that already has the required columns
df = pd.DataFrame(data=None, columns=column_names)

In [5]:
#1.3- Collect the data from each HTML file and add it to the Data Frame

def _text_or_empty(tag):
    """Return the text of a BeautifulSoup tag, or "" when the tag was not found."""
    return tag.text if tag is not None else ""


def _parse_course_page(soup):
    """Extract one course record from a parsed course page.

    Args:
        soup: BeautifulSoup document of a downloaded course page.

    Returns:
        dict with one entry per name in ``column_names``. Fields whose element
        is absent from the page are "" (``isItFullTime`` is False).
    """
    # A course is full time when any 'concealLink' anchor points at the
    # full-time listing. `.get` tolerates anchors that have no href.
    is_full_time = any(
        link.get('href') == "/masters-degrees/full-time/"
        for link in soup.find_all('a', {'class': 'concealLink'})
    )

    # Description: all paragraphs inside the "Snippet" div, joined by spaces.
    section = soup.find("div", class_="course-sections course-sections__description col-xs-24")
    snippet = section.find("div", id="Snippet") if section is not None else None
    if snippet is not None:
        description = " ".join(p.text.strip() for p in snippet.find_all("p"))
    else:
        description = ""

    # Delivery mode: online, on campus, both (comma separated) or neither.
    delivery_tags = (
        soup.find("a", {"class": "course-data__online"}),
        soup.find("a", {"class": "course-data__on-campus"}),
    )
    administration = ", ".join(tag.text for tag in delivery_tags if tag is not None)

    # A page without a canonical link yields "" like every other missing field.
    canonical = soup.find("link", rel="canonical")
    url = canonical.get("href", "") if canonical is not None else ""

    return {
        "courseName": _text_or_empty(soup.find("h1", {"class": "course-header__course-title"})),
        "universityName": _text_or_empty(soup.find('a', {'class': 'course-header__institution'})),
        "facultyName": _text_or_empty(soup.find('a', {'class': 'course-header__department'})),
        "isItFullTime": is_full_time,
        "description": description,
        "startDate": _text_or_empty(soup.find("span", {"class": "key-info__start-date"})),
        "fees": _text_or_empty(soup.find("div", {"class": "course-sections__fees"})),
        "modality": _text_or_empty(soup.find("span", {"class": "key-info__qualification"})),
        "duration": _text_or_empty(soup.find("span", {"class": "key-info__duration"})),
        "city": _text_or_empty(soup.find("a", {"class": "card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__city"})),
        "country": _text_or_empty(soup.find("a", {"class": "card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__country"})),
        "administration": administration,
        "url": url,
    }


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("courses", exist_ok=True)

records = []
for i in range(400):
    folder_name = f'folder_{i}'
    for j in range(15):
        file_path = os.path.join(folder_name, f'{j}.html')
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                contents = f.read()
        except FileNotFoundError:
            # download_and_save only prints failures, so a page can be absent.
            print(f"Missing page, skipped: {file_path}")
            continue

        data = _parse_course_page(BeautifulSoup(contents, 'html.parser'))

        # One single-row TSV per course, numbered by its global position.
        pd.DataFrame([data], columns=column_names).to_csv(f"courses/course_{(i*15) +j}.tsv", sep="\t", index=False)
        records.append(data)

# Build the table once from all records: DataFrame.append no longer exists in
# pandas 2.x, and growing a frame row by row is quadratic.
df = pd.DataFrame(records, columns=column_names)

In [6]:
#1.3- Sanity check: number of rows in the Data Frame (6000 expected)
print(len(df.index))

6000
