### 1. DATA COLLECTION

In [None]:
pip install bs4

In [None]:
pip install selenium

In [5]:
#import the required packages
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import json

#### 1.1. Get the list of master's degree courses

In [147]:
flag = False # run only once

if flag:
    # URL of the website
    url = 'https://www.findamasters.com/masters-degrees/msc-degrees/?PG='  
    prefix = '/masters-degrees/course/'
    exclude = ['\nMore details \n', '\nRead more \n', '\xa0Video(s)', '\xa0Student Profile(s)'] 

    # Create a list to store the URLs of the masters
    master_urls = []

    # Loop through the first 400 pages
    for page_number in range(1, 401):
        print(url + str(page_number))
        response = requests.get(url + str(page_number))

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
        
            # Use BeautifulSoup to extract the URLs and append them to the master_urls list
            for link in soup.find_all('a', {'class':'courseLink'}):
                if link['href'][:len(prefix)] == prefix and not link.text in exclude:
                    master_urls.append((link['href'], link.text))


    # Save the collected URLs in a text file
    with open("master_urls.txt", "a") as file:
        for url in master_urls:
            file.write(url[0] + "\n")

#### 1.2. Crawl master's degree pages

In [148]:
flag = False

if flag:
    # Open the file and read its content
    with open("master_urls.txt", "r") as file:
        master_urls = [line.strip() for line in file.readlines()]

    # Create a directory to store HTML pages
    output_root_directory = "html_pages"
    os.makedirs(output_root_directory, exist_ok=True)  # create the directory if it doen't exist

    # Read 15 URLs at a time and create HTML pages
    subset_size = 15
    for i in range(0, len(master_urls), subset_size):
        subset = master_urls[i:i + subset_size] # extract 15 more urls

        # Create a subfolder for each page
        output_directory = os.path.join(output_root_directory, f"page_{i // subset_size + 1}")
        os.makedirs(output_directory, exist_ok=True)

        # Create an HTML page for each URL in the subset
        for url in subset:
            prefix = 'https://www.findamasters.com/'
            response = requests.get(prefix + url)  # sends a GET request 

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                page_content = soup.prettify()  # extract the content of the page          
                master_name = url.split("/")[-2]  # extract the name of the master from the URL 

                # Check if the file already exists, and append a number if necessary
                page_filename = f"{output_directory}/{master_name}.html"
                counter = 1
                while os.path.exists(page_filename):
                    page_filename = f"{output_directory}/{master_name}({counter}).html"
                    counter += 1

                # Save the content in a HTML file
                with open(page_filename, "w", encoding="utf-8") as file:
                    file.write(page_content)
            time.sleep(1.5)

        print(f"page {i // subset_size + 1} completed")
   

#### 1.3 Parse downloaded pages

In [149]:
##MY IDEA --> I have to create a function that retrieve the required information from a html page. in order to do so I exploit the structure of the file. 
#I use "try: except:" to avoid errors in the case that some information is missing

def extract_msc_page(msc_page_url):


    contents ={}
    # Parse HTML content
    page_soup = BeautifulSoup(msc_page_url, 'html.parser')

    #url
    try:
        canonical_link = page_soup.find('link', {'rel': 'canonical'})
        contents['url'] = canonical_link.get('href')
    except AttributeError:
        contents['url'] = ""


    # Course name
    try:
        page_links = page_soup.find('h1', {'class': 'text-white course-header__course-title'})
        name = page_links.get_text()
        contents["courseName"] = name.strip()
    except AttributeError:
        contents['courseName'] = ""

    # University name
    try:
        page_links = page_soup.find_all('a', {'class': 'course-header__institution'})
        contents['universityName'] = page_links[0].contents[0].strip()
    except (AttributeError, IndexError):
        contents['universityName'] = ""

    # Faculty name
    try:
        page_links = page_soup.find_all('a', {'class': 'course-header__department'})
        contents['facultyName'] = page_links[0].contents[0].strip()
    except (AttributeError, IndexError):
        contents['facultyName'] = ""

    # Full time
    try:
        page_links = page_soup.find('a', {'class': 'inheritFont concealLink text-decoration-none text-gray-600'})
        time = page_links.get_text().strip()
        contents['isItFullTime'] = time
    except AttributeError:
        contents['isItFullTime'] = ""

    # Description
    try:
        page_links = page_soup.find('div', {'id': 'Snippet'})
        description = page_links.get_text().strip()
        contents["description"] = description
    except AttributeError:
        contents['description'] = ""

    # Starting date
    try:
        page_links = page_soup.find('span', {'class': 'key-info__content key-info__start-date py-2 pr-md-3 text-nowrap d-block d-md-inline-block'})
        starting = page_links.get_text().strip()
        contents["startDate"] = starting
    except AttributeError:
        contents['startDate'] = ""

    # Fees
    try:
        page_links = page_soup.find('div', {'class': 'course-sections course-sections__fees tight col-xs-24'})
        fees = page_links.get_text().strip()
        contents["fees"] = fees
    except AttributeError:
        contents['fees'] = ""

    # Modality
    try:
        page_links = page_soup.find('a', {'title': 'View all MSc courses'})
        modality = page_links.get_text().strip()
        contents["modality"] = modality
    except AttributeError:
        contents['modality'] = ""

    # Duration
    try:
        page_links = page_soup.find('span', {'class': 'key-info__content key-info__duration py-2 pr-md-3 d-block d-md-inline-block'})
        duration = page_links.get_text().strip()
        contents["duration"] = duration
    except AttributeError:
        contents['duration'] = ""

    # City
    try:
        page_links = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__city'})
        city = page_links.get_text().strip()
        contents["city"] = city
    except AttributeError:
        contents['city'] = ""

    # Country
    try:
        page_links = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__country'})
        country = page_links.get_text().strip()
        contents["country"] = country
    except AttributeError:
        contents['country'] = ""

    # Administration
    try:
        page_links = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__on-campus'})
        administration = page_links.get_text().strip()
        contents["administration"] = administration
    except AttributeError:
        contents["administration"] = ""

    
    return contents


In [150]:
## MY IDEA --> I apply the function above for all the files. I am applying it for all files in a folder for all folders contained into a folder

flag = False

if flag:
    folder_path = 'html_pages/'

    result_df = pd.DataFrame()
    # Walk through the directory and its subdirectories
    for foldername, subfolders, filenames in os.walk(folder_path):
        # Iterate through all the files in the current subdirectory
        for filename in filenames:
            # Construct the file path
            path = os.path.join(foldername, filename)
        
            # Check if the file exists before attempting to open it
            if os.path.exists(path):
                # Print the file path
                print(path)
            
                # Open the file in read mode with UTF-8 encoding
                with open(path, 'r', encoding='utf-8') as file:
                    # Read the HTML content of the file
                    html_content = file.read()
                
                    # Call the function to extract information from the HTML content
                    result_dict = extract_msc_page(html_content)
                    result_df = result_df.append(result_dict, ignore_index=True)


            else:
                # Print a message if the file is not found
                print(f"File not found: {path}")


    #clean an imperfection in the fees section
    result_df['fees'] = result_df['fees'].str.replace('Fees', '') 

    #save it into a file to store it. so that I do not have to run it again
    result_df.to_json('html_pages.json', orient='records', lines=True)


In [151]:
#create the tsv files 

flag = False

if flag:
    data = pd.read_json("html_pages.json", lines=True)

    output_directory = "tsv_files/"
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each row and create a TSV file
    for index, row in data.iterrows():
        file_name = f"course_{index}.tsv"
        file_path = os.path.join(output_directory, file_name)

        # Extract relevant columns and write to the TSV file
        selected_columns = ['courseName', 'universityName', 'facultyName', 'isItFullTime', 'description', 'startDate',
                        'fees','modality','duration','city','country','administration', 'url']
        selected_data = row[selected_columns]
        selected_data.to_csv(file_path, sep='\t', index=False, header=False)

    print("TSV files created successfully.")

### 2. Search Engine

##### 2.0.0) Preprocessing the text

In [6]:
import nltk
from nltk.stem import *
from nltk.corpus import stopwords # import stopwords module


dataset = pd.read_json("html_pages.json", lines=True)  # read the dataset from the created json file
dataset = dataset[dataset.description != '']  # filter rows where the 'description' is empty

# STEMMING
stemmer = PorterStemmer()  # create an instance of Porter Stemmer
dataset['preprocessed_description'] = dataset.description.apply(lambda row: [stemmer.stem(word) for word in row.split(' ')])  # reduce words of description column to their root form and create a new column to store the result

# STOPWORDS
nltk.download('stopwords') 
list_stopwords = stopwords.words('english')  # retrieves the English stopwords from the nltk stopwords dataset
dataset['preprocessed_description'] = dataset.description.apply(lambda row: [stemmer.stem(word) for word in row.split(' ') if not word in list_stopwords])  # now 'descr_clean' column contains lists of cleaned and stemmed words 

# PUNCTUATION
nltk.download('punkt')
dataset['preprocessed_description'] = dataset.description.apply(lambda row: [stemmer.stem(word) for word in nltk.word_tokenize(row) if not word in list_stopwords and word.isalnum()]) # now 'descr_clean' column contains lists of cleaned and stemmed words without punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### 2.0.1) Preprocessing the fees column

In [None]:
pip install forex-python

In [7]:
import re
from forex_python.converter import CurrencyRates

# Regular expression pattern to extract fees 
pattern = r'([$£€])\s*([\d,]+(?:\.\d+)?)\*?'

# Define a function to extract prices and convert to a common currency
def extract_and_convert(row, common_currency="USD"):
    matches = re.finditer(pattern, row)
    
    max_value = 0
    max_match = None
    
    for match in matches:
        currency_symbol = match.group(1)
        numeric_part = match.group(2)
        
        # Try to convert numeric part to float
        try:
            numeric_value = float(numeric_part.replace(',', ''))  # Convert to float removing the comma 
        except ValueError:
            continue
        
        if numeric_value > max_value:
            max_value = numeric_value
            max_match = match
    
    if max_match:
        # Convert the price to the common currency
        converted_price = convert_currency(max_match.group(1), max_value, common_currency="USD")
        return converted_price
    else:
        return None

"""
# Define a function to convert currency to a common currency
def convert_currency(original_currency, amount, common_currency):
    c = CurrencyRates()
    conversion_rate = c.get_rate(original_currency, common_currency)
    converted_amount = float(amount) * conversion_rate
    return (converted_amount)
"""

# Define conversion rates for currencies in the dataset to USD, using a dictionary
conversion_rates = {
    '£': 1.37,  # Example conversion rate for GBP to USD
    '€': 1.18,  # Example conversion rate for EUR to USD
    '$': 1.0
}


def convert_currency(original_currency, amount, common_currency):
    
    conversion_rate = conversion_rates.get(original_currency) 
    
    if conversion_rate is not None:
        converted_amount = float(amount) * conversion_rate
        return (converted_amount)
    else:
        return None


# Apply the function to the 'fees' column and create a new 'converted_price' column
dataset['fees(USD)'] = dataset['fees'].apply(lambda x: extract_and_convert(x, common_currency='USD'))
dataset

Unnamed: 0,url,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,preprocessed_description,fees(USD)
0,https://www.findamasters.com/masters-degrees/c...,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,\n \n\n\n\n Please see th...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,"[3d, visualis, anim, play, role, mani, area, p...",
1,https://www.findamasters.com/masters-degrees/c...,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,"Our Accounting, Accountability & Financial Man...",September,\n \n\n\n\n Please see th...,MSc,1 year FT,London,United Kingdom,On Campus,"[our, account, account, financi, manag, msc, c...",
2,https://www.findamasters.com/masters-degrees/c...,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"\n \n\n\n UK: £18,000 (Tot...",MSc,1 year full time,Leeds,United Kingdom,On Campus,"[busi, govern, reli, sound, financi, knowledg,...",47607.5
3,https://www.findamasters.com/masters-degrees/c...,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,Embark on a professional accounting career wit...,September,\n \n\n\n\n Please see th...,MSc,1 year full time,Reading,United Kingdom,On Campus,"[embark, profession, account, career, academ, ...",
4,https://www.findamasters.com/masters-degrees/c...,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,Join us for an online session for prospective ...,September,\n \n\n\n\n Please see th...,MSc,One year FT,London,United Kingdom,On Campus,"[join, us, onlin, session, prospect, student, ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,https://www.findamasters.com/masters-degrees/c...,Bioinformatics MSc,University of Liverpool,Department of Life Sciences,Full time,Life sciences and technology are an integral p...,September,\n \n\n\n UK fees (applies...,MSc,"1 year full time, 2 years part time",Liverpool,United Kingdom,On Campus,"[life, scienc, technolog, integr, part, global...",33907.5
5996,https://www.findamasters.com/masters-degrees/c...,BioInnovation - MSc,Aberystwyth University,"Biological, Environmental & Rural Sciences (IB...",Part time,MSc BioInnovation at Aberystwyth University pr...,"September, January",\n \n\n\n\n Please see th...,MSc,5 years part time,Aberystwyth,United Kingdom,On Campus,"[msc, bioinnov, aberystwyth, univers, provid, ...",
5997,https://www.findamasters.com/masters-degrees/c...,Biological Photography and Imaging MSc,University of Nottingham,School of Life Sciences,Full time,Have you ever wanted to work on nature documen...,September,\n \n\n\n\n Please see th...,MSc,Full time - 12 months,Nottingham,United Kingdom,On Campus,"[have, ever, want, work, natur, documentari, a...",
5998,https://www.findamasters.com/masters-degrees/c...,"Biological Sciences (M.A., M.S.)",St. Cloud State University,Postgraduate Programs,Full time,You will gain in-depth knowledge and skills in...,See Course,"\n \n\n\n $7,385* per year...",MSc,2 years,St Cloud,USA,On Campus,"[you, gain, knowledg, skill, chosen, disciplin...",7385.0


### 2.1. Conjunctive query

#### 2.1.1) Create your index!

In [8]:
vocabulary = set()
dataset.preprocessed_description.apply(lambda row: [vocabulary.add(word) for word in row])


0       [None, None, None, None, None, None, None, Non...
1       [None, None, None, None, None, None, None, Non...
2       [None, None, None, None, None, None, None, Non...
3       [None, None, None, None, None, None, None, Non...
4       [None, None, None, None, None, None, None, Non...
                              ...                        
5995    [None, None, None, None, None, None, None, Non...
5996    [None, None, None, None, None, None, None, Non...
5997    [None, None, None, None, None, None, None, Non...
5998    [None, None, None, None, None, None, None, Non...
5999    [None, None, None, None, None, None, None, Non...
Name: preprocessed_description, Length: 5979, dtype: object