In [1]:
#IMPORT LIBRARIES
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time
import random
import re
from bs4 import BeautifulSoup
import requests
import lxml
from lxml import html
import os
import pandas as pd

## 2. Search Engine

### 2.0 Preprocessing

#### 2.0.0) Preprocessing the text

In [2]:
dataset = pd.read_csv("merged_courses.tsv", delimiter="\t")

In [3]:
dataset.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fee,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,True,\n\n3D visualisation and animation play a role...,September,\nFees\n\nPlease see the university website fo...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,True,\n\nBusinesses and governments rely on sound f...,September,"\nFees\n\nUK: £18,000 (Total) International: £...",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...
2,"Accounting, Accountability & Financial Manag...",King’s College London,King’s Business School,True,"\n\nOur Accounting, Accountability & Financial...",September,\nFees\n\nPlease see the university website fo...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...
3,"Accounting, Financial Management and Digital...",University of Reading,Henley Business School,True,\n\nEmbark on a professional accounting career...,September,\nFees\n\nPlease see the university website fo...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",True,\n\nJoin us for an online session for prospect...,September,\nFees\n\nPlease see the university website fo...,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...


#### Stemming with NLTK

In [4]:
from nltk.stem import *

In [6]:
dataset = dataset[dataset.description != '']

In [7]:
dataset = dataset.dropna(subset=['description'])

In [8]:
# Replace '\n' with a space or empty string as you prefer
dataset['description'] = dataset['description'].str.replace('\n', ' ', regex=False)
dataset.description.head()

0      3D visualisation and animation play a role i...
1      Businesses and governments rely on sound fin...
2      Our Accounting, Accountability & Financial M...
3      Embark on a professional accounting career w...
4      Join us for an online session for prospectiv...
Name: description, dtype: object

##### Port Stemmer

In [5]:
porterstemmer = PorterStemmer()

In [9]:
dataset['descr_stem'] = dataset.description.apply(lambda row: [porterstemmer.stem(word) for word in row.split(' ')])

In [11]:
lst_porter = [porterstemmer.stem(word) for word in dataset.loc[0,'description'].split(' ')]
lst_porter

['',
 '',
 '3d',
 'visualis',
 'and',
 'anim',
 'play',
 'a',
 'role',
 'in',
 'mani',
 'areas,',
 'and',
 'the',
 'popular',
 'of',
 'these',
 'media',
 'just',
 'keep',
 'growing.',
 'digit',
 'anim',
 'provid',
 'the',
 'eye-catch',
 'special',
 'effect',
 'in',
 'the',
 '21st',
 "century'",
 'favourit',
 'film',
 'and',
 'televis',
 'shows;',
 '3d',
 'design',
 'is',
 'also',
 'essenti',
 'to',
 'everyday',
 'work',
 'in',
 'everyth',
 'from',
 'comput',
 'game',
 'development,',
 'onlin',
 'virtual',
 'world',
 'develop',
 'and',
 'industri',
 'design',
 'to',
 'marketing,',
 'product',
 'design',
 'and',
 "architecture.gcu'",
 'programm',
 'in',
 '3d',
 'design',
 'for',
 'virtual',
 'environ',
 'will',
 'help',
 'you',
 'develop',
 'the',
 'skill',
 'to',
 'thrive',
 'in',
 'a',
 'success',
 'career',
 'as',
 'a',
 'visual',
 'designer.',
 'the',
 'programm',
 'is',
 'practic',
 'and',
 'career-focused,',
 'orient',
 'toward',
 'current',
 'industri',
 'needs,',
 'technolog',
 '

##### SnowBall Stemmer

In [10]:
snowstem = snowball.SnowballStemmer('english')

In [12]:
lst_snow = [snowstem.stem(word) for word in dataset.loc[0,'description'].split(' ')]
lst_snow

['',
 '',
 '3d',
 'visualis',
 'and',
 'anim',
 'play',
 'a',
 'role',
 'in',
 'mani',
 'areas,',
 'and',
 'the',
 'popular',
 'of',
 'these',
 'media',
 'just',
 'keep',
 'growing.',
 'digit',
 'anim',
 'provid',
 'the',
 'eye-catch',
 'special',
 'effect',
 'in',
 'the',
 '21st',
 'centuri',
 'favourit',
 'film',
 'and',
 'televis',
 'shows;',
 '3d',
 'design',
 'is',
 'also',
 'essenti',
 'to',
 'everyday',
 'work',
 'in',
 'everyth',
 'from',
 'comput',
 'game',
 'development,',
 'onlin',
 'virtual',
 'world',
 'develop',
 'and',
 'industri',
 'design',
 'to',
 'marketing,',
 'product',
 'design',
 'and',
 'architecture.gcu',
 'programm',
 'in',
 '3d',
 'design',
 'for',
 'virtual',
 'environ',
 'will',
 'help',
 'you',
 'develop',
 'the',
 'skill',
 'to',
 'thrive',
 'in',
 'a',
 'success',
 'career',
 'as',
 'a',
 'visual',
 'designer.',
 'the',
 'programm',
 'is',
 'practic',
 'and',
 'career-focused,',
 'orient',
 'toward',
 'current',
 'industri',
 'needs,',
 'technolog',
 'an

#### Removing StopWords with NLTK

In [13]:
import nltk
from nltk.corpus import stopwords

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amiralismac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
lst_stopwords = stopwords.words('english')

In [54]:
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
dataset['descr_clean'] = dataset.description.apply(lambda row: [porterstemmer.stem(word) for word in row.split(' ') if not word in lst_stopwords])

In [17]:
dataset.loc[0, ['descr_clean', 'descr_stem']]

descr_clean    [, , 3d, visualis, anim, play, role, mani, are...
descr_stem     [, , 3d, visualis, and, anim, play, a, role, i...
Name: 0, dtype: object

#### Removing Punctuation with NLTK

In [18]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amiralismac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
import string

In [56]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
dataset.loc[0, 'description']

"  3D visualisation and animation play a role in many areas, and the popularity of these media just keeps growing. Digital animation provides the eye-catching special effects in the 21st century's favourite films and television shows; 3D design is also essential to everyday work in everything from computer games development, online virtual world development and industrial design to marketing, product design and architecture.GCU's programme in 3D Design for Virtual Environments will help you develop the skills to thrive in a successful career as a visual designer. The programme is practical and career-focused, oriented towards current industry needs, technology and practice. No prior knowledge of 3D design is required.  "

In [47]:
words = nltk.word_tokenize(dataset.loc[0, 'description'])
words

['3D',
 'visualisation',
 'and',
 'animation',
 'play',
 'a',
 'role',
 'in',
 'many',
 'areas',
 ',',
 'and',
 'the',
 'popularity',
 'of',
 'these',
 'media',
 'just',
 'keeps',
 'growing',
 '.',
 'Digital',
 'animation',
 'provides',
 'the',
 'eye-catching',
 'special',
 'effects',
 'in',
 'the',
 '21st',
 'century',
 "'s",
 'favourite',
 'films',
 'and',
 'television',
 'shows',
 ';',
 '3D',
 'design',
 'is',
 'also',
 'essential',
 'to',
 'everyday',
 'work',
 'in',
 'everything',
 'from',
 'computer',
 'games',
 'development',
 ',',
 'online',
 'virtual',
 'world',
 'development',
 'and',
 'industrial',
 'design',
 'to',
 'marketing',
 ',',
 'product',
 'design',
 'and',
 'architecture.GCU',
 "'s",
 'programme',
 'in',
 '3D',
 'Design',
 'for',
 'Virtual',
 'Environments',
 'will',
 'help',
 'you',
 'develop',
 'the',
 'skills',
 'to',
 'thrive',
 'in',
 'a',
 'successful',
 'career',
 'as',
 'a',
 'visual',
 'designer',
 '.',
 'The',
 'programme',
 'is',
 'practical',
 'and',
 '

In [61]:
for word_i in words:
    print(f"WORD: {word_i}\tisAlNum: {word_i.isalnum()}")


WORD: 3D	isAlNum: True
WORD: visualisation	isAlNum: True
WORD: and	isAlNum: True
WORD: animation	isAlNum: True
WORD: play	isAlNum: True
WORD: a	isAlNum: True
WORD: role	isAlNum: True
WORD: in	isAlNum: True
WORD: many	isAlNum: True
WORD: areas	isAlNum: True
WORD: ,	isAlNum: False
WORD: and	isAlNum: True
WORD: the	isAlNum: True
WORD: popularity	isAlNum: True
WORD: of	isAlNum: True
WORD: these	isAlNum: True
WORD: media	isAlNum: True
WORD: just	isAlNum: True
WORD: keeps	isAlNum: True
WORD: growing	isAlNum: True
WORD: .	isAlNum: False
WORD: Digital	isAlNum: True
WORD: animation	isAlNum: True
WORD: provides	isAlNum: True
WORD: the	isAlNum: True
WORD: eye-catching	isAlNum: False
WORD: special	isAlNum: True
WORD: effects	isAlNum: True
WORD: in	isAlNum: True
WORD: the	isAlNum: True
WORD: 21st	isAlNum: True
WORD: century	isAlNum: True
WORD: 's	isAlNum: False
WORD: favourite	isAlNum: True
WORD: films	isAlNum: True
WORD: and	isAlNum: True
WORD: television	isAlNum: True
WORD: shows	isAlNum: True
WO

In [62]:
[word_i for word_i in words if word_i.isalnum()]

['3D',
 'visualisation',
 'and',
 'animation',
 'play',
 'a',
 'role',
 'in',
 'many',
 'areas',
 'and',
 'the',
 'popularity',
 'of',
 'these',
 'media',
 'just',
 'keeps',
 'growing',
 'Digital',
 'animation',
 'provides',
 'the',
 'special',
 'effects',
 'in',
 'the',
 '21st',
 'century',
 'favourite',
 'films',
 'and',
 'television',
 'shows',
 '3D',
 'design',
 'is',
 'also',
 'essential',
 'to',
 'everyday',
 'work',
 'in',
 'everything',
 'from',
 'computer',
 'games',
 'development',
 'online',
 'virtual',
 'world',
 'development',
 'and',
 'industrial',
 'design',
 'to',
 'marketing',
 'product',
 'design',
 'and',
 'programme',
 'in',
 '3D',
 'Design',
 'for',
 'Virtual',
 'Environments',
 'will',
 'help',
 'you',
 'develop',
 'the',
 'skills',
 'to',
 'thrive',
 'in',
 'a',
 'successful',
 'career',
 'as',
 'a',
 'visual',
 'designer',
 'The',
 'programme',
 'is',
 'practical',
 'and',
 'oriented',
 'towards',
 'current',
 'industry',
 'needs',
 'technology',
 'and',
 'pract

In [50]:
dataset['descr_clean'] = dataset.description.apply(lambda row: [porterstemmer.stem(word) for word in nltk.word_tokenize(row) if not word in lst_stopwords and word.isalnum()])
dataset['descr_clean'].head

<bound method NDFrame.head of 0       [3d, visualis, anim, play, role, mani, area, p...
1       [busi, govern, reli, sound, financi, knowledg,...
2       [our, account, account, financi, manag, msc, c...
3       [embark, profession, account, career, academ, ...
4       [join, us, onlin, session, prospect, student, ...
                              ...                        
5995    [the, master, degre, materi, engin, interdisci...
5996    [the, msc, materi, engin, provid, deep, unders...
5997    [swansea, one, uk, lead, centr, materi, teach,...
5998    [our, msc, materi, engin, industri, cours, ope...
5999    [regist, interest, graduat, studi, uclth, glob...
Name: descr_clean, Length: 5980, dtype: object>

In [53]:
dataset.loc[0, 'descr_clean']

['3d',
 'visualis',
 'anim',
 'play',
 'role',
 'mani',
 'area',
 'popular',
 'media',
 'keep',
 'grow',
 'digit',
 'anim',
 'provid',
 'special',
 'effect',
 '21st',
 'centuri',
 'favourit',
 'film',
 'televis',
 'show',
 '3d',
 'design',
 'also',
 'essenti',
 'everyday',
 'work',
 'everyth',
 'comput',
 'game',
 'develop',
 'onlin',
 'virtual',
 'world',
 'develop',
 'industri',
 'design',
 'market',
 'product',
 'design',
 'programm',
 '3d',
 'design',
 'virtual',
 'environ',
 'help',
 'develop',
 'skill',
 'thrive',
 'success',
 'career',
 'visual',
 'design',
 'the',
 'programm',
 'practic',
 'orient',
 'toward',
 'current',
 'industri',
 'need',
 'technolog',
 'practic',
 'no',
 'prior',
 'knowledg',
 '3d',
 'design',
 'requir']

#### SpaCy

In [22]:
import spacy

In [23]:
nlp = spacy.load("en_core_web_sm")

In [24]:
doc = nlp(dataset.loc[0, 'description'])

In [64]:
doc

  3D visualisation and animation play a role in many areas, and the popularity of these media just keeps growing. Digital animation provides the eye-catching special effects in the 21st century's favourite films and television shows; 3D design is also essential to everyday work in everything from computer games development, online virtual world development and industrial design to marketing, product design and architecture.GCU's programme in 3D Design for Virtual Environments will help you develop the skills to thrive in a successful career as a visual designer. The programme is practical and career-focused, oriented towards current industry needs, technology and practice. No prior knowledge of 3D design is required.  

In [63]:
for ent_i in doc.ents:
    print(f"TEXT: {ent_i.text}, START_CHAR: {ent_i.start_char}, END_CHAR: {ent_i.end_char}, LABEL_: {ent_i.label_}")

TEXT: Digital, START_CHAR: 114, END_CHAR: 121, LABEL_: ORG
TEXT: the 21st century's, START_CHAR: 177, END_CHAR: 195, LABEL_: DATE
TEXT: GCU, START_CHAR: 427, END_CHAR: 430, LABEL_: ORG
TEXT: 3D Design for Virtual Environments, START_CHAR: 446, END_CHAR: 480, LABEL_: ORG


In [67]:
for token in doc:
    print(token, token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

         SPACE _SP dep    False False
3D 3D 3d ADJ JJ amod dX False False
visualisation visualisation visualisation NOUN NN nsubj xxxx True False
and and and CCONJ CC cc xxx True True
animation animation animation NOUN NN conj xxxx True False
play play play VERB VBP ROOT xxxx True False
a a a DET DT det x True True
role role role NOUN NN dobj xxxx True False
in in in ADP IN prep xx True True
many many many ADJ JJ amod xxxx True True
areas areas area NOUN NNS pobj xxxx True False
, , , PUNCT , punct , False False
and and and CCONJ CC cc xxx True True
the the the DET DT det xxx True True
popularity popularity popularity NOUN NN nsubj xxxx True False
of of of ADP IN prep xx True True
these these these DET DT det xxxx True True
media media medium NOUN NNS pobj xxxx True False
just just just ADV RB advmod xxxx True True
keeps keeps keep VERB VBZ conj xxxx True False
growing growing grow VERB VBG xcomp xxxx True False
. . . PUNCT . punct . False False
Digital Digital digital ADJ JJ amod Xxxx

### 2.0.1 Preprocessing the fees column

Moreover, we want the field fees to collect numeric information. As you will see, you scraped textual information for this attribute in the dataset: sketch whatever method you need (using regex, for example, to find currency symbol) to collect information and, in case of multiple information, retrieve only the highest fees. Finally, once you have collected numerical information, you likely will have different currencies: this can be chaotic, so let chatGPT guide you in the choice and deployment of an API to convert this column to a common currency of your choice (it can be USD, EUR or whatever you want). Ultimately, you will have a float column renamed fees (CHOSEN COMMON CURRENCY).

In [72]:
counter = 0
for fee_i in dataset.fee:
    print(fee_i)
    counter += 1
    if counter == 10:
        break


Fees

Please see the university website for further information on fees for this course.



Fees

UK: £18,000 (Total) International: £34,750 (Total)



Fees

Please see the university website for further information on fees for this course.



Fees

Please see the university website for further information on fees for this course.



Fees

Please see the university website for further information on fees for this course.



Fees

UK: £13,750 (Total) International: £31,000 (Total)



Fees

Please see the university website for further information on fees for this course.



Fees

Tuition fee per year (non-EU/EEA students): 15000 €



Fees

Tuition fee per year (non-EU/EEA students): 15000 €



Fees

UK: £12,500 (Total) International: £28,750 (Total)




In [79]:
dataset.fee.iloc[0]

'\nFees\n\nPlease see the university website for further information on fees for this course.\n\n'

##### Making API requests to exchangeratesapi.io for Conversion Rates

In [85]:
pip install git+https://github.com/everapihq/freecurrencyapi-python.git

Collecting git+https://github.com/everapihq/freecurrencyapi-python.git
  Cloning https://github.com/everapihq/freecurrencyapi-python.git to /private/var/folders/zk/kfcj5gj53s9dcj21xxz5w1q00000gn/T/pip-req-build-1vdhuebc
  Running command git clone --filter=blob:none --quiet https://github.com/everapihq/freecurrencyapi-python.git /private/var/folders/zk/kfcj5gj53s9dcj21xxz5w1q00000gn/T/pip-req-build-1vdhuebc
  Resolved https://github.com/everapihq/freecurrencyapi-python.git to commit c32c4dc9df5de4cf0d4940c0c27efc5fc8473bdf
  Preparing metadata (setup.py) ... [?25ldone
Collecting everapi (from freecurrencyapi==0.1.0)
  Downloading everapi-0.1.1-py3-none-any.whl.metadata (1.4 kB)
Downloading everapi-0.1.1-py3-none-any.whl (2.9 kB)
Building wheels for collected packages: freecurrencyapi
  Building wheel for freecurrencyapi (setup.py) ... [?25ldone
[?25h  Created wheel for freecurrencyapi: filename=freecurrencyapi-0.1.0-py3-none-any.whl size=2611 sha256=e97251e159da9318b236f8c7478a59af3

In [86]:
import freecurrencyapi
client = freecurrencyapi.Client('fca_live_gN70PpWZqBXZc2mLgxw9mcWxOAnrO1el551fuHET')

In [88]:

print(client.status())

{'account_id': 247677987040923648, 'quotas': {'month': {'total': 5000, 'used': 0, 'remaining': 5000}, 'grace': {'total': 0, 'used': 0, 'remaining': 0}}}


In [91]:

rates = client.latest()
print(rates)

{'data': {'AUD': 1.5386502584, 'BGN': 1.8162002721, 'BRL': 4.8639405805, 'CAD': 1.3700402396, 'CHF': 0.8889901034, 'CNY': 7.2548508048, 'CZK': 22.4500528051, 'DKK': 6.8562610465, 'EUR': 0.9189601345, 'GBP': 0.8004900959, 'HKD': 7.8050014476, 'HRK': 7.0437510499, 'HUF': 345.0866708549, 'IDR': 15680.016090975, 'ILS': 3.7519504759, 'INR': 82.9664511114, 'ISK': 141.1306261783, 'JPY': 150.4939192344, 'KRW': 1303.6488540011, 'MXN': 17.3461517778, 'MYR': 4.6914305419, 'NOK': 10.8720511635, 'NZD': 1.6651002208, 'PHP': 56.0540068086, 'PLN': 4.0365005548, 'RON': 4.569530519, 'RUB': 90.5774804012, 'SEK': 10.5880413709, 'SGD': 1.3492302402, 'THB': 35.5156760279, 'TRY': 28.605793501, 'USD': 1, 'ZAR': 18.2329328243}}


##### Extracting & Converting Fees

In [98]:
import re

In [99]:
currency_patterns = {
    'USD': r'\$(\d+(?:,\d{3})*(?:\.\d+)?)',
    'EUR': r'€(\d+(?:,\d{3})*(?:\.\d+)?)',
    'JPY': r'¥(\d+(?:,\d{3})*(?:\.\d+)?)',
    'GBP': r'£(\d+(?:,\d{3})*(?:\.\d+)?)',
    'AUD': r'A\$(\d+(?:,\d{3})*(?:\.\d+)?)',
    'CAD': r'C\$(\d+(?:,\d{3})*(?:\.\d+)?)',
    'CHF': r'Fr(\d+(?:,\d{3})*(?:\.\d+)?)',
    'CNY': r'¥(\d+(?:,\d{3})*(?:\.\d+)?)',  # Same symbol as JPY, context needed to differentiate
    'HKD': r'HK\$(\d+(?:,\d{3})*(?:\.\d+)?)',
    'NZD': r'NZ\$(\d+(?:,\d{3})*(?:\.\d+)?)'
}

In [105]:
def extract_and_convert_fees(row, rates):
    max_fee_usd = 0  # Initialize the maximum fee variable
    # Define the currency symbols which will be used to identify the currency in the regex pattern.
    currency_symbols = {
        'USD': r'\$',
        'EUR': r'€',
        'JPY': r'¥',
        'GBP': r'£',
        'AUD': r'A\$',
        'CAD': r'C\$',
        'CHF': r'Fr',
        'CNY': r'¥',  # The same symbol as JPY, might need additional context to differentiate
        'HKD': r'HK\$',
        'NZD': r'NZ\$'
    }

    # Regex to find numbers - this is a simplistic pattern
    # For real-world usage, you might need a more sophisticated regex
    for currency, symbol in currency_symbols.items():
        pattern = rf'{symbol}(\d+(?:,\d{{3}})*(\.\d+)?)'
        matches = re.findall(pattern, row)
        for match in matches:
            # Remove commas and convert to float
            num = float(match[0].replace(',', ''))
            # Multiply by the conversion rate to get the amount in USD
            if currency in rates['data']:
                num *= rates['data'][currency]
                max_fee_usd = max(max_fee_usd, num)

    return max_fee_usd




In [106]:
# Example usage:
row = "The tuition fee is €15,000 or £13,000 based on the current exchange rates."

# Calculate the fee in USD
fee_in_usd = extract_and_convert_fees(row, rates)
print(f"The fee in USD is: {fee_in_usd}")

The fee in USD is: 13784.4020175


In [117]:
# This will apply the function to the first row's 'fee' value only
fee_in_usd = extract_and_convert_fees(dataset['fee'].iloc[5], rates)
print(fee_in_usd)


24815.1929729


In [120]:
import numpy as np

In [122]:

# Apply the function to every row, handling floats by converting them to strings
dataset['fees_in_usd'] = dataset['fee'].apply(lambda row: extract_and_convert_fees(str(row), rates) if not pd.isnull(row) else np.nan)


In [126]:
dataset.head(10)

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fee,modality,duration,city,country,administration,url,descr_stem,descr_clean,fees_in_usd
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,True,3D visualisation and animation play a role i...,September,\nFees\n\nPlease see the university website fo...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , 3d, visualis, and, anim, play, a, role, i...","[3d, visualis, anim, play, role, mani, area, p...",0.0
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,True,Businesses and governments rely on sound fin...,September,"\nFees\n\nUK: £18,000 (Total) International: £...",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , busi, and, govern, reli, on, sound, finan...","[busi, govern, reli, sound, financi, knowledg,...",27817.030833
2,"Accounting, Accountability & Financial Manag...",King’s College London,King’s Business School,True,"Our Accounting, Accountability & Financial M...",September,\nFees\n\nPlease see the university website fo...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , our, accounting,, account, &, financi, ma...","[our, account, account, financi, manag, msc, c...",0.0
3,"Accounting, Financial Management and Digital...",University of Reading,Henley Business School,True,Embark on a professional accounting career w...,September,\nFees\n\nPlease see the university website fo...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , embark, on, a, profession, account, caree...","[embark, profession, account, career, academ, ...",0.0
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",True,Join us for an online session for prospectiv...,September,\nFees\n\nPlease see the university website fo...,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , join, us, for, an, onlin, session, for, p...","[join, us, onlin, session, prospect, student, ...",0.0
5,Advanced Chemical Engineering - MSc,University of Leeds,School of Chemical and Process Engineering,True,The Advanced Chemical Engineering MSc at Lee...,September,"\nFees\n\nUK: £13,750 (Total) International: £...",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , the, advanc, chemic, engin, msc, at, leed...","[the, advanc, chemic, engin, msc, leed, build,...",24815.192973
6,Advanced Physiotherapy Practice - MSc,Glasgow Caledonian University,School of Health and Life Sciences,True,Progress your career as a physiotherapist wi...,"January, September",\nFees\n\nPlease see the university website fo...,MSc,1 Year Full Time / 2-3 Years Part Time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , progress, your, career, as, a, physiother...","[progress, career, physiotherapist, within, nh...",0.0
7,Agricultural Sciences - MSc (Agriculture and...,University of Helsinki,International Masters Degree Programmes,True,Goal of the pro­grammeWould you like to be i...,September,\nFees\n\nTuition fee per year (non-EU/EEA stu...,MSc,2 years,Helsinki,Finland,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , goal, of, the, pro­grammewould, you, like...","[goal, like, involv, find, solut, futur, chall...",0.0
8,"Agricultural, Environmental and Resource Eco...",University of Helsinki,International Masters Degree Programmes,True,Goal of the pro­grammeAre you looking forwar...,September,\nFees\n\nTuition fee per year (non-EU/EEA stu...,MSc,2 years,Helsinki,Finland,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , goal, of, the, pro­grammear, you, look, f...","[goal, look, forward, futur, expert, agricultu...",0.0
9,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,True,Up to 7 million people are estimated to die ...,September,"\nFees\n\nUK: £12,500 (Total) International: £...",MSc,"1 year full time, 2 or 3 years part-time",Leeds,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , up, to, 7, million, peopl, are, estim, to...","[up, 7, million, peopl, estim, die, everi, yea...",23014.090257


### 2.1. Conjunctive query

For the first version of the search engine, we narrowed our interest to the description of each course. It means that you will evaluate queries only concerning the course's description.

#### 2.1.1 Create your index!

Before building the index,

Create a file named vocabulary, in the format you prefer, that maps each word to an integer (term_id).
Then, the first brick of your homework is to create the Inverted Index. It will be a dictionary in this format:

{
term_id_1:[document_1, document_2, document_4],
term_id_2:[document_1, document_3, document_5, document_6],
...}
where document_i is the id of a document that contains that specific word.

Hint: Since you do not want to compute the inverted index every time you use the Search Engine, it is worth thinking about storing it in a separate file and loading it in memory when needed.

In [127]:
dataset.descr_clean

0       [3d, visualis, anim, play, role, mani, area, p...
1       [busi, govern, reli, sound, financi, knowledg,...
2       [our, account, account, financi, manag, msc, c...
3       [embark, profession, account, career, academ, ...
4       [join, us, onlin, session, prospect, student, ...
                              ...                        
5995    [the, master, degre, materi, engin, interdisci...
5996    [the, msc, materi, engin, provid, deep, unders...
5997    [swansea, one, uk, lead, centr, materi, teach,...
5998    [our, msc, materi, engin, industri, cours, ope...
5999    [regist, interest, graduat, studi, uclth, glob...
Name: descr_clean, Length: 5980, dtype: object

##### Approach 1: Using Python Sets

In [27]:
vocabulary = set()

In [28]:
dataset.descr_clean.apply(lambda row: [vocabulary.add(word) for word in row])

0       [None, None, None, None, None, None, None, Non...
1       [None, None, None, None, None, None, None, Non...
2       [None, None, None, None, None, None, None, Non...
3       [None, None, None, None, None, None, None, Non...
4       [None, None, None, None, None, None, None, Non...
                              ...                        
5995    [None, None, None, None, None, None, None, Non...
5996    [None, None, None, None, None, None, None, Non...
5997    [None, None, None, None, None, None, None, Non...
5998    [None, None, None, None, None, None, None, Non...
5999    [None, None, None, None, None, None, None, Non...
Name: descr_clean, Length: 5980, dtype: object

In [29]:
vocabulary

{'measurementy',
 'chatham',
 'emml',
 'cater',
 'instituteth',
 'immigr',
 'globalis',
 'knowledgedesign',
 'afternoon',
 'besteht',
 'peke',
 'psychophys',
 'micromobilitymast',
 'ku',
 'everard',
 'educationth',
 'paceaccredit',
 'programmeai',
 'fraction',
 'voce',
 'nutraceut',
 'daili',
 'protein',
 'lightweight',
 'mse',
 'encompass',
 'knowedeg',
 'geometr',
 'straight',
 'antisoci',
 'client',
 '65',
 'deadlin',
 'ration',
 'informationdiploma',
 '2',
 'warfar',
 'worldwidea',
 'nanotechnologygradu',
 'byte',
 'introduc',
 'plasma',
 'satisfactionn',
 '1971',
 'novic',
 'script',
 'missionth',
 'experimentsth',
 'commonli',
 'homewar',
 'mbda',
 'am',
 'associ',
 'era',
 'analysesdur',
 'eastman',
 'youfield',
 'microorgan',
 'declin',
 'psychologycognit',
 'expedit',
 '92',
 'oulu',
 'risk',
 'exorc',
 'financei',
 'switch',
 'acut',
 'côte',
 'descriptionglob',
 'competitor',
 'far',
 'marktsituationen',
 'pharmaceut',
 'gkn',
 'sabancı',
 'hurdl',
 'overviewinnov',
 'equit'

##### Approach 2: Using Python Collections

In [30]:
from collections import Counter
from functools import reduce

In [31]:
vocabulary_alt = Counter(reduce(lambda x, y: x + y, dataset.descr_clean.values)).keys()

In [128]:
vocabulary_alt

dict_keys(['3d', 'visualis', 'anim', 'play', 'role', 'mani', 'area', 'popular', 'media', 'keep', 'grow', 'digit', 'provid', 'special', 'effect', '21st', 'centuri', 'favourit', 'film', 'televis', 'show', 'design', 'also', 'essenti', 'everyday', 'work', 'everyth', 'comput', 'game', 'develop', 'onlin', 'virtual', 'world', 'industri', 'market', 'product', 'programm', 'environ', 'help', 'skill', 'thrive', 'success', 'career', 'visual', 'the', 'practic', 'orient', 'toward', 'current', 'need', 'technolog', 'no', 'prior', 'knowledg', 'requir', 'busi', 'govern', 'reli', 'sound', 'financi', 'underpin', 'strategi', 'cours', 'profession', 'advanc', 'modern', 'theori', 'account', 'control', 'well', 'understand', 'organis', 'cover', 'fundament', 'topic', 'corpor', 'financ', 'report', 'option', 'modul', 'allow', 'specialis', 'rang', 'intern', 'forens', 'you', 'abl', 'appli', 'learn', 'case', 'studi', 'simul', 'test', 'genuin', 'scenario', 'our', 'manag', 'msc', 'econom', 'technic', 'institut', 'found

##### Assign Unique ID to each item in the Vocab

###### Approach 1: Using For-Loop

In [32]:
index = {}
unique_id = 1
for word in list(vocabulary_alt):
  index[unique_id] = word
  unique_id+=1

In [33]:
index

{1: '3d',
 2: 'visualis',
 3: 'anim',
 4: 'play',
 5: 'role',
 6: 'mani',
 7: 'area',
 8: 'popular',
 9: 'media',
 10: 'keep',
 11: 'grow',
 12: 'digit',
 13: 'provid',
 14: 'special',
 15: 'effect',
 16: '21st',
 17: 'centuri',
 18: 'favourit',
 19: 'film',
 20: 'televis',
 21: 'show',
 22: 'design',
 23: 'also',
 24: 'essenti',
 25: 'everyday',
 26: 'work',
 27: 'everyth',
 28: 'comput',
 29: 'game',
 30: 'develop',
 31: 'onlin',
 32: 'virtual',
 33: 'world',
 34: 'industri',
 35: 'market',
 36: 'product',
 37: 'programm',
 38: 'environ',
 39: 'help',
 40: 'skill',
 41: 'thrive',
 42: 'success',
 43: 'career',
 44: 'visual',
 45: 'the',
 46: 'practic',
 47: 'orient',
 48: 'toward',
 49: 'current',
 50: 'need',
 51: 'technolog',
 52: 'no',
 53: 'prior',
 54: 'knowledg',
 55: 'requir',
 56: 'busi',
 57: 'govern',
 58: 'reli',
 59: 'sound',
 60: 'financi',
 61: 'underpin',
 62: 'strategi',
 63: 'cours',
 64: 'profession',
 65: 'advanc',
 66: 'modern',
 67: 'theori',
 68: 'account',
 69:

###### Approach 2: Using Pandas DataFrames

In [140]:
terms = pd.DataFrame(data=list(vocabulary_alt), columns=['term'])

In [141]:
terms

Unnamed: 0,term
0,3d
1,visualis
2,anim
3,play
4,role
...,...
8760,descriptionmateri
8761,microelectronicspolym
8762,sciencecatalysisnanotechnologythey
8763,programmeyour


In [35]:
terms.loc[0,'term']

'3d'

##### Creating the Inverted Index 

In [130]:
# Question: Which documents (from 0 to 5999) contain the word "3D" in them?
# PS: terms.loc[0, "term"] is "3D"
dataset.descr_clean.apply(lambda row: terms.loc[0, "term"] in row)

0        True
1       False
2       False
3       False
4       False
        ...  
5995    False
5996    False
5997    False
5998    False
5999    False
Name: descr_clean, Length: 5980, dtype: bool

In [131]:
# Using a mask to see exactly which documents contain the word "3D"
dataset.index[dataset.descr_clean.apply(lambda row: terms.loc[0, "term"] in row)]

Index([   0,  432,  496,  581,  582,  880, 1829, 2430, 2828, 2830, 2831, 3913,
       4002, 5018, 5777],
      dtype='int64')

In [137]:
# Again, we use the same mask, but this time, in order to see courseName and description
dataset[["courseName", "description"]][dataset.descr_clean.apply(lambda row: terms.loc[0, "term"] in row)]

Unnamed: 0,courseName,description
0,3D Design for Virtual Environments - MSc,3D visualisation and animation play a role i...
432,Physics - MSc,This course is for you if you’re interested ...
496,3D Computer Games Design - MSc,Get a flying start in the games industry. If...
581,Additive Manufacturing - MSc,"Additive Manufacturing, or 3D printing, is a..."
582,Additive Manufacturing and 3D Printing MSc,Whether it is creating advanced engineering ...
880,Advanced Product Design Engineering & Manufa...,About this courseThis course examines the la...
1829,Chemistry - MPhil/MSc (Research),Our research groups are involved in projects...
2430,Creative Technologies (MSc),Combine your passion for creative art and de...
2828,Digital Anthropology MSc,Register your interest in graduate study at ...
2830,Digital Audio Engineering - MSc,"On our MSc in Digital Audio Engineering, you..."


In [36]:
from tqdm.notebook import tqdm

tqdm.pandas()

In [164]:

terms['reverse'] = terms.term.progress_apply(lambda item: list(dataset.loc[dataset.descr_clean.apply(lambda row: item in row)].index))

  0%|          | 0/8765 [00:00<?, ?it/s]

In [165]:
terms.loc[0:10]

Unnamed: 0,term,reverse
0,3d,"[0, 432, 496, 581, 582, 880, 1829, 2430, 2828,..."
1,visualis,"[0, 66, 68, 387, 729, 1068, 1273, 1289, 1420, ..."
2,anim,"[0, 7, 18, 25, 109, 156, 239, 605, 669, 959, 1..."
3,play,"[0, 14, 31, 68, 78, 180, 192, 267, 296, 309, 3..."
4,role,"[0, 14, 33, 59, 68, 70, 72, 78, 105, 111, 136,..."
5,mani,"[0, 11, 32, 95, 103, 109, 110, 118, 119, 124, ..."
6,area,"[0, 2, 13, 15, 18, 42, 57, 59, 60, 64, 83, 85,..."
7,popular,"[0, 411, 496, 508, 1288, 1477, 1940, 2045, 204..."
8,media,"[0, 31, 140, 149, 189, 218, 219, 220, 322, 373..."
9,keep,"[0, 236, 284, 360, 519, 718, 873, 965, 966, 98..."


In [142]:
# Save the DataFrame to a CSV file
terms.to_csv('inverted_index.csv', index=False)

#### Executing the Query

In [171]:
def execute_query(query, inverted_index, dataset):
    # Split the query into terms and stem each term
    query_terms = [stemmer.stem(word) for word in query.lower().split()]
    
    # Retrieve the list of document indices for each query term
    list_of_documents = [set(inverted_index[inverted_index['term'] == term_i]['reverse'].values[0])
                         for term_i in query_terms if term_i in inverted_index['term'].values]
    
    # Intersect the document lists to find documents that contain all query terms
    if list_of_documents:
        docs_containing_all_terms = set.intersection(*list_of_documents)
        # Convert set to list before using it as an indexer
        docs_containing_all_terms = list(docs_containing_all_terms)
    else:
        docs_containing_all_terms = []

    results = dataset.loc[docs_containing_all_terms, ['courseName', 'universityName', 'description', 'url']]
    
    return results.reset_index(drop=True)


In [158]:
terms[terms.term == "architect"]

Unnamed: 0,term
476,architect


In [161]:
# Check if 'architecture' exists in the terms DataFrame
print('architect' in terms.term.values)  # Should be True if it exists


True


In [166]:
terms.head()

Unnamed: 0,term,reverse
0,3d,"[0, 432, 496, 581, 582, 880, 1829, 2430, 2828,..."
1,visualis,"[0, 66, 68, 387, 729, 1068, 1273, 1289, 1420, ..."
2,anim,"[0, 7, 18, 25, 109, 156, 239, 605, 669, 959, 1..."
3,play,"[0, 14, 31, 68, 78, 180, 192, 267, 296, 309, 3..."
4,role,"[0, 14, 33, 59, 68, 70, 72, 78, 105, 111, 136,..."


In [162]:
# Check the column names in the terms DataFrame
print(terms.columns)


Index(['term'], dtype='object')


In [176]:
# Example usage:
sample_query = "advanced knowledge"
query_results = execute_query(sample_query, terms, dataset)
(query_results)

Unnamed: 0,courseName,universityName,description,url
0,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound fin...,https://www.findamasters.com//masters-degrees/...
1,Global Meetings and Events Management MSc / ...,University College Birmingham,Become part of an events industry worth an e...,https://www.findamasters.com//masters-degrees/...
2,Addictions MSc,King’s College London,Join us for an online session for prospectiv...,https://www.findamasters.com//masters-degrees/...
3,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique st...,https://www.findamasters.com//masters-degrees/...
4,Clinical Research - MSc,Cardiff University,"Why study this courseThis part-time, distanc...",https://www.findamasters.com//masters-degrees/...
...,...,...,...,...
463,Clinical Geriatrics - MSc,Cardiff University,Why study this courseThe MSc Clinical Geriat...,https://www.findamasters.com//masters-degrees/...
464,Clinical Neuropsychology - MSc,University of Bristol,Professional programmes in Clinical Neuropsy...,https://www.findamasters.com//masters-degrees/...
465,Clinical Ophthalmic Practice MSc,University College London,Register your interest in graduate study at ...,https://www.findamasters.com//masters-degrees/...
466,Clinical Optometry - MSc,Cardiff University,Why study this courseThe aim of this program...,https://www.findamasters.com//masters-degrees/...


### 2.2 Conjunctive query & Ranking score

#### TF-IDF

In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [178]:
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text) # , max_df=0.1

In [179]:
results = tfidf.fit_transform(dataset.descr_clean)



In [180]:
results

<5980x8765 sparse matrix of type '<class 'numpy.float64'>'
	with 240162 stored elements in Compressed Sparse Row format>

In [181]:
result_dense = results.todense()

In [220]:
result_dense

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
tfidf_data = pd.DataFrame(result_dense.tolist(), index=dataset.index, columns=tfidf.get_feature_names_out())

In [44]:
len(tfidf_data)

5980

In [183]:
tfidf_data

Unnamed: 0,1,10,100,1000,104k,11,11th,12,120,125k,...,zoonosi,zoonot,zu,zudem,zum,zur,zurich,zwingen,école,ísafjörður
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125514,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Just to get a sense of what the dataframe looks like, we can take a look at some of its middle columns

In [206]:
tfidf_data.iloc[:, 1150:1165]

Unnamed: 0,brew,breweri,brexit,bric,bridg,brief,briefconceptu,briefth,bright,brightest,brighton,brilliant,brim,bring,bristol
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [217]:
tfidf_data.loc[terms.term == "3d"]["3d"]

0    0.569024
Name: 3d, dtype: float64

We know that in index=0, the word "3d" has been used a lot. So, let's take a look at how many times it has been used:

In [221]:
Counter(dataset.loc[0, 'descr_clean'])['3d']

4

We know that in index=13, the word "you" has been used once. So, let's take a look at that:

In [185]:
Counter(dataset.loc[13, 'descr_clean'])['you']

1

If we look for the word "finance" in tfidf_data, we can see that it has a 0.0 TFIDF score in most of the documents:

In [225]:
# Obviously we cannot mention "finance" exactly as is, but we have to stemm it first into "financ"
tfidf_data.financ

0       0.000000
1       0.218229
2       0.000000
3       0.000000
4       0.000000
          ...   
5995    0.000000
5996    0.000000
5997    0.000000
5998    0.000000
5999    0.000000
Name: financ, Length: 5980, dtype: float64

So, let's take a look at all the documents in which the word "design" has been used:

In [226]:
tfidf_data.loc[tfidf_data.financ > 0, ['financ']]

Unnamed: 0,financ
1,0.218229
12,0.206567
18,0.115327
24,0.115336
47,0.083981
...,...
5958,0.199036
5974,0.273931
5981,0.215126
5982,0.183679


In [227]:
dataset[tfidf_data.financ > 0][["courseName", "description"]]

Unnamed: 0,courseName,description
1,Accounting and Finance - MSc,Businesses and governments rely on sound fin...
12,Applied Economics (Banking and Financial Mar...,From political uncertainty to finance and re...
18,Applied Statistics (online) MSc,Our online MSc in Applied Statistics is a co...
24,Banking and International Finance - MSc,Integrated financial markets and global busi...
47,"Climate Change, Management and Finance - MSc","MSc Climate Change, Management & Finance is ..."
...,...,...
5958,Master's in Financial Economics,The Financial Economics major at NHH prepare...
5974,Masters in Mathematical Finance,OBJECTIVESThe Masters in Mathematical Financ...
5981,Masters of Finance,The HKU Business School Master of Finance (M...
5982,Masters Of Finance (International Finance),Master’s in Finance (International Finance) ...


We can see that the word "you" has been used in a lot of documents, but with a relativley low TFIDF score:

In [235]:
tfidf_data.loc[tfidf_data["you"] > 0]["you"]

1       0.081878
8       0.064950
11      0.079859
13      0.108566
18      0.086540
          ...   
5936    0.038171
5946    0.140265
5964    0.108300
5966    0.101255
5972    0.066377
Name: you, Length: 1159, dtype: float64

In [199]:
terms.loc[0:15]

Unnamed: 0,term,reverse
0,3d,"[0, 432, 496, 581, 582, 880, 1829, 2430, 2828,..."
1,visualis,"[0, 66, 68, 387, 729, 1068, 1273, 1289, 1420, ..."
2,anim,"[0, 7, 18, 25, 109, 156, 239, 605, 669, 959, 1..."
3,play,"[0, 14, 31, 68, 78, 180, 192, 267, 296, 309, 3..."
4,role,"[0, 14, 33, 59, 68, 70, 72, 78, 105, 111, 136,..."
5,mani,"[0, 11, 32, 95, 103, 109, 110, 118, 119, 124, ..."
6,area,"[0, 2, 13, 15, 18, 42, 57, 59, 60, 64, 83, 85,..."
7,popular,"[0, 411, 496, 508, 1288, 1477, 1940, 2045, 204..."
8,media,"[0, 31, 140, 149, 189, 218, 219, 220, 322, 373..."
9,keep,"[0, 236, 284, 360, 519, 718, 873, 965, 966, 98..."


In [196]:
terms.loc[terms.term == '21st']

Unnamed: 0,term,reverse
15,21st,"[0, 31, 51, 116, 192, 216, 288, 322, 347, 465,..."


In [195]:
tfidf_data.loc[[0, 15]]

Unnamed: 0,1,10,100,1000,104k,11,11th,12,120,125k,...,zoonosi,zoonot,zu,zudem,zum,zur,zurich,zwingen,école,ísafjörður
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.140447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
dataset['descr_len'] = dataset.descr_clean.apply(lambda row: len(row))

In [201]:
dataset.loc[[1, 2, 9, 12, 14], 'descr_len']

1     55
2     52
9     46
12    52
14    56
Name: descr_len, dtype: int64

### Cosine Similarity

In [210]:
from sklearn.metrics.pairwise import cosine_similarity

In [211]:
cossim_data = pd.DataFrame(cosine_similarity(tfidf_data), index=dataset.index, columns=dataset.index)

  ret = a @ b


In [212]:
cossim_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,1.000000,0.038010,0.048749,0.045112,0.031504,0.070991,0.026884,0.043068,0.013144,0.023367,...,0.027649,0.005316,0.065698,0.064964,0.044441,0.076865,0.014964,0.013950,0.048222,0.017847
1,0.038010,1.000000,0.302218,0.361394,0.032513,0.049519,0.045223,0.004829,0.066927,0.016458,...,0.013984,0.032099,0.052911,0.046705,0.013399,0.047516,0.021145,0.020948,0.024117,0.013502
2,0.048749,0.302218,1.000000,0.341424,0.028349,0.077502,0.057765,0.018963,0.054127,0.011017,...,0.010571,0.000000,0.023496,0.062440,0.044861,0.014731,0.038851,0.010637,0.054440,0.024251
3,0.045112,0.361394,0.341424,1.000000,0.016908,0.019696,0.026145,0.044687,0.063590,0.014284,...,0.046491,0.000000,0.023494,0.047711,0.020914,0.010460,0.029949,0.032271,0.044199,0.042543
4,0.031504,0.032513,0.028349,0.016908,1.000000,0.052979,0.049981,0.055342,0.029268,0.033115,...,0.021191,0.005535,0.081155,0.027960,0.011512,0.042026,0.012101,0.005977,0.012430,0.011816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.076865,0.047516,0.014731,0.010460,0.042026,0.102695,0.038674,0.124642,0.045221,0.019627,...,0.111167,0.179383,0.197414,0.212213,0.028722,1.000000,0.322846,0.260022,0.251597,0.100595
5996,0.014964,0.021145,0.038851,0.029949,0.012101,0.090559,0.032857,0.031833,0.032581,0.003036,...,0.101633,0.178009,0.266572,0.267922,0.018968,0.322846,1.000000,0.570156,0.511268,0.124053
5997,0.013950,0.020948,0.010637,0.032271,0.005977,0.108051,0.059029,0.016295,0.025191,0.016034,...,0.083393,0.139677,0.207399,0.307376,0.030500,0.260022,0.570156,1.000000,0.424215,0.079815
5998,0.048222,0.024117,0.054440,0.044199,0.012430,0.093003,0.072525,0.021222,0.005804,0.017846,...,0.073946,0.112468,0.217615,0.223854,0.022385,0.251597,0.511268,0.424215,1.000000,0.092951


In [213]:
tfidf_data.loc[tfidf_data.design > 0].index

Index([   0,    5,   13,   14,   15,   16,   21,   30,   33,   36,
       ...
       5960, 5961, 5965, 5966, 5970, 5972, 5983, 5984, 5992, 5995],
      dtype='int64', length=1584)

In [218]:
cossim_data[tfidf_data.loc[tfidf_data.design > 0.1].index]

Unnamed: 0,0,5,13,14,15,21,52,56,70,98,...,5724,5754,5763,5775,5799,5845,5849,5877,5909,5984
0,1.000000,0.070991,0.080771,0.071550,0.083378,0.062496,0.047464,0.127848,0.057677,0.088357,...,0.050967,0.061924,0.100205,0.112015,0.135431,0.096947,0.073393,0.051752,0.082734,0.112482
1,0.038010,0.049519,0.049473,0.052993,0.022801,0.074989,0.035768,0.025949,0.037889,0.116849,...,0.005560,0.017640,0.123020,0.000000,0.026501,0.026935,0.114271,0.015309,0.068436,0.021394
2,0.048749,0.077502,0.010093,0.000000,0.048475,0.022459,0.007327,0.066435,0.091160,0.071870,...,0.094518,0.025862,0.104621,0.049322,0.024293,0.023188,0.015149,0.051501,0.036187,0.057249
3,0.045112,0.019696,0.056013,0.000000,0.032276,0.062250,0.076139,0.068985,0.041752,0.158696,...,0.088312,0.021279,0.078139,0.006374,0.051579,0.006611,0.115227,0.086186,0.092407,0.057193
4,0.031504,0.052979,0.023226,0.001770,0.046639,0.036795,0.085018,0.036843,0.014716,0.051422,...,0.022506,0.048985,0.041245,0.023989,0.038508,0.025799,0.015416,0.031803,0.047077,0.029894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.076865,0.102695,0.064147,0.056743,0.016606,0.055098,0.038454,0.112783,0.055933,0.102503,...,0.087572,0.051114,0.071252,0.101965,0.085270,0.100250,0.065549,0.045449,0.059762,0.090707
5996,0.014964,0.090559,0.011239,0.017958,0.030848,0.015949,0.010025,0.021910,0.034846,0.105660,...,0.073438,0.014404,0.059329,0.012361,0.031445,0.016957,0.051101,0.018381,0.087339,0.028789
5997,0.013950,0.108051,0.066143,0.008555,0.020042,0.013218,0.023986,0.010424,0.015807,0.104986,...,0.061549,0.076195,0.040770,0.021976,0.023411,0.005333,0.060091,0.040387,0.062475,0.010846
5998,0.048222,0.093003,0.025410,0.000000,0.002958,0.033527,0.008313,0.045916,0.041403,0.175901,...,0.085896,0.070035,0.019788,0.067433,0.079447,0.004014,0.056059,0.113348,0.081362,0.014683


In [216]:
dataset.loc[[0, 2, 1577]]

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fee,modality,duration,city,country,administration,url,descr_stem,descr_clean,fees_in_usd,descr_len
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,True,3D visualisation and animation play a role i...,September,\nFees\n\nPlease see the university website fo...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , 3d, visualis, and, anim, play, a, role, i...","[3d, visualis, anim, play, role, mani, area, p...",0.0,70
2,"Accounting, Accountability & Financial Manag...",King’s College London,King’s Business School,True,"Our Accounting, Accountability & Financial M...",September,\nFees\n\nPlease see the university website fo...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , our, accounting,, account, &, financi, ma...","[our, account, account, financi, manag, msc, c...",0.0,52
1577,Biotechnologies for food science,University of Padua,School of Agriculture and Veterinary Medicine,True,The Master’s degree in Biotechnologies for F...,October,\nFees\n\nOur tuition fees will not exceed 270...,MSc,2 years,Padua,Italy,On Campus,https://www.findamasters.com//masters-degrees/...,"[, , the, master’, degre, in, biotechnolog, fo...","[the, master, degre, biotechnolog, food, scien...",0.0,47


In [236]:
terms2 = terms[['term', 'reverse']].copy()

In [237]:
terms2['tfidf_tuples'] = terms2.term.apply(lambda term: [(doc_id, tfidf_data.loc[doc_id, term]) 
                                                       for doc_id in tfidf_data.index if tfidf_data.loc[doc_id, term] > 0])

In [238]:
terms2

Unnamed: 0,term,reverse,tfidf_tuples
0,3d,"[0, 432, 496, 581, 582, 880, 1829, 2430, 2828,...","[(0, 0.5690243042170251), (432, 0.139928218942..."
1,visualis,"[0, 66, 68, 387, 729, 1068, 1273, 1289, 1420, ...","[(0, 0.1219439779100013), (66, 0.1479389595175..."
2,anim,"[0, 7, 18, 25, 109, 156, 239, 605, 669, 959, 1...","[(0, 0.217866372992482), (7, 0.135780707366749..."
3,play,"[0, 14, 31, 68, 78, 180, 192, 267, 296, 309, 3...","[(0, 0.09443962008603238), (14, 0.110758812917..."
4,role,"[0, 14, 33, 59, 68, 70, 72, 78, 105, 111, 136,...","[(0, 0.06924429891703594), (14, 0.081209733185..."
...,...,...,...
8760,descriptionmateri,[5994],"[(5994, 0.21274134431702074)]"
8761,microelectronicspolym,[5994],"[(5994, 0.21274134431702074)]"
8762,sciencecatalysisnanotechnologythey,[5994],"[(5994, 0.21274134431702074)]"
8763,programmeyour,[5997],"[(5997, 0.26851586971682334)]"


In [239]:
terms2["reverse"]

0       [0, 432, 496, 581, 582, 880, 1829, 2430, 2828,...
1       [0, 66, 68, 387, 729, 1068, 1273, 1289, 1420, ...
2       [0, 7, 18, 25, 109, 156, 239, 605, 669, 959, 1...
3       [0, 14, 31, 68, 78, 180, 192, 267, 296, 309, 3...
4       [0, 14, 33, 59, 68, 70, 72, 78, 105, 111, 136,...
                              ...                        
8760                                               [5994]
8761                                               [5994]
8762                                               [5994]
8763                                               [5997]
8764                                               [5998]
Name: reverse, Length: 8765, dtype: object

In [240]:
terms2["tfidf_tuples"]

0       [(0, 0.5690243042170251), (432, 0.139928218942...
1       [(0, 0.1219439779100013), (66, 0.1479389595175...
2       [(0, 0.217866372992482), (7, 0.135780707366749...
3       [(0, 0.09443962008603238), (14, 0.110758812917...
4       [(0, 0.06924429891703594), (14, 0.081209733185...
                              ...                        
8760                        [(5994, 0.21274134431702074)]
8761                        [(5994, 0.21274134431702074)]
8762                        [(5994, 0.21274134431702074)]
8763                        [(5997, 0.26851586971682334)]
8764                         [(5998, 0.2882793102496349)]
Name: tfidf_tuples, Length: 8765, dtype: object

In [248]:
terms2.to_csv('inverted_index2.csv', index=False)

In [244]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def execute_query2(query, inverted_index, dataset, tfidf_data):
    # Preprocess the query to match the document processing
    query_terms = [stemmer.stem(word) for word in query.lower().split()] 
    
    # Find documents that contain all terms in the query
    relevant_docs = set()
    for term in query_terms:
        if term in inverted_index['term'].values:
            docs_with_term = [doc_id for doc_id, _ in inverted_index.loc[inverted_index['term'] == term, 'tfidf_tuples'].iloc[0]]
            if not relevant_docs:
                relevant_docs = set(docs_with_term)
            else:
                relevant_docs.intersection_update(docs_with_term)
    
    if not relevant_docs:
        return pd.DataFrame()  # If no documents are found, return an empty DataFrame
    
    # Calculate cosine similarity for each document
    query_vec = tfidf.transform([' '.join(query_terms)])
    similarities = {}
    for doc_id in relevant_docs:
        doc_vec = tfidf_data.loc[doc_id].values.reshape(1, -1)
        similarities[doc_id] = cosine_similarity(query_vec, doc_vec)[0][0]
    
    # Sort documents by their similarity score
    sorted_doc_ids = sorted(similarities, key=similarities.get, reverse=True)
    
    # Prepare the results with the similarity score
    results = dataset.loc[sorted_doc_ids, ['courseName', 'universityName', 'description', 'url']]
    results['similarity'] = [similarities[doc_id] for doc_id in sorted_doc_ids]
    
    return results.reset_index(drop=True)

In [246]:
# Example usage
results = execute_query2("advanced knowledge", terms2, dataset, tfidf_data)
(results)

Unnamed: 0,courseName,universityName,description,url,similarity
0,Geo-information Science and Earth Observatio...,University of Twente,INTERESTED IN A CAREER IN SPATIAL DATA SCIEN...,https://www.findamasters.com//masters-degrees/...,0.109635
1,Advanced Mechanical Engineering - MSc/PGDip,Heriot-Watt University,This programme aims to develop the knowledge...,https://www.findamasters.com//masters-degrees/...,0.065300
2,Gerontological Nursing - MSc/PgDip,Trinity College Dublin,The aim of this course is to strengthen and ...,https://www.findamasters.com//masters-degrees/...,0.047732
3,Ageing - MSc,Lancaster University,MSc Ageing aims to allow people working in a...,https://www.findamasters.com//masters-degrees/...,0.045696
4,Environmental Technology - MSc,Imperial College London,The MSc in Environmental Technology provides...,https://www.findamasters.com//masters-degrees/...,0.044767
...,...,...,...,...,...
463,Clinical Geriatrics - MSc,Cardiff University,Why study this courseThe MSc Clinical Geriat...,https://www.findamasters.com//masters-degrees/...,0.000000
464,Clinical Neuropsychology - MSc,University of Bristol,Professional programmes in Clinical Neuropsy...,https://www.findamasters.com//masters-degrees/...,0.000000
465,Clinical Ophthalmic Practice MSc,University College London,Register your interest in graduate study at ...,https://www.findamasters.com//masters-degrees/...,0.000000
466,Clinical Optometry - MSc,Cardiff University,Why study this courseThe aim of this program...,https://www.findamasters.com//masters-degrees/...,0.000000


In [247]:
# Example usage
results = execute_query2("financial management", terms2, dataset, tfidf_data)
(results)

Unnamed: 0,courseName,universityName,description,url,similarity
0,Master’s in Finance,Nova School of Business and Economics,The Master’s in Finance prepares students fo...,https://www.findamasters.com//masters-degrees/...,0.160558
1,Banking and Finance MSc,Università Cattolica del Sacro Cuore,The Banking and Finance programme is a speci...,https://www.findamasters.com//masters-degrees/...,0.112208
2,Master Financial Management and Control,The Hague University of Applied Sciences,Reasons for choosing this master programmeYo...,https://www.findamasters.com//masters-degrees/...,0.098208
3,"Finance, Accounting and Management - MSc",University of Bradford,A Master's degree designed to produce fully ...,https://www.findamasters.com//masters-degrees/...,0.094231
4,MSc Finance and Investment Management,University of Liverpool,A career in finance and investment managemen...,https://www.findamasters.com//masters-degrees/...,0.090280
...,...,...,...,...,...
237,Accounting - MSc,Bangor University,This degree programme provides the opportuni...,https://www.findamasters.com//masters-degrees/...,0.000000
238,Logistics and Supply Chain Management MSc (P...,University of Brighton,Our accredited Logistics and Supply Chain Ma...,https://www.findamasters.com//masters-degrees/...,0.000000
239,Accounting and Business Intelligence MSc,Brunel University London,Recent disruptive technological innovations ...,https://www.findamasters.com//masters-degrees/...,0.000000
240,Economics MSc,Swansea University,Economics Course OverviewDo you want a high ...,https://www.findamasters.com//masters-degrees/...,0.000000


# Q4

Reading the stored CSV files (inverted indexes)

In [None]:
#terms1 = pd.read_csv("inverted_index.csv")
#terms2 = pd.read_csv("inverted_index2.csv")
