In [1]:
import subprocess
import sys

# List of required packages
required_packages = ['requests', 'beautifulsoup4', 'pandas']

# Function to install packages if they are not already installed
def install_package(package):
    try:
        __import__(package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install any missing packages
for package in required_packages:
    install_package(package)




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests
from bs4 import BeautifulSoup
import time
import os
import glob
import pandas as pd


In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
start_url = "https://guide.michelin.com/en/it/restauranttttts"
base_url = "https://guide.michelin.com"
next_page = start_url
link_list = []
while next_page:
    # Request page content
    response = requests.get(next_page,verify=False, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")
    # Find all restaurant links on the current page
    for link in soup.select("a.link"):
        href = link.get("href")
        if href and "/restaurant/" in href:
            link_list.append(base_url + href)
    # Look for the 'Next' button to proceed to the next page
    next_button = soup.find_all("a", class_="btn btn-outline-secondary btn-sm btn-carousel__link", href=True)
    if next_button:
        for content in next_button:
            if content.find("span", class_="icon fal fa-angle-right"):
                next_page = base_url+content["href"]
                break
            else:
                next_page = None

    else:
        next_page = None


# Display the collected links
print(f"Found {len(link_list)} restaurants:")
# Save to a text file
with open("urls.txt", "w") as file:
    for url in link_list:
        file.write(f"{url}\n")


In [None]:
# it is gonna take more 10 minutes
for index, link in enumerate(link_list):
    cnt = requests.get(link, verify=False, headers=headers)
    if cnt.status_code==200:
        html = BeautifulSoup(cnt.content, features="lxml")
        # Define the name of the subfolder and the filename
        subfolder = f"HTML/Page {str((index+20)//20)}"
        filename = f"{(link[link.rfind('/') + 1:]).replace('-', ' ')}.html"
        file_path = os.path.join(subfolder, filename)

        # Check if the subfolder exists, create it if it doesn't
        if not os.path.exists(subfolder):
            os.makedirs(subfolder)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html.prettify())
    else:
        print("Request denied!")
        break

In [3]:

# Initialize an empty list to store the rows for DataFrame
data = []
# Define the base directory
base_directory = "HTML"
# Use glob to find all directories matching "Page*"
page_folders = glob.glob(os.path.join(base_directory, "Page *"))

# Loop through each Page* directory
for page_folder in page_folders:
    # Get all HTML files in the current Page* directory
    html_files = glob.glob(os.path.join(page_folder, "*.html"))

    # Read each HTML file
    for html_file in html_files:
        with open(html_file, "r", encoding='utf-8') as file:  # Ensure correct encoding
            content = BeautifulSoup(file.read(), "html.parser")
            # Extract the required information
            restaurantName = content.find("h1",class_="data-sheet__title").get_text().strip() if content.find("h1",class_="data-sheet__title") else ""

            basic_info_first_row_list=content.findAll("div",class_="data-sheet__block--text")[0].text
            basic_info_first_row_striped_list = [info.strip() for info in basic_info_first_row_list.split(",")]
            address = " ".join(basic_info_first_row_striped_list[:-3]) if basic_info_first_row_striped_list[:-3] else ""
            city = basic_info_first_row_striped_list[-3] if basic_info_first_row_striped_list[-3] else ""
            postal_code = basic_info_first_row_striped_list[-2]  if basic_info_first_row_striped_list[-2] else ""
            country = basic_info_first_row_striped_list[-1]  if basic_info_first_row_striped_list[-1] else ""


            basic_info_second_row_list=content.findAll("div",class_="data-sheet__block--text")[1].text
            basic_info_second_row_striped_list = [info.strip() for info in basic_info_second_row_list.split("·")]

            priceRange = basic_info_second_row_striped_list[0] if basic_info_second_row_striped_list[0] else ""
            cuisineType = basic_info_second_row_striped_list[1]  if basic_info_second_row_striped_list[1] else ""

            description = content.find("div",class_="data-sheet__description").get_text().strip() if content.find("div",class_="data-sheet__description") else ""

            facilitiesServices_div = content.findAll("div", class_="col col-12 col-lg-6")
            facilitiesServices = [li.get_text(strip=True) for li in facilitiesServices_div[0].find_all("li")] if facilitiesServices_div[0] else ""

            div_creditCard = content.find("div", class_="restaurant-details__services--info")

            creditCards = [os.path.basename(img['data-src']).split('-')[0] for img in div_creditCard.find_all("img")] if div_creditCard else ""


            phoneNumber = content.find("span", attrs={"x-ms-format-detection": "none"}).get_text().strip() if content.find("span", attrs={"x-ms-format-detection": "none"}) else ""


            div_website = content.find("div", class_="collapse__block-item link-item")

            # Find the <a> tag within this container and get the href attribute
            a_website = div_website.find("a", class_="link js-dtm-link") if div_website else ""
            website = a_website.get("href") if a_website!="" else ""


            # Append the extracted info as a new row to the list
            data.append([restaurantName,address,city,postal_code,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website])


# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=["restaurantName","Address","City","Postal Code","Country","Price Range","Cuisine Type","Description","facilitiesServices","creditCards","phoneNumber","website"])

display(df)



# Iterate through each row in the DataFrame
for i, row in df.iterrows():
    # Define the file name using the index
    file_name = f"restaurant_{i}.tsv"

    # Prepare row data as a single line with tab-separated values
    content =  f"{row['restaurantName']}\t{row['Address']}\t{row['City']}\t{row['Postal Code']}\t{row['Country']}\t{row['Price Range']}\t{row['Cuisine Type']}\t{row['Description']}\t{row['facilitiesServices']}\t{row['creditCards']}\t{row['phoneNumber']}\t{row['website']}\n"

    subfolder = f"tsv_files"
    file_path = os.path.join(subfolder, file_name)

    # Check if the subfolder exists, create it if it doesn't
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
    # Write the row data to the .tsv file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Created file: {file_name}")

Unnamed: 0,restaurantName,Address,City,Postal Code,Country,Price Range,Cuisine Type,Description,facilitiesServices,creditCards,phoneNumber,website
0,Roscioli,via dei Giubbonari 21,Rome,00186,Italy,€€,"Roman, Italian",This restaurant is part of one of the best foo...,"[Air conditioning, Interesting wine list]","[amex, dinersclub, mastercard, visa]",+39 06 687 5287,https://www.salumeriaroscioli.com/
1,Trattoria da Zamboni,via Santa Croce 73,Lapio,36057,Italy,€€,"Classic Cuisine, Italian Contemporary",Famous country trattoria: interiors in modern ...,"[Air conditioning, Car park, Great view, Inter...","[amex, dinersclub, mastercard, visa]",+39 0444 273079,https://www.trattoriazamboni.it/
2,ConTatto,via Gioberti 11,Frascati,00044,Italy,€€,Cuisine from Lazio,Situated in a small town just a few kilometres...,[Air conditioning],"[amex, dinersclub, mastercard, visa]",+39 06 2170 0957,http://www.contattoristorante.it
3,Emozioni,via Guglielmo Marconi 129,Campobasso,86100,Italy,€€,Contemporary,"Situated in the heart of the historic centre, ...",[Air conditioning],"[dinersclub, mastercard, visa]",+39 328 875 1903,http://www.ristoranteemozioni.com
4,Antica Osteria dei Camelì,via Marconi 13,Ambivere,24030,Italy,€€€,Modern Cuisine,Occupying an attractive farmhouse dating back ...,"[Air conditioning, Car park, Interesting wine ...","[amex, dinersclub, mastercard, visa]",+39 035 908000,https://www.anticaosteriadeicameli.it/
...,...,...,...,...,...,...,...,...,...,...,...,...
1977,La Villa,Contrada Cavallerizza SS 303 verso Rocchetta S...,Melfi,85025,Italy,€,"Cuisine from Basilicata, Traditional Cuisine",This country restaurant owes its welcoming atm...,"[Air conditioning, Car park]","[dinersclub, mastercard, visa]",+39 0972 236008,https://www.lavillamelfi.it/
1978,Lazaroun,via Del Platano 21,Santarcangelo di Romagna,47822,Italy,€€,"Cuisine from Romagna, Traditional Cuisine","The prototype of a typical Romagna restaurant,...","[Air conditioning, Terrace]","[amex, maestrocard, mastercard, visa]",+39 0541 624417,http://www.lazaroun.it
1979,Fracia,località Fracia,Teglio,23036,Italy,€,"Cuisine from Valtellina, Traditional Cuisine",Park the car and walk up a short track (about ...,[Terrace],"[amex, dinersclub, mastercard, visa]",+39 0342 482671,https://www.ristorantefracia.it/
1980,Casa Perbellini 12 Apostoli,vicolo Corticella San Marco 3,Verona,37121,Italy,€€€€,"Creative, Contemporary",Giancarlo Perbellini returns to his origins in...,"[Air conditioning, Counter dining, Interesting...","[amex, maestrocard, mastercard, visa]",+39 045 878 0860,http://www.casaperbellini.com


Created file: restaurant_0.tsv
Created file: restaurant_1.tsv
Created file: restaurant_2.tsv
Created file: restaurant_3.tsv
Created file: restaurant_4.tsv
Created file: restaurant_5.tsv
Created file: restaurant_6.tsv
Created file: restaurant_7.tsv
Created file: restaurant_8.tsv
Created file: restaurant_9.tsv
Created file: restaurant_10.tsv
Created file: restaurant_11.tsv
Created file: restaurant_12.tsv
Created file: restaurant_13.tsv
Created file: restaurant_14.tsv
Created file: restaurant_15.tsv
Created file: restaurant_16.tsv
Created file: restaurant_17.tsv
Created file: restaurant_18.tsv
Created file: restaurant_19.tsv
Created file: restaurant_20.tsv
Created file: restaurant_21.tsv
Created file: restaurant_22.tsv
Created file: restaurant_23.tsv
Created file: restaurant_24.tsv
Created file: restaurant_25.tsv
Created file: restaurant_26.tsv
Created file: restaurant_27.tsv
Created file: restaurant_28.tsv
Created file: restaurant_29.tsv
Created file: restaurant_30.tsv
Created file: rest

In [4]:
df.head()

Unnamed: 0,restaurantName,Address,City,Postal Code,Country,Price Range,Cuisine Type,Description,facilitiesServices,creditCards,phoneNumber,website
0,Roscioli,via dei Giubbonari 21,Rome,186,Italy,€€,"Roman, Italian",This restaurant is part of one of the best foo...,"[Air conditioning, Interesting wine list]","[amex, dinersclub, mastercard, visa]",+39 06 687 5287,https://www.salumeriaroscioli.com/
1,Trattoria da Zamboni,via Santa Croce 73,Lapio,36057,Italy,€€,"Classic Cuisine, Italian Contemporary",Famous country trattoria: interiors in modern ...,"[Air conditioning, Car park, Great view, Inter...","[amex, dinersclub, mastercard, visa]",+39 0444 273079,https://www.trattoriazamboni.it/
2,ConTatto,via Gioberti 11,Frascati,44,Italy,€€,Cuisine from Lazio,Situated in a small town just a few kilometres...,[Air conditioning],"[amex, dinersclub, mastercard, visa]",+39 06 2170 0957,http://www.contattoristorante.it
3,Emozioni,via Guglielmo Marconi 129,Campobasso,86100,Italy,€€,Contemporary,"Situated in the heart of the historic centre, ...",[Air conditioning],"[dinersclub, mastercard, visa]",+39 328 875 1903,http://www.ristoranteemozioni.com
4,Antica Osteria dei Camelì,via Marconi 13,Ambivere,24030,Italy,€€€,Modern Cuisine,Occupying an attractive farmhouse dating back ...,"[Air conditioning, Car park, Interesting wine ...","[amex, dinersclub, mastercard, visa]",+39 035 908000,https://www.anticaosteriadeicameli.it/


## ***2.0 Pre-processing:***

In [5]:
!pip install nltk

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
from nltk.corpus import words

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
english_words = set(words.words())  

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words and len(word) > 2 and word in english_words
    ]
    return words

In [7]:
df['Description'] = df['Description'].str.lower()
df['processed_description'] = df['Description'].apply(preprocess_text)
df['processed_description']

0       [restaurant, part, one, best, food, typical, r...
1       [famous, country, modern, style, view, large, ...
2       [situated, small, town, capital, restaurant, r...
3       [situated, heart, historic, quiet, restaurant,...
4       [attractive, farmhouse, dating, back, restaura...
                              ...                        
1977    [country, restaurant, welcoming, atmosphere, s...
1978    [prototype, typical, restaurant, run, efficien...
1979    [park, car, walk, short, track, rustic, stone,...
1980    [historic, restaurant, native, city, every, ch...
1981    [sea, table, restaurant, offer, sea, whether, ...
Name: processed_description, Length: 1982, dtype: object

In [8]:
df.head()

Unnamed: 0,restaurantName,Address,City,Postal Code,Country,Price Range,Cuisine Type,Description,facilitiesServices,creditCards,phoneNumber,website,processed_description
0,Roscioli,via dei Giubbonari 21,Rome,186,Italy,€€,"Roman, Italian",this restaurant is part of one of the best foo...,"[Air conditioning, Interesting wine list]","[amex, dinersclub, mastercard, visa]",+39 06 687 5287,https://www.salumeriaroscioli.com/,"[restaurant, part, one, best, food, typical, r..."
1,Trattoria da Zamboni,via Santa Croce 73,Lapio,36057,Italy,€€,"Classic Cuisine, Italian Contemporary",famous country trattoria: interiors in modern ...,"[Air conditioning, Car park, Great view, Inter...","[amex, dinersclub, mastercard, visa]",+39 0444 273079,https://www.trattoriazamboni.it/,"[famous, country, modern, style, view, large, ..."
2,ConTatto,via Gioberti 11,Frascati,44,Italy,€€,Cuisine from Lazio,situated in a small town just a few kilometres...,[Air conditioning],"[amex, dinersclub, mastercard, visa]",+39 06 2170 0957,http://www.contattoristorante.it,"[situated, small, town, capital, restaurant, r..."
3,Emozioni,via Guglielmo Marconi 129,Campobasso,86100,Italy,€€,Contemporary,"situated in the heart of the historic centre, ...",[Air conditioning],"[dinersclub, mastercard, visa]",+39 328 875 1903,http://www.ristoranteemozioni.com,"[situated, heart, historic, quiet, restaurant,..."
4,Antica Osteria dei Camelì,via Marconi 13,Ambivere,24030,Italy,€€€,Modern Cuisine,occupying an attractive farmhouse dating back ...,"[Air conditioning, Car park, Interesting wine ...","[amex, dinersclub, mastercard, visa]",+39 035 908000,https://www.anticaosteriadeicameli.it/,"[attractive, farmhouse, dating, back, restaura..."


## ***2.1 Conjuctive Query:***

In [9]:
import pandas as pd

vocab_dict = {}
term_id = 0

all_uniqe_words = set(word for description in df["processed_description"] for word in description)

for word in all_uniqe_words:
    vocab_dict[word] = term_id
    term_id+=1

vocab_dict

{'property': 0,
 'pheasant': 1,
 'theatrical': 2,
 'scorpion': 3,
 'octopus': 4,
 'life': 5,
 'informality': 6,
 'orient': 7,
 'cookery': 8,
 'give': 9,
 'concept': 10,
 'valley': 11,
 'convey': 12,
 'cousin': 13,
 'altitude': 14,
 'dominate': 15,
 'advice': 16,
 'fast': 17,
 'chat': 18,
 'formerly': 19,
 'anything': 20,
 'renovation': 21,
 'hidden': 22,
 'struck': 23,
 'starter': 24,
 'walk': 25,
 'boredom': 26,
 'sweet': 27,
 'obsession': 28,
 'sizzling': 29,
 'chapter': 30,
 'radius': 31,
 'lee': 32,
 'yet': 33,
 'knowledge': 34,
 'five': 35,
 'soon': 36,
 'delighting': 37,
 'quantity': 38,
 'touch': 39,
 'costume': 40,
 'far': 41,
 'overseen': 42,
 'already': 43,
 'integrity': 44,
 'trying': 45,
 'excelsior': 46,
 'neat': 47,
 'passionate': 48,
 'native': 49,
 'count': 50,
 'regal': 51,
 'impressive': 52,
 'persuade': 53,
 'ostentatious': 54,
 'chairman': 55,
 'wooded': 56,
 'convent': 57,
 'papal': 58,
 'colline': 59,
 'microcosm': 60,
 'picture': 61,
 'inviting': 62,
 'wrong': 63

In [10]:
vocab_df = pd.DataFrame(list(vocab_dict.items()), columns=['term', 'term_id'])
vocab_df.to_csv('vocabulary.csv', index=False)

vocab_df.head()

Unnamed: 0,term,term_id
0,property,0
1,pheasant,1
2,theatrical,2
3,scorpion,3
4,octopus,4


In [11]:
from collections import defaultdict
import json

inverted_idx = defaultdict(list)

for idx, description in enumerate(df['processed_description']):
    for word in description:
        term_id = vocab_dict[word]

        if idx not in inverted_idx[term_id]:
            inverted_idx[term_id].append(idx)


with open('inverted_index.json', 'w') as f:
    json.dump(inverted_idx, f)


In [12]:
def process_query(query, vocab_dict, inverted_index, df):
    query_words = preprocess_text(query)

    doc_sets = []
    for word in query_words:
        term_id = vocab_dict.get(word)
        if term_id is not None:
            doc_sets.append(set(inverted_index.get(term_id, [])))

    if doc_sets:
        result_docs = set.intersection(*doc_sets)
    else:
        result_docs = set()

    results = df.loc[result_docs, ["restaurantName", "Address", "Description", "website"]]
    return results


In [13]:
query = "modern seasonal cuisine"
results = process_query(query, vocab_dict, inverted_idx, df)
results

  results = df.loc[result_docs, ["restaurantName", "Address", "Description", "website"]]


Unnamed: 0,restaurantName,Address,Description,website
780,Razzo,via Andrea Doria 17/f,"a quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/
533,Zum Löwen,via Tirolo 25,a charming restaurant housed in a skilfully re...,
407,San Michele,via Castello di Fagagna 33,situated next to the ruins of the old castle a...,http://sanmichele.restaurant
920,Ronchi Rò,località Cime di Dolegna 12,ronchi rò is an estate-cum-agriturismo surroun...,https://www.ronchiro.it
1561,Piccolo Lord,corso San Maurizio 69 bis/g,"professional service in a welcoming, modern re...",https://www.ristorantepiccololord.it/
922,Le Vie del Borgo,via alla Piazza 6,le vie del borgo is situated in a restored rus...,https://www.leviedelborgoguesthouse.it/
1571,Il Tino,via Monte Cadria 127,enjoying an attractive location in the nautilu...,https://www.ristoranteiltino.com/
167,Ca' Del Moro,località Erbin 31,situated within the la collina dei ciliegi win...,https://www.cadelmoro.wine/it
1074,Chichibio,via Guglielmo Marconi 1,"despite its lack of awards, this restaurant st...",
53,RistoFante,via Mazzini 41,the motto of this restaurant is “in step with ...,https://www.ristofante.it/


## ***2.2 Ranked Search Engine with TF-IDF and Cosine Similarity:***

BY ME


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf_inverted_index(df):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['processed_description'].apply(lambda x: ' '.join(x)))

    terms = vectorizer.get_feature_names_out()
    term_to_id = {term: idx for idx, term in enumerate(terms)}
    inverted_index = defaultdict(list)

    for doc_id in range(tfidf_matrix.shape[0]):
        for term_id in tfidf_matrix[doc_id].nonzero()[1]:
            tfidf_score = tfidf_matrix[doc_id, term_id]
            inverted_index[term_id].append((doc_id, tfidf_score))

    return inverted_index, term_to_id, tfidf_matrix, vectorizer


In [15]:
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)

readable_inverted_index = {
    term: [(doc_id, round(tfidf_score, 3)) for doc_id, tfidf_score in inverted_index[term_id]]
    for term, term_id in term_to_id.items()
}

for term, doc_scores in readable_inverted_index.items():
    print(f"{term}: {doc_scores}")


abate: [(871, 0.214)]
abbey: [(341, 0.368), (992, 0.264), (1073, 0.335), (1867, 0.203)]
abbreviation: [(736, 0.466)]
ability: [(514, 0.145), (718, 0.2), (744, 0.194), (840, 0.181)]
able: [(1586, 0.249), (1785, 0.235), (1789, 0.269), (1849, 0.192)]
ably: [(120, 0.245), (122, 0.185), (228, 0.19), (362, 0.148), (590, 0.156), (677, 0.196), (726, 0.167), (744, 0.161), (842, 0.152), (1011, 0.148), (1084, 0.199), (1243, 0.186), (1262, 0.129), (1516, 0.152), (1715, 0.159)]
abroad: [(284, 0.375), (313, 0.21), (328, 0.207), (699, 0.24), (764, 0.25), (965, 0.159), (1141, 0.37), (1149, 0.133), (1155, 0.168), (1238, 0.256), (1304, 0.323), (1506, 0.178), (1571, 0.152), (1747, 0.188), (1761, 0.24)]
absinthe: [(912, 0.214)]
absolute: [(526, 0.173), (1196, 0.284)]
absolutely: [(296, 0.236), (1175, 0.265), (1653, 0.27)]
abundance: [(1122, 0.242), (1324, 0.343), (1616, 0.179)]
abundant: [(525, 0.288), (531, 0.271), (912, 0.17), (1063, 0.246), (1159, 0.285), (1288, 0.25), (1339, 0.176), (1598, 0.237), (18

In [16]:
import numpy as np

def search_query(query, tfidf_matrix, vectorizer, df, top_k=5):
    query_tfidf = vectorizer.transform([query])

    cosine_similarities = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    relevant_docs = np.argsort(-cosine_similarities)[:top_k]

    results = df.loc[relevant_docs, ["restaurantName", "Address", "Description", "website"]]
    results["Similarity Score"] = cosine_similarities[relevant_docs]

    return results


***TEST***

In [17]:
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)

query = "modern seasonal cuisine"
results = search_query(query, tfidf_matrix, vectorizer, df)

results

Unnamed: 0,restaurantName,Address,Description,website,Similarity Score
500,Saur,via Filippo Turati 8,"in a tiny rural village, this contemporary, al...",https://ristorantesaur.it,0.344814
545,20Tre,via David Chiossone 20 r,situated in the heart of genoa’s historic cent...,https://www.ristorante20tregenova.it/,0.336516
1110,La Botte,via Giuseppe Garibaldi 8,a modern and welcoming contemporary bistro sit...,http://www.trattorialabottestresa.it,0.31609
1486,Retrobottega,via della Stelletta 4,minimalist decor and clean lines characterise ...,https://www.retro-bottega.com,0.297514
780,Razzo,via Andrea Doria 17/f,"a quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/,0.28403


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def search_query_dscore(query, tfidf_matrix, vectorizer, df, top_k=5):
    query_tfidf = vectorizer.transform([query])

    cosine_similarities = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    return cosine_similarities

def calculate_score(doc, query, vectorizer, tfidf_matrix, cuisine_preferences, facility_preferences, price_preferences):
    score = 0

    # TF-IDF vector for the query
    query_tfidf = vectorizer.transform([query])


    doc_index = doc.name  # Index of the document in the dataframe
    doc_vec = tfidf_matrix[doc_index]

    #description score
    description_score = cosine_similarity(query_tfidf, doc_vec)[0, 0]  # cosine similarity
    score += description_score


    #cuisine type score
    for cuisine in cuisine_preferences:
        if cuisine.lower() in doc['Cuisine Type'].lower():
            score += 0.2

    #facilities score
    for facility in facility_preferences:
        if facility.lower() in [f.lower() for f in doc['facilitiesServices']]:
            score += 0.1

    #price range score
    for price in price_preferences:
        if price in doc['Price Range']:
            score += 0.2

    return score




In [20]:
import heapq

def ranked_restaurants(query, tfidf_matrix, vectorizer, df, top_k=5, cuisine_preferences=None, facility_preferences=None, price_preferences=None):
    # create an heap
    heap = []

    for doc_id, doc in df.iterrows():  # for each restaurant
        # personalized score
        score = calculate_score(
            doc,  # current restourant
            query,
            vectorizer,
            tfidf_matrix,
            cuisine_preferences or [],
            facility_preferences or [],
            price_preferences or []
        )

        # top k results in the heat
        if len(heap) < top_k:
            heapq.heappush(heap, (score, doc_id))  # Adding an element (score, ID doc)
        else:
            heapq.heappushpop(heap, (score, doc_id))  # Replace the smaller item if necessary

    # Sort the heap to get the results in descending order
    ranked_results = sorted(heap, key=lambda x: x[0], reverse=True)

    # Format the final results
    results = []
    for score, doc_id in ranked_results:
        row = df.iloc[doc_id]
        results.append({
            "restaurantName": row["restaurantName"],
            "Address": row["Address"],
            "Description": row["Description"],
            "website": row["website"],
            "custom_score": round(score, 3)
        })

    results_df2 = pd.DataFrame(results)
    return results_df2


In [21]:
from IPython.display import display

query = "modern seasonal cuisine"
cuisine_preferences = ["Italian", "French"]
facility_preferences = ["Terrace", "Air conditioning"]
price_preferences = ["$$", "$$$"]
top_k = 5

results_df = ranked_restaurants(query, tfidf_matrix, vectorizer, df, top_k, cuisine_preferences, facility_preferences, price_preferences)

# Use display for a tabular view
display(results_df)



Unnamed: 0,restaurantName,Address,Description,website,custom_score
0,Saur,via Filippo Turati 8,"in a tiny rural village, this contemporary, al...",https://ristorantesaur.it,0.745
1,Locanda Solagna,piazza I Novembre 2,although this restaurant has been in business ...,https://www.locandasolagna.it/,0.637
2,Miramonti l'Altro,via Crosette 34 località Costorio,a french-italian couple is at the helm in this...,https://www.miramontilaltro.it/,0.619
3,Il Galeone,piazzale Amendola 2,housed on the ground floor of the elisabeth du...,https://www.ilgaleone.net/,0.604
4,Savô,piazza XXV Aprile 8,the reopening in 2022 of the hotel windsor wit...,http://www.thewindsor.it,0.598


**The new scoring function improves the results because it incorporates additional variables such as the type of cuisine, the services available and the price range. This allows us to obtain results that are more relevant to the user's preferences, which would otherwise have been ignored in the original scoring function. For example, a restaurant that meets preferences in terms of cuisine and price, but has a less detailed description, is now considered more relevant.**

### Q4 1.1 ###

In [46]:
import pandas as pd
from geopy.geocoders import Nominatim
import time

geolocator = Nominatim(user_agent="restaurant_locator")

def get_coordinates(full_address):
    try:
        location = geolocator.geocode(full_address)
        if location:
            return location.latitude, location.longitude
        else:
            #print(f"Coordinates not found for: {full_address}")
            return None, None
    except Exception as e:
        print(f"Error fetching coordinates for {full_address}: {e}")
        return None, None


df['full_address'] =  df['restaurantName'] + ' ' + df['Country']



In [49]:
import os

main_directory = 'HTML/'  

data_list = []

for root, dirs, files in os.walk(main_directory):
    for filename in files:
        if filename.endswith('.html'):
            filepath = os.path.join(root, filename)
            
            with open(filepath, 'r', encoding='utf-8') as file:
                html_content = file.read()
            
            soup = BeautifulSoup(html_content, 'html.parser')
            json_ld_script = soup.find('script', type='application/ld+json')
            
            if json_ld_script:
                data = json.loads(json_ld_script.string)
                
                name = data.get("name")
                latitude = data.get("latitude")
                longitude = data.get("longitude")
                address_region = data["address"].get("addressRegion") if data.get("address") else None
                
                data_list.append({
                    "Name": name,
                    "Latitude": latitude,
                    "Longitude": longitude,
                    "AddressRegion": address_region
                })
                print(f"{name} saved.")
location_df = pd.DataFrame(data_list)

location_df.to_csv('location_data.csv', index=False)

print("location_data.csv saved.")


Roscioli kaydedildi
Trattoria da Zamboni kaydedildi
ConTatto kaydedildi
Emozioni kaydedildi
Antica Osteria dei Camelì kaydedildi
Casa Fantini/Lake Time kaydedildi
Sale Grosso kaydedildi
Osteria della Foce kaydedildi
Osteria Mondo d'Oro kaydedildi
Konnubio kaydedildi
Il Grano di Pepe kaydedildi
Angiò-Macelleria di Mare kaydedildi
Le Colonne kaydedildi
Osteria degli Angeli kaydedildi
Trattoria del Cimino dal 1895 kaydedildi
Novo Osteria kaydedildi
Trattoria di Via Serra kaydedildi
Aqua kaydedildi
Acqua Pazza kaydedildi
La Taverna di Bacco kaydedildi
Local kaydedildi
La Locanda di Fabio e Vale kaydedildi
Le Nove Scodelle kaydedildi
Al Gatto Verde kaydedildi
Ad Astra kaydedildi
Andree kaydedildi
Terramira kaydedildi
Zest kaydedildi
L'Acquario kaydedildi
Caracol kaydedildi
San Giorgio kaydedildi
Contrada Bricconi kaydedildi
Octavin kaydedildi
Osteria dell'Arco kaydedildi
Azotea kaydedildi
Nole kaydedildi
Leon d'Oro kaydedildi
Terrazza Tiberio kaydedildi
Casa Coloni kaydedildi
Tre Scalini ka

In [50]:
location_df.head()

Unnamed: 0,Name,Latitude,Longitude,AddressRegion
0,Roscioli,41.894249,12.474233,Lazio
1,Trattoria da Zamboni,45.47452,11.5357,Veneto
2,ConTatto,41.808042,12.682078,Lazio
3,Emozioni,41.56047,14.657181,Molise
4,Antica Osteria dei Camelì,45.719514,9.547377,Lombardy
