
## **1. Data Collection**
### **1.1. Get the list of Michelin restaurants**

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from IPython.display import display

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
} # user agent is used to simulate that the http request comes from a real web browser, this prevent the server from blocking requests

def guide_michelin(): # 2037
        links = []
        for i in range(1,101): #100
            link = "https://guide.michelin.com/en/it/restaurants/page/{}".format(i)
            try:
                response = requests.get(link, headers=headers)
            except Exception as e:
                print(f"{e} \n {link}")
                continue
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                section = soup.find('div', class_="row restaurant__list-row js-restaurant__list_items")
                if section:  
                    for a_tag in section.find_all('a', href=True):
                        href = 'https://guide.michelin.com' + a_tag['href']
                        if href not in links and "/restaurant/" in href: 
                            links.append(href)
            else:
                print(f"Failed to retrieve page {i}")    
        return links

url_set = guide_michelin()
print(len(url_set))

Exception ignored in: <finalize object at 0x127109c20; dead>
Traceback (most recent call last):
  File "/Users/camillalabbate/anaconda3/lib/python3.11/weakref.py", line 590, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/camillalabbate/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1127, in _close_pool_connections
    while True:
KeyboardInterrupt: 


KeyboardInterrupt: 

In [3]:
with open('links.txt', 'w') as f:
    for url in url_set:
        f.write(url + '\n')

### **1.2. Crawl Michelin restaurant pages**

In [4]:
if not os.path.exists('pages'):
    os.makedirs('pages')

with open('links.txt', 'r') as f:
    urls = f.read().splitlines()

# Create directories and save HTML documents
for index, url in enumerate(urls):
    page_number = index // 20 + 1
    directory = os.path.join('pages', f'page_{page_number}')
    if not os.path.exists(directory):
        os.makedirs(directory)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            file_path = os.path.join(directory, f'document_{index}.html')
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
        else:
            print(f"Failed to retrieve {url}")
    except Exception as e:
        print(f"Error fetching {url}: {e}")

print("HTML documents saved successfully.")

In [5]:
dir_paths = [os.path.join('pages', dir) for dir in os.listdir('pages')]
len(dir_paths)

100

### **1.3. Parse downloaded pages**

In [3]:
# Function to extract restaurant details from HTML content
def extract_restaurant_details(content):
    
    # Extract the restaurant name
    name = content.find('h1', class_='data-sheet__title').get_text(strip=True) if content.find('h1', class_='data-sheet__title') else ""
    
    # Extract the first row of basic information
    firstRow = content.find_all("div", class_="data-sheet__block--text")[0].get_text(strip=True)
    #firstRow = content.find("div", class_="data-sheet__block--text").get_text(strip=True)
    firstRow_list = [info.strip() for info in firstRow.split(",")]

    address = " ".join(firstRow_list[:-3]) if len(firstRow_list) > 3 else ""
    city = firstRow_list[-3] if len(firstRow_list) > 2 else ""
    postalCode = firstRow_list[-2] if len(firstRow_list) > 1 else ""
    country = firstRow_list[-1] if firstRow_list else ""

    # Extract the second row of basic information
    secondRow = content.find_all("div", class_="data-sheet__block--text")[1].get_text(strip=True)
    #secondRow = content.find("div", class_="data-sheet__block--text").get_text(strip=True)
    secondRow_list = [info.strip() for info in secondRow.split("·")]

    priceRange = secondRow_list[0] if secondRow_list else ""
    cuisineType = secondRow_list[1] if len(secondRow_list) > 1 else ""

    # Extract the description
    description = content.find("div", class_="data-sheet__description").get_text(strip=True) if content.find("div", class_="data-sheet__description") else ""

    # Extract facilities and services
    facilitiesServices_div = content.find_all("div", class_="col col-12 col-lg-6")
    # facilitiesServices_div = content.find("div", class_="col col-12 col-lg-6")
    facilitiesServices = [li.get_text(strip=True) for li in facilitiesServices_div[0].find_all("li")] if facilitiesServices_div else []
    # facilitiesServices = [li.get_text(strip=True) for li in facilitiesServices_div.find("li")] if facilitiesServices_div else []

    # Extract credit card information
    creditCards_div = content.find("div", class_="restaurant-details__services--info")
    creditCards = [os.path.basename(img["data-src"]).split("-")[0] for img in creditCards_div.find_all("img")] if creditCards_div else []

    # Extract phone number
    phoneNumber = content.find("span", attrs={"x-ms-format-detection": "none"}).get_text(strip=True) if content.find("span", attrs={"x-ms-format-detection": "none"}) else ""

    # Extract website
    website_div = content.find("div", class_="collapse__block-item link-item")
    website = website_div.find("a", class_="link js-dtm-link")["href"] if website_div and website_div.find("a", class_="link js-dtm-link") else ""

    # Return the extracted data as a dictionary
    return {
        "restaurantName": name,
        "address": address,
        "city": city,
        "postalCode": postalCode,
        "country": country,
        "priceRange": priceRange,
        "cuisineType": cuisineType,
        "description": description,
        "facilitiesServices": facilitiesServices,
        "creditCards": creditCards,
        "phoneNumber": phoneNumber,
        "website": website
    }

# Collecting data from all HTML files
#folder_paths = [d for d in os.listdir('pages') if os.path.isdir(d) and d.startswith("page_")]
dir_paths = [os.path.join('pages', dir) for dir in os.listdir('pages')]

data = []
for dir in dir_paths:
    for html_file in os.listdir(dir):
        if html_file.endswith(".html"):
            with open(os.path.join(dir, html_file), "r", encoding="utf-8") as file:
                soup = BeautifulSoup(file, "html.parser")
                restaurant_details = extract_restaurant_details(soup)
                data.append(restaurant_details)

# Create a DataFrame from the data list
df = pd.DataFrame(data)

df.columns = ["restaurantName", "address", "city", "postalCode", "country", "priceRange", "cuisineType", "description", "facilitiesServices", "creditCards", "phoneNumber", "website"]


In [20]:
# Display the DataFrame
display(df)

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,San Lorenzo,piazza Sordini 6,Spoleto,06049,Italy,€€,"Seafood, Traditional Cuisine","Situated within the Clitunno hotel, this well-...","[Air conditioning, Terrace, Wheelchair access]","[amex, mastercard, visa]",+39 0743 223340,https://www.ristorantesanlorenzo.com/it/
1,Dalla Libera,via Farra 52,Sernaglia della Battaglia,31020,Italy,€,"Country cooking, Seasonal Cuisine","At this restaurant, an American-style barbecue...","[Car park, Interesting wine list, Terrace]","[amex, mastercard, visa]",+39 0438 966295,http://www.trattoriadallalibera.it
2,La Notizia 53,via Caravaggio 53/55,Naples,80126,Italy,€,Pizza,Although situated away from the tourist centre...,[Air conditioning],"[amex, maestrocard, mastercard, visa]",+39 081 714 2155,http://www.pizzarialanotizia.com
3,Madonnina del Pescatore,via Lungomare Italia 11,Marzocca,60019,Italy,€€€€,"Creative, Contemporary",It’s now forty years since the Madonnina opene...,"[Air conditioning, Great view, Interesting win...","[amex, mastercard, visa]",+39 071 698267,https://www.morenocedroni.it/
4,Da Vincenzo,viale Pasitea 172/178,Positano,84017,Italy,€€,"Campanian, Traditional Cuisine",It's never easy to find the right balance betw...,"[Air conditioning, Terrace]","[amex, mastercard, visa]",+39 089 875128,https://www.davincenzo.it/
...,...,...,...,...,...,...,...,...,...,...,...,...
1978,Foresta,via Litoranea 2,Marina di Pisa,56128,Italy,€€€,"Seafood, Classic Cuisine","Overlooking the Tyrrhenian Sea, all the tables...","[Air conditioning, Great view, Terrace, Wheelc...","[amex, mastercard, visa]",+39 050 35082,https://www.ristoranteforesta.com/
1979,Oberraindlhof,Raindl 49,Madonna di Senales,39020,Italy,€€,"Traditional Cuisine, Regional Cuisine",The family that run this restaurant occupying ...,"[Car park, Great view, Interesting wine list, ...","[amex, mastercard, visa]",+39 0473 679131,https://www.oberraindlhof.com/it/buongustaio
1980,Antica Trattoria Gianna,via Maggiore 12,Recorfano,26030,Italy,€,"Lombardian, Country cooking","A delightful family trattoria offering simple,...","[Air conditioning, Terrace]","[amex, dinersclub, mastercard, visa]",+39 0375 98351,
1981,Cavallino,corso Romita 83,Tortona,15057,Italy,€€€,Modern Cuisine,Three talented young entrepreneurs who are pas...,"[Air conditioning, Car park, Interesting wine ...","[amex, mastercard, visa]",+39 0131 862308,http://www.cavallino-tortona.it


# 2  Search Engine

### 2.0 Preprocessing the Text

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camillalabbate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stopwords, then apply stemming
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Apply to the description field
df['processed_description'] = df['description'].apply(preprocess_text)


### 2.1 Conjunctive Query

### 2.1.1 Create the Index!

In [6]:
from collections import defaultdict
import pandas as pd

vocabulary = {}
inverted_index = defaultdict(list)
term_id_counter = 0

for doc_id, description in enumerate(df['processed_description']):
    for word in description.split():
        # Map each unique word to a term_id
        if word not in vocabulary:
            vocabulary[word] = term_id_counter
            term_id_counter += 1
        term_id = vocabulary[word]
        inverted_index[term_id].append(doc_id)

# Save the vocabulary to a CSV file
pd.DataFrame(list(vocabulary.items()), columns=['term', 'term_id']).to_csv('vocabulary.csv', index=False)


In [7]:
import json

with open('inverted_index.json', 'w') as f:
    json.dump(inverted_index, f)


### 2.1.2 Execute the Query

In [28]:
def preprocess_query(query):
    query = query.lower()
    query = query.translate(str.maketrans('', '', string.punctuation))
    tokens = [stemmer.stem(word) for word in query.split() if word not in stop_words]
    return tokens

def conjunctive_query(query):
    query_terms = preprocess_query(query)
    term_ids = [vocabulary.get(term) for term in query_terms if term in vocabulary]

    if not term_ids:
        return pd.DataFrame(columns=["restaurantName", "address", "description", "website"])

    # Start with the document list for the first term, then intersect with others
    matching_docs = set(inverted_index[term_ids[0]])
    for term_id in term_ids[1:]:
        matching_docs &= set(inverted_index[term_id])

    results = df.loc[list(matching_docs), ["restaurantName", "address", "description", "website"]]
    return results



In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_description'])


In [None]:
tfidf_index = defaultdict(list)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Loop over each term (feature) in the TF-IDF matrix
for term_id, term in enumerate(feature_names):
    # Get non-zero document indices and the corresponding scores for this term
    doc_indices = tfidf_matrix[:, term_id].nonzero()[0]
    scores = tfidf_matrix[:, term_id].data
    
    # Append each document ID and score to the tfidf_index dictionary 
    for doc_id, score in zip(doc_indices, scores):
        tfidf_index[term].append((doc_id, score))



### 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def ranked_query(query, top_k=5):
    query_vec = tfidf_vectorizer.transform([preprocess_text(query)])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_doc_indices = cosine_similarities.argsort()[-top_k:][::-1]

    results = df.loc[top_doc_indices, ['restaurantName', 'address', 'description', 'website']]
    results['similarity_score'] = cosine_similarities[top_doc_indices]
    return results



### Testing

In [27]:
# Test the conjunctive query
query = "modern seasonal cusine"
conjunctive_results = conjunctive_query(query)
print(conjunctive_results)
display(conjunctive_results)
# Test the ranked query
ranked_results = ranked_query(query, top_k=5)
print(ranked_results)
display(ranked_results)


                       restaurantName  \
1538                             Olmo   
1156                        Terramira   
135   Cappuccini Cucina San Francesco   
1160                         ConTatto   
265                         Chichibio   
...                               ...   
1652                            Razzo   
117                          La Botte   
1786          Materia | Spazio Cucina   
1277                      La Bandiera   
1150                      Tre Scalini   

                                             address  \
1538  Piazza della Chiesa 7 loc. San Pietro all'Olmo   
1156                        piazza della Vittoria 13   
135                                via Cappuccini 54   
1160                                 via Gioberti 11   
265                          via Guglielmo Marconi 1   
...                                              ...   
1652                           via Andrea Doria 17/f   
117                         via Giuseppe Garibaldi 8   
178

Unnamed: 0,restaurantName,address,description,website
1538,Olmo,Piazza della Chiesa 7 loc. San Pietro all'Olmo,Olmo takes its name from the large elm tree th...,http://cucinapop.do
1156,Terramira,piazza della Vittoria 13,"Having gained valuable experience elsewhere, t...",https://terramira.it
135,Cappuccini Cucina San Francesco,via Cappuccini 54,"Housed in the resort of the same name, this el...",https://www.cappuccini.it/
1160,ConTatto,via Gioberti 11,Situated in a small town just a few kilometres...,http://www.contattoristorante.it
265,Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant st...",
...,...,...,...,...
1652,Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/
117,La Botte,via Giuseppe Garibaldi 8,A modern and welcoming contemporary bistro sit...,http://www.trattorialabottestresa.it
1786,Materia | Spazio Cucina,via Teatro Massimo 29,The entrance to this restaurant is typical of ...,https://www.materiaspaziocucina.it/
1277,La Bandiera,contrada Pastini 4,Although it takes a while to reach this restau...,https://www.labandiera.it/


     restaurantName                                  address  \
117        La Botte                 via Giuseppe Garibaldi 8   
1359           Saur                     via Filippo Turati 8   
1652          Razzo                    via Andrea Doria 17/f   
656    Piccolo Lord              corso San Maurizio 69 bis/g   
1750       La Valle  via Umberto I 25 località Valle Sauglio   

                                            description  \
117   A modern and welcoming contemporary bistro sit...   
1359  In a tiny rural village, this contemporary, al...   
1652  A quiet restaurant with a relaxed, young and m...   
656   Professional service in a welcoming, modern re...   
1750  A well - run restaurant in a quiet area just o...   

                                    website  similarity_score  
117    http://www.trattorialabottestresa.it          0.280463  
1359              https://ristorantesaur.it          0.280150  
1652                 https://vadoarazzo.it/          0.252540  
656 

Unnamed: 0,restaurantName,address,description,website,similarity_score
117,La Botte,via Giuseppe Garibaldi 8,A modern and welcoming contemporary bistro sit...,http://www.trattorialabottestresa.it,0.280463
1359,Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, al...",https://ristorantesaur.it,0.28015
1652,Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/,0.25254
656,Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern re...",https://www.ristorantepiccololord.it/,0.231222
1750,La Valle,via Umberto I 25 località Valle Sauglio,A well - run restaurant in a quiet area just o...,https://www.ristorantelavalle.it/,0.220168
