In [None]:
import subprocess
import sys

# List of required packages
required_packages = ['requests', 'beautifulsoup4', 'pandas']

# Function to install packages if they are not already installed
def install_package(package):
    try:
        __import__(package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install any missing packages
for package in required_packages:
    install_package(package)

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import os
import glob
import pandas as pd


In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
start_url = "https://guide.michelin.com/en/it/restaurants"
base_url = "https://guide.michelin.com"
next_page = start_url
link_list = []
while next_page:
    # Request page content
    response = requests.get(next_page,verify=False, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")
    # Find all restaurant links on the current page
    for link in soup.select("a.link"):
        href = link.get("href")
        if href and "/restaurant/" in href:
            link_list.append(base_url + href)
    # Look for the 'Next' button to proceed to the next page
    next_button = soup.find_all("a", class_="btn btn-outline-secondary btn-sm btn-carousel__link", href=True)
    if next_button:
        for content in next_button:
            if content.find("span", class_="icon fal fa-angle-right"):
                next_page = base_url+content["href"]
                break
            else:
                next_page = None

    else:
        next_page = None


# Display the collected links
print(f"Found {len(link_list)} restaurants:")
# Save to a text file
with open("urls.txt", "w") as file:
    for url in link_list:
        file.write(f"{url}\n")


In [None]:
# it is gonna take more 10 minutes
for index, link in enumerate(link_list):
    cnt = requests.get(link, verify=False, headers=headers)
    if cnt.status_code==200:
        html = BeautifulSoup(cnt.content, features="lxml")
        # Define the name of the subfolder and the filename
        subfolder = f"HTML/Page {str((index+20)//20)}"
        filename = f"{(link[link.rfind('/') + 1:]).replace('-', ' ')}.html"
        file_path = os.path.join(subfolder, filename)

        # Check if the subfolder exists, create it if it doesn't
        if not os.path.exists(subfolder):
            os.makedirs(subfolder)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html.prettify())
    else:
        print("Request denied!")
        break

In [4]:

# Initialize an empty list to store the rows for DataFrame
data = []
# Define the base directory
base_directory = "HTML"
# Use glob to find all directories matching "Page*"
page_folders = glob.glob(os.path.join(base_directory, "Page *"))

# Loop through each Page* directory
for page_folder in page_folders:
    # Get all HTML files in the current Page* directory
    html_files = glob.glob(os.path.join(page_folder, "*.html"))

    # Read each HTML file
    for html_file in html_files:
        with open(html_file, "r", encoding='utf-8') as file:  # Ensure correct encoding
            content = BeautifulSoup(file.read(), "html.parser")
            # Extract the required information
            restaurantName = content.find("h1",class_="data-sheet__title").get_text().strip() if content.find("h1",class_="data-sheet__title") else ""

            basic_info_first_row_list=content.findAll("div",class_="data-sheet__block--text")[0].text
            basic_info_first_row_striped_list = [info.strip() for info in basic_info_first_row_list.split(",")]
            address = " ".join(basic_info_first_row_striped_list[:-3]) if basic_info_first_row_striped_list[:-3] else ""
            city = basic_info_first_row_striped_list[-3] if basic_info_first_row_striped_list[-3] else ""
            postal_code = basic_info_first_row_striped_list[-2]  if basic_info_first_row_striped_list[-2] else ""
            country = basic_info_first_row_striped_list[-1]  if basic_info_first_row_striped_list[-1] else ""


            basic_info_second_row_list=content.findAll("div",class_="data-sheet__block--text")[1].text
            basic_info_second_row_striped_list = [info.strip() for info in basic_info_second_row_list.split("·")]

            priceRange = basic_info_second_row_striped_list[0] if basic_info_second_row_striped_list[0] else ""
            cuisineType = basic_info_second_row_striped_list[1]  if basic_info_second_row_striped_list[1] else ""

            description = content.find("div",class_="data-sheet__description").get_text().strip() if content.find("div",class_="data-sheet__description") else ""

            facilitiesServices_div = content.findAll("div", class_="col col-12 col-lg-6")
            facilitiesServices = [li.get_text(strip=True) for li in facilitiesServices_div[0].find_all("li")] if facilitiesServices_div[0] else ""

            div_creditCard = content.find("div", class_="restaurant-details__services--info")

            creditCards = [os.path.basename(img['data-src']).split('-')[0] for img in div_creditCard.find_all("img")] if div_creditCard else ""


            phoneNumber = content.find("span", attrs={"x-ms-format-detection": "none"}).get_text().strip() if content.find("span", attrs={"x-ms-format-detection": "none"}) else ""


            div_website = content.find("div", class_="collapse__block-item link-item")

            # Find the <a> tag within this container and get the href attribute
            a_website = div_website.find("a", class_="link js-dtm-link") if div_website else ""
            website = a_website.get("href") if a_website!="" else ""


            # Append the extracted info as a new row to the list
            data.append([restaurantName,address,city,postal_code,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website])


# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=["restaurantName","Address","City","Postal Code","Country","Price Range","Cuisine Type","Description","facilitiesServices","creditCards","phoneNumber","website"])

display(df)



# Iterate through each row in the DataFrame
for i, row in df.iterrows():
    # Define the file name using the index
    file_name = f"restaurant_{i}.tsv"

    # Prepare row data as a single line with tab-separated values
    content =  f"{row['restaurantName']}\t{row['Address']}\t{row['City']}\t{row['Postal Code']}\t{row['Country']}\t{row['Price Range']}\t{row['Cuisine Type']}\t{row['Description']}\t{row['facilitiesServices']}\t{row['creditCards']}\t{row['phoneNumber']}\t{row['website']}\n"

    subfolder = f"tsv_files"
    file_path = os.path.join(subfolder, file_name)

    # Check if the subfolder exists, create it if it doesn't
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
    # Write the row data to the .tsv file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Created file: {file_name}")

Unnamed: 0,restaurantName,Address,City,Postal Code,Country,Price Range,Cuisine Type,Description,facilitiesServices,creditCards,phoneNumber,website
0,Roscioli,via dei Giubbonari 21,Rome,00186,Italy,€€,"Roman, Italian",This restaurant is part of one of the best foo...,"[Air conditioning, Interesting wine list]","[amex, dinersclub, mastercard, visa]",+39 06 687 5287,https://www.salumeriaroscioli.com/
1,Trattoria da Zamboni,via Santa Croce 73,Lapio,36057,Italy,€€,"Classic Cuisine, Italian Contemporary",Famous country trattoria: interiors in modern ...,"[Air conditioning, Car park, Great view, Inter...","[amex, dinersclub, mastercard, visa]",+39 0444 273079,https://www.trattoriazamboni.it/
2,ConTatto,via Gioberti 11,Frascati,00044,Italy,€€,Cuisine from Lazio,Situated in a small town just a few kilometres...,[Air conditioning],"[amex, dinersclub, mastercard, visa]",+39 06 2170 0957,http://www.contattoristorante.it
3,Emozioni,via Guglielmo Marconi 129,Campobasso,86100,Italy,€€,Contemporary,"Situated in the heart of the historic centre, ...",[Air conditioning],"[dinersclub, mastercard, visa]",+39 328 875 1903,http://www.ristoranteemozioni.com
4,Antica Osteria dei Camelì,via Marconi 13,Ambivere,24030,Italy,€€€,Modern Cuisine,Occupying an attractive farmhouse dating back ...,"[Air conditioning, Car park, Interesting wine ...","[amex, dinersclub, mastercard, visa]",+39 035 908000,https://www.anticaosteriadeicameli.it/
...,...,...,...,...,...,...,...,...,...,...,...,...
1978,La Villa,Contrada Cavallerizza SS 303 verso Rocchetta S...,Melfi,85025,Italy,€,"Cuisine from Basilicata, Traditional Cuisine",This country restaurant owes its welcoming atm...,"[Air conditioning, Car park]","[dinersclub, mastercard, visa]",+39 0972 236008,https://www.lavillamelfi.it/
1979,Lazaroun,via Del Platano 21,Santarcangelo di Romagna,47822,Italy,€€,"Cuisine from Romagna, Traditional Cuisine","The prototype of a typical Romagna restaurant,...","[Air conditioning, Terrace]","[amex, maestrocard, mastercard, visa]",+39 0541 624417,http://www.lazaroun.it
1980,Fracia,località Fracia,Teglio,23036,Italy,€,"Cuisine from Valtellina, Traditional Cuisine",Park the car and walk up a short track (about ...,[Terrace],"[amex, dinersclub, mastercard, visa]",+39 0342 482671,https://www.ristorantefracia.it/
1981,Casa Perbellini 12 Apostoli,vicolo Corticella San Marco 3,Verona,37121,Italy,€€€€,"Creative, Contemporary",Giancarlo Perbellini returns to his origins in...,"[Air conditioning, Counter dining, Interesting...","[amex, maestrocard, mastercard, visa]",+39 045 878 0860,http://www.casaperbellini.com


Created file: restaurant_0.tsv
Created file: restaurant_1.tsv
Created file: restaurant_2.tsv
Created file: restaurant_3.tsv
Created file: restaurant_4.tsv
Created file: restaurant_5.tsv
Created file: restaurant_6.tsv
Created file: restaurant_7.tsv
Created file: restaurant_8.tsv
Created file: restaurant_9.tsv
Created file: restaurant_10.tsv
Created file: restaurant_11.tsv
Created file: restaurant_12.tsv
Created file: restaurant_13.tsv
Created file: restaurant_14.tsv
Created file: restaurant_15.tsv
Created file: restaurant_16.tsv
Created file: restaurant_17.tsv
Created file: restaurant_18.tsv
Created file: restaurant_19.tsv
Created file: restaurant_20.tsv
Created file: restaurant_21.tsv
Created file: restaurant_22.tsv
Created file: restaurant_23.tsv
Created file: restaurant_24.tsv
Created file: restaurant_25.tsv
Created file: restaurant_26.tsv
Created file: restaurant_27.tsv
Created file: restaurant_28.tsv
Created file: restaurant_29.tsv
Created file: restaurant_30.tsv
Created file: rest

## ***2.0 Pre-processing:***

In [22]:
!pip install nltk

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/sezermezgil/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [48]:
from nltk.corpus import words

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
english_words = set(words.words())  # İngilizce kelime listesi

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)
    words = [
        lemmatizer.lemmatize(word) 
        for word in words 
        if word not in stop_words and len(word) > 2 and word in english_words
    ]
    return words

In [49]:
df['Description'] = df['Description'].str.lower()
df['processed_description'] = df['Description'].apply(preprocess_text)
df['processed_description']

0       [restaurant, part, one, best, food, typical, r...
1       [famous, country, modern, style, view, large, ...
2       [situated, small, town, capital, restaurant, r...
3       [situated, heart, historic, quiet, restaurant,...
4       [attractive, farmhouse, dating, back, restaura...
                              ...                        
1978    [country, restaurant, welcoming, atmosphere, s...
1979    [prototype, typical, restaurant, run, efficien...
1980    [park, car, walk, short, track, rustic, stone,...
1981    [historic, restaurant, native, city, every, ch...
1982    [sea, table, restaurant, offer, sea, whether, ...
Name: processed_description, Length: 1983, dtype: object

In [50]:
df.head()

Unnamed: 0,restaurantName,Address,City,Postal Code,Country,Price Range,Cuisine Type,Description,facilitiesServices,creditCards,phoneNumber,website,processed_description
0,Roscioli,via dei Giubbonari 21,Rome,186,Italy,€€,"Roman, Italian",this restaurant is part of one of the best foo...,"[Air conditioning, Interesting wine list]","[amex, dinersclub, mastercard, visa]",+39 06 687 5287,https://www.salumeriaroscioli.com/,"[restaurant, part, one, best, food, typical, r..."
1,Trattoria da Zamboni,via Santa Croce 73,Lapio,36057,Italy,€€,"Classic Cuisine, Italian Contemporary",famous country trattoria: interiors in modern ...,"[Air conditioning, Car park, Great view, Inter...","[amex, dinersclub, mastercard, visa]",+39 0444 273079,https://www.trattoriazamboni.it/,"[famous, country, modern, style, view, large, ..."
2,ConTatto,via Gioberti 11,Frascati,44,Italy,€€,Cuisine from Lazio,situated in a small town just a few kilometres...,[Air conditioning],"[amex, dinersclub, mastercard, visa]",+39 06 2170 0957,http://www.contattoristorante.it,"[situated, small, town, capital, restaurant, r..."
3,Emozioni,via Guglielmo Marconi 129,Campobasso,86100,Italy,€€,Contemporary,"situated in the heart of the historic centre, ...",[Air conditioning],"[dinersclub, mastercard, visa]",+39 328 875 1903,http://www.ristoranteemozioni.com,"[situated, heart, historic, quiet, restaurant,..."
4,Antica Osteria dei Camelì,via Marconi 13,Ambivere,24030,Italy,€€€,Modern Cuisine,occupying an attractive farmhouse dating back ...,"[Air conditioning, Car park, Interesting wine ...","[amex, dinersclub, mastercard, visa]",+39 035 908000,https://www.anticaosteriadeicameli.it/,"[attractive, farmhouse, dating, back, restaura..."


## ***2.1 Conjuctive Query:***

In [51]:
import pandas as pd

vocab_dict = {}
term_id = 0

all_uniqe_words = set(word for description in df["processed_description"] for word in description)

for word in all_uniqe_words:
    vocab_dict[word] = term_id
    term_id+=1

vocab_dict

{'quinto': 0,
 'concentrate': 1,
 'commercial': 2,
 'barge': 3,
 'ostentatious': 4,
 'ran': 5,
 'glory': 6,
 'historic': 7,
 'company': 8,
 'starter': 9,
 'rump': 10,
 'affordable': 11,
 'avocado': 12,
 'beer': 13,
 'undisputed': 14,
 'beverage': 15,
 'reviving': 16,
 'rasa': 17,
 'silent': 18,
 'rabbit': 19,
 'young': 20,
 'seta': 21,
 'bread': 22,
 'considering': 23,
 'guarded': 24,
 'elevating': 25,
 'radish': 26,
 'comfortable': 27,
 'gulf': 28,
 'mandatory': 29,
 'seared': 30,
 'seminary': 31,
 'whisky': 32,
 'significantly': 33,
 'meet': 34,
 'knowledgeable': 35,
 'let': 36,
 'always': 37,
 'younger': 38,
 'group': 39,
 'right': 40,
 'relief': 41,
 'caval': 42,
 'paragon': 43,
 'relais': 44,
 'born': 45,
 'step': 46,
 'farmstead': 47,
 'structure': 48,
 'precious': 49,
 'catfish': 50,
 'intricate': 51,
 'six': 52,
 'touch': 53,
 'degree': 54,
 'woodruff': 55,
 'predominantly': 56,
 'preference': 57,
 'staff': 58,
 'boredom': 59,
 'brazier': 60,
 'province': 61,
 'successful': 62,

In [52]:
vocab_df = pd.DataFrame(list(vocab_dict.items()), columns=['term', 'term_id'])
vocab_df.to_csv('vocabulary.csv', index=False)

vocab_df.head()

Unnamed: 0,term,term_id
0,quinto,0
1,concentrate,1
2,commercial,2
3,barge,3
4,ostentatious,4


In [53]:
from collections import defaultdict
import json

inverted_idx = defaultdict(list)

for idx, description in enumerate(df['processed_description']):
    for word in description:
        term_id = vocab_dict[word]

        if idx not in inverted_idx[term_id]:
            inverted_idx[term_id].append(idx)


with open('inverted_index.json', 'w') as f:
    json.dump(inverted_idx, f)


In [54]:
def process_query(query, vocab_dict, inverted_index, df):
    # 1. Sorguyu işlemden geçir
    query_words = preprocess_text(query)
    
    # 2. Her sorgu teriminin term_id'sini al ve ilgili document_id'leri bul
    doc_sets = []
    for word in query_words:
        term_id = vocab_dict.get(word)
        if term_id is not None:
            doc_sets.append(set(inverted_index.get(term_id, [])))
    
    # 3. Tüm sorgu kelimeleri için kesişim al
    if doc_sets:
        result_docs = set.intersection(*doc_sets)
    else:
        result_docs = set()

    # 4. Sonuçları döndür
    results = df.loc[result_docs, ["restaurantName", "Address", "Description", "website"]]
    return results


In [55]:
query = "modern seasonal cuisine"
results = process_query(query, vocab_dict, inverted_idx, df)
results

  results = df.loc[result_docs, ["restaurantName", "Address", "Description", "website"]]


Unnamed: 0,restaurantName,Address,Description,website
781,Razzo,via Andrea Doria 17/f,"a quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/
533,Zum Löwen,via Tirolo 25,a charming restaurant housed in a skilfully re...,
407,San Michele,via Castello di Fagagna 33,situated next to the ruins of the old castle a...,http://sanmichele.restaurant
921,Ronchi Rò,località Cime di Dolegna 12,ronchi rò is an estate-cum-agriturismo surroun...,https://www.ronchiro.it
1562,Piccolo Lord,corso San Maurizio 69 bis/g,"professional service in a welcoming, modern re...",https://www.ristorantepiccololord.it/
923,Le Vie del Borgo,via alla Piazza 6,le vie del borgo is situated in a restored rus...,https://www.leviedelborgoguesthouse.it/
1572,Il Tino,via Monte Cadria 127,enjoying an attractive location in the nautilu...,https://www.ristoranteiltino.com/
167,Ca' Del Moro,località Erbin 31,situated within the la collina dei ciliegi win...,https://www.cadelmoro.wine/it
1075,Chichibio,via Guglielmo Marconi 1,"despite its lack of awards, this restaurant st...",
53,RistoFante,via Mazzini 41,the motto of this restaurant is “in step with ...,https://www.ristofante.it/


## ***2.2 Ranked Search Engine with TF-IDF and Cosine Similarity:***

BY ME


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf_inverted_index(df):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['processed_description'].apply(lambda x: ' '.join(x)))
    
    terms = vectorizer.get_feature_names_out()  
    term_to_id = {term: idx for idx, term in enumerate(terms)}  
    inverted_index = defaultdict(list)
    
    for doc_id in range(tfidf_matrix.shape[0]):  
        for term_id in tfidf_matrix[doc_id].nonzero()[1]:  
            tfidf_score = tfidf_matrix[doc_id, term_id]
            inverted_index[term_id].append((doc_id, tfidf_score))  
    
    return inverted_index, term_to_id, tfidf_matrix, vectorizer


In [57]:
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)

readable_inverted_index = {
    term: [(doc_id, round(tfidf_score, 3)) for doc_id, tfidf_score in inverted_index[term_id]]
    for term, term_id in term_to_id.items()
}

for term, doc_scores in readable_inverted_index.items():
    print(f"{term}: {doc_scores}")


abate: [(872, 0.214)]
abbey: [(341, 0.368), (993, 0.264), (1074, 0.335), (1868, 0.203)]
abbreviation: [(738, 0.466)]
ability: [(514, 0.145), (719, 0.2), (745, 0.194), (841, 0.181)]
able: [(1587, 0.249), (1786, 0.235), (1790, 0.269), (1850, 0.192)]
ably: [(120, 0.245), (122, 0.185), (228, 0.19), (362, 0.148), (590, 0.156), (678, 0.196), (727, 0.167), (745, 0.161), (843, 0.152), (1012, 0.148), (1085, 0.2), (1244, 0.186), (1263, 0.129), (1517, 0.152), (1716, 0.159)]
abroad: [(284, 0.375), (313, 0.21), (328, 0.207), (700, 0.24), (765, 0.25), (966, 0.159), (1142, 0.37), (1150, 0.133), (1156, 0.168), (1239, 0.256), (1305, 0.323), (1507, 0.178), (1572, 0.152), (1748, 0.188), (1762, 0.24)]
absinthe: [(913, 0.214)]
absolute: [(526, 0.173), (1197, 0.284)]
absolutely: [(296, 0.236), (1176, 0.265), (1654, 0.27)]
abundance: [(1123, 0.242), (1325, 0.343), (1617, 0.179)]
abundant: [(525, 0.288), (531, 0.271), (913, 0.17), (1064, 0.246), (1160, 0.285), (1289, 0.25), (1340, 0.176), (1599, 0.237), (1871

In [58]:
import numpy as np

def search_query(query, tfidf_matrix, vectorizer, df, top_k=5):
    query_tfidf = vectorizer.transform([query]) 
    
    cosine_similarities = (tfidf_matrix @ query_tfidf.T).toarray().flatten()  
    
    relevant_docs = np.argsort(-cosine_similarities)[:top_k]  
    
    results = df.loc[relevant_docs, ["restaurantName", "Address", "Description", "website"]]
    results["Similarity Score"] = cosine_similarities[relevant_docs] 
    
    return results


***TEST***

In [59]:
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)

query = "modern seasonal cuisine"
results = search_query(query, tfidf_matrix, vectorizer, df)

results

Unnamed: 0,restaurantName,Address,Description,website,Similarity Score
500,Saur,via Filippo Turati 8,"in a tiny rural village, this contemporary, al...",https://ristorantesaur.it,0.344817
545,20Tre,via David Chiossone 20 r,situated in the heart of genoa’s historic cent...,https://www.ristorante20tregenova.it/,0.336543
1111,La Botte,via Giuseppe Garibaldi 8,a modern and welcoming contemporary bistro sit...,http://www.trattorialabottestresa.it,0.316122
1487,Retrobottega,via della Stelletta 4,minimalist decor and clean lines characterise ...,https://www.retro-bottega.com,0.297534
781,Razzo,via Andrea Doria 17/f,"a quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/,0.284057
