In [None]:
import subprocess
import sys

# pip distribution names this notebook depends on. A distribution name is not
# always the importable module name: beautifulsoup4 is imported as `bs4`.
required_packages = ['requests', 'beautifulsoup4', 'pandas']

# Function to install packages if they are not already installed
def install_package(package, import_name=None):
    """Install ``package`` with pip unless its module can already be imported.

    Args:
        package: Distribution name as understood by ``pip install``.
        import_name: Module name used to probe for an existing installation.
            When omitted, a small table of distributions whose import name
            differs from the pip name is consulted, falling back to
            ``package`` itself.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    import importlib.util

    # The pip name and the import name can differ (beautifulsoup4 -> bs4);
    # probing with the pip name would report "missing" on every single run.
    known_import_names = {'beautifulsoup4': 'bs4'}
    module_name = import_name or known_import_names.get(package, package)

    try:
        # find_spec locates a top-level module without executing it.
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        already_installed = False

    if not already_installed:
        # sys.executable targets the interpreter that runs this kernel.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install any missing packages. This runs every time the cell is executed:
# each entry of required_packages is handed to install_package, which installs
# it with pip when it is not already available.
for package in required_packages:
    install_package(package)

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import os
import glob
import pandas as pd


In [None]:
# Browser-like User-Agent sent with every request; the download cell reuses `headers`.
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36'}
start_url = "https://guide.michelin.com/en/it/restaurants"
base_url = "https://guide.michelin.com"
next_page = start_url
link_list = []
visited_pages = set()  # guards against a pagination link that loops back

while next_page and next_page not in visited_pages:
    visited_pages.add(next_page)

    # Request page content. TLS certificate verification is left at its
    # default (enabled), and a timeout keeps a stalled connection from
    # hanging the cell forever.
    response = requests.get(next_page, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Stopping: {next_page} returned HTTP {response.status_code}")
        break
    soup = BeautifulSoup(response.content, features="lxml")

    # Find all restaurant links on the current page
    for link in soup.select("a.link"):
        href = link.get("href")
        if href and "/restaurant/" in href:
            link_list.append(base_url + href)

    # Look for the 'Next' button: among the carousel buttons, the one that
    # contains the right-angle icon. No such button means this was the last page.
    next_page = None
    for button in soup.find_all("a", class_="btn btn-outline-secondary btn-sm btn-carousel__link", href=True):
        if button.find("span", class_="icon fal fa-angle-right"):
            next_page = base_url + button["href"]
            break

# Display the collected links
print(f"Found {len(link_list)} restaurants:")
# Save to a text file, one URL per line
with open("urls.txt", "w", encoding="utf-8") as file:
    for url in link_list:
        file.write(f"{url}\n")


In [None]:
# Downloading every restaurant page takes more than 10 minutes.
for index, link in enumerate(link_list):
    # TLS verification stays at its default (enabled); the timeout prevents a
    # single stalled request from blocking the whole download.
    cnt = requests.get(link, headers=headers, timeout=30)
    if cnt.status_code != 200:
        # Stop at the first refusal instead of hammering the site further.
        print("Request denied!")
        break

    html = BeautifulSoup(cnt.content, features="lxml")

    # Pages are grouped 20 per folder: indices 0-19 -> "Page 1", 20-39 -> "Page 2", ...
    subfolder = os.path.join("HTML", f"Page {index // 20 + 1}")
    # File name is the last URL segment with dashes turned into spaces; a
    # trailing slash is dropped first so the name is never empty.
    slug = link.rstrip('/').rsplit('/', 1)[-1]
    filename = f"{slug.replace('-', ' ')}.html"
    file_path = os.path.join(subfolder, filename)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(subfolder, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(html.prettify())

In [None]:

def _text_or_empty(tag):
    """Return the stripped text of a BeautifulSoup tag, or "" when the tag was not found."""
    return tag.get_text().strip() if tag is not None else ""


def _split_location(text):
    """Split 'street, ..., city, postal code, country' into (address, city, postal_code, country).

    The last three comma-separated parts are city, postal code and country;
    everything before them is joined with spaces into the address. When fewer
    than three parts exist, the missing leading fields are returned as "".
    """
    parts = [part.strip() for part in text.split(",")]
    if len(parts) < 3:
        parts = [""] * (3 - len(parts)) + parts
    return " ".join(parts[:-3]), parts[-3], parts[-2], parts[-1]


def _split_price_and_cuisine(text):
    """Split 'price · cuisine' into (price_range, cuisine_type); a missing cuisine becomes ""."""
    parts = [part.strip() for part in text.split("·")]
    return parts[0], (parts[1] if len(parts) > 1 else "")


def _parse_restaurant_page(soup):
    """Extract one DataFrame row (a 12-item list) from a parsed restaurant page.

    Every field falls back to "" when its element is absent, so an incomplete
    page yields a sparse row instead of aborting the whole run.
    """
    restaurant_name = _text_or_empty(soup.find("h1", class_="data-sheet__title"))

    # First info block: location line; second info block: price and cuisine.
    info_blocks = soup.find_all("div", class_="data-sheet__block--text")
    location_text = info_blocks[0].text if len(info_blocks) > 0 else ""
    price_text = info_blocks[1].text if len(info_blocks) > 1 else ""
    address, city, postal_code, country = _split_location(location_text)
    price_range, cuisine_type = _split_price_and_cuisine(price_text)

    description = _text_or_empty(soup.find("div", class_="data-sheet__description"))

    # Facilities are the <li> items of the first matching column, when present.
    facility_columns = soup.find_all("div", class_="col col-12 col-lg-6")
    if facility_columns:
        facilities = [li.get_text(strip=True) for li in facility_columns[0].find_all("li")]
    else:
        facilities = ""

    # Card names are taken from the image file name, up to its first dash.
    cards_div = soup.find("div", class_="restaurant-details__services--info")
    if cards_div is not None:
        credit_cards = [
            os.path.basename(img["data-src"]).split("-")[0]
            for img in cards_div.find_all("img")
            if img.get("data-src")
        ]
    else:
        credit_cards = ""

    phone_number = _text_or_empty(soup.find("span", attrs={"x-ms-format-detection": "none"}))

    # The website link sits inside a dedicated container; either may be missing.
    website_div = soup.find("div", class_="collapse__block-item link-item")
    website_link = website_div.find("a", class_="link js-dtm-link") if website_div is not None else None
    website = (website_link.get("href") or "") if website_link is not None else ""

    return [restaurant_name, address, city, postal_code, country, price_range,
            cuisine_type, description, facilities, credit_cards, phone_number, website]


def _page_sort_key(folder):
    """Order 'Page N' folders by N so that 'Page 10' follows 'Page 9'."""
    suffix = os.path.basename(folder).rpartition(" ")[2]
    return (int(suffix) if suffix.isdigit() else float("inf"), folder)


# Initialize an empty list to store the rows for DataFrame
data = []
# Define the base directory
base_directory = "HTML"
# Use glob to find all directories matching "Page*"; glob returns entries in
# arbitrary order, so sort to keep row numbering stable between runs.
page_folders = sorted(glob.glob(os.path.join(base_directory, "Page *")), key=_page_sort_key)

# Loop through each Page* directory
for page_folder in page_folders:
    # Get all HTML files in the current Page* directory
    html_files = sorted(glob.glob(os.path.join(page_folder, "*.html")))

    # Read each HTML file
    for html_file in html_files:
        with open(html_file, "r", encoding='utf-8') as file:  # Ensure correct encoding
            content = BeautifulSoup(file.read(), "html.parser")
        # Append the extracted info as a new row to the list
        data.append(_parse_restaurant_page(content))


# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=["restaurantName","Address","City","Postal Code","Country","Price Range","Cuisine Type","Description","facilitiesServices","creditCards","phoneNumber","website"])

display(df)



# Column order of every exported TSV row.
tsv_columns = ['restaurantName', 'Address', 'City', 'Postal Code', 'Country',
               'Price Range', 'Cuisine Type', 'Description', 'facilitiesServices',
               'creditCards', 'phoneNumber', 'website']

# Create the output folder once, up front.
subfolder = "tsv_files"
os.makedirs(subfolder, exist_ok=True)

# Iterate through each row in the DataFrame
for i, row in df.iterrows():
    # Define the file name using the index
    file_name = f"restaurant_{i}.tsv"
    file_path = os.path.join(subfolder, file_name)

    # Prepare row data as a single line with tab-separated values. Whitespace
    # runs inside a field (including tabs and newlines left over from the
    # scraped HTML) are collapsed to one space so the file is exactly one row
    # with one column per field.
    fields = [" ".join(str(row[column]).split()) for column in tsv_columns]
    tsv_line = "\t".join(fields) + "\n"

    # Write the row data to the .tsv file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(tsv_line)

    print(f"Created file: {file_name}")

## ***2.0 Pre-processing:***

In [None]:
!pip install nltk

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('words')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk



In [5]:
from nltk.corpus import words

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Tokens are lower-cased before the lookup, so the reference vocabulary must be
# lower-cased too; otherwise entries stored capitalised can never match.
english_words = {entry.lower() for entry in words.words()}

def preprocess_text(text):
    """Turn free text into a list of cleaned, lemmatised tokens.

    Steps: lower-case, drop every character that is not a letter or
    whitespace, collapse whitespace, tokenise, then keep tokens that are not
    stop words, are longer than two characters, and are found in the English
    word list either as written or in lemmatised form.

    Args:
        text: The raw string to process.

    Returns:
        A list of lemmatised tokens in their original order.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = []
    for token in word_tokenize(text):
        if token in stop_words or len(token) <= 2:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check the lemma as well: filtering only on the surface form would
        # discard inflected words before the lemmatiser gets to normalise them.
        if token in english_words or lemma in english_words:
            tokens.append(lemma)
    return tokens

In [None]:
# Lower-case the stored descriptions in place, then derive a list-valued
# column of cleaned tokens from them; the last line displays that new column.
lowered_descriptions = df['Description'].str.lower()
df['Description'] = lowered_descriptions
df['processed_description'] = lowered_descriptions.apply(preprocess_text)
df['processed_description']

In [None]:
# Preview the first rows of df, including the processed_description column.
df.head()

## ***2.1 Conjunctive Query:***

In [None]:
import pandas as pd

# Every distinct token that survived preprocessing, across all descriptions.
all_uniqe_words = set(word for description in df["processed_description"] for word in description)

# Number the terms in sorted order. Iterating the set directly would hand out
# ids in an order that can change from one interpreter run to the next, making
# vocabulary.csv and the saved index unreproducible.
vocab_dict = {word: term_id for term_id, word in enumerate(sorted(all_uniqe_words))}

vocab_dict

In [None]:
# Two-column table (term, term_id) built from the vocabulary mapping, saved
# to disk without the row index, then previewed.
vocab_df = pd.DataFrame({
    'term': list(vocab_dict.keys()),
    'term_id': list(vocab_dict.values()),
})
vocab_df.to_csv('vocabulary.csv', index=False)

vocab_df.head()

In [10]:
from collections import defaultdict
import json

# term_id -> list of row positions whose description contains the term.
inverted_idx = defaultdict(list)

for idx, description in enumerate(df['processed_description']):
    # dict.fromkeys drops repeated tokens while keeping first-occurrence
    # order, so each document is appended at most once per term without
    # scanning the posting list on every token.
    for word in dict.fromkeys(description):
        inverted_idx[vocab_dict[word]].append(idx)


# JSON object keys are always strings, so the integer term ids are written as
# strings and come back as str if this file is loaded again.
with open('inverted_index.json', 'w') as f:
    json.dump(inverted_idx, f)


In [11]:
def process_query(query, vocab_dict, inverted_index, df):
    """Run a conjunctive (AND) query over the inverted index.

    Args:
        query: Free-text query; it is passed through ``preprocess_text``.
        vocab_dict: Mapping of term -> term id.
        inverted_index: Mapping of term id -> iterable of document labels.
        df: DataFrame whose index labels are the document ids in the index.

    Returns:
        The rows of ``df`` containing every query term (restaurantName,
        Address, Description and website columns), ordered by document id.
        Empty when the query has no usable terms or any term is unknown.
    """
    columns = ["restaurantName", "Address", "Description", "website"]
    query_words = preprocess_text(query)

    result_docs = None
    for word in query_words:
        term_id = vocab_dict.get(word)
        if term_id is None:
            # A term that occurs in no document makes an AND query unsatisfiable.
            result_docs = set()
            break
        postings = set(inverted_index.get(term_id, []))
        result_docs = postings if result_docs is None else result_docs & postings
        if not result_docs:
            break  # intersection already empty; further terms cannot add documents

    if result_docs is None:
        result_docs = set()

    # .loc needs an ordered list-like; a set is not accepted as an indexer.
    return df.loc[sorted(result_docs), columns]


In [None]:
# Example conjunctive search: only restaurants whose description contains
# every query term are returned.
query = "modern seasonal cuisine"
results = process_query(
    query=query,
    vocab_dict=vocab_dict,
    inverted_index=inverted_idx,
    df=df,
)
results

## ***2.2 Ranked Search Engine with TF-IDF and Cosine Similarity:***

BY ME


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf_inverted_index(df):
    """Fit a TF-IDF model on the processed descriptions and index its scores.

    Args:
        df: DataFrame with a ``processed_description`` column holding a list
            of tokens per row.

    Returns:
        A tuple ``(inverted_index, term_to_id, tfidf_matrix, vectorizer)``:
        ``inverted_index`` maps term id -> list of ``(doc position, score)``
        pairs using plain ``int``/``float`` values, ``term_to_id`` maps each
        feature name to its column, ``tfidf_matrix`` is the fitted sparse
        document-term matrix and ``vectorizer`` the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer()
    documents = [' '.join(tokens) for tokens in df['processed_description']]
    tfidf_matrix = vectorizer.fit_transform(documents)

    terms = vectorizer.get_feature_names_out()
    term_to_id = {term: idx for idx, term in enumerate(terms)}
    inverted_index = defaultdict(list)

    # One pass over the stored entries in coordinate form, instead of indexing
    # the sparse matrix once per (document, term) pair.
    entries = tfidf_matrix.tocoo()
    for doc_id, term_id, tfidf_score in zip(entries.row, entries.col, entries.data):
        if tfidf_score != 0:
            # Plain Python numbers keep the index JSON-serialisable.
            inverted_index[int(term_id)].append((int(doc_id), float(tfidf_score)))

    return inverted_index, term_to_id, tfidf_matrix, vectorizer


In [None]:
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)


def _rounded_postings(postings, ndigits=3):
    """Return the (doc_id, score) pairs with each score rounded for display."""
    return [(doc_id, round(tfidf_score, ndigits)) for doc_id, tfidf_score in postings]


# Same index, keyed by the term itself instead of its numeric id.
readable_inverted_index = {
    term: _rounded_postings(inverted_index[term_id])
    for term, term_id in term_to_id.items()
}

# One line per term: "<term>: [(doc, score), ...]"
for term, doc_scores in readable_inverted_index.items():
    print(f"{term}: {doc_scores}")


In [14]:
import numpy as np

def search_query(query, tfidf_matrix, vectorizer, df, top_k=5):
    """Rank restaurants against a free-text query using TF-IDF similarity.

    Args:
        query: Free-text query. It goes through ``preprocess_text`` so that
            it is normalised the same way as the indexed descriptions.
        tfidf_matrix: Sparse document-term matrix; row ``i`` describes the
            ``i``-th row of ``df``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        df: DataFrame holding the restaurant records.
        top_k: Maximum number of rows to return.

    Returns:
        A new DataFrame with the ``top_k`` best rows (restaurantName, Address,
        Description, website) plus a "Similarity Score" column, best first.
    """
    columns = ["restaurantName", "Address", "Description", "website"]

    normalized_query = ' '.join(preprocess_text(query))
    query_tfidf = vectorizer.transform([normalized_query])

    # Dot product of the vectors; this is the cosine similarity when both
    # sides are L2-normalised (presumably the vectorizer's default - confirm).
    cosine_similarities = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    # Stable sort keeps equal scores in document order.
    relevant_docs = np.argsort(-cosine_similarities, kind="stable")[:top_k]

    # argsort yields row positions, so select with iloc rather than by label.
    results = df.iloc[relevant_docs][columns].copy()
    results["Similarity Score"] = cosine_similarities[relevant_docs]

    return results


***TEST***

In [None]:
# Rebuild the TF-IDF artefacts, then run one ranked query against them.
inverted_index, term_to_id, tfidf_matrix, vectorizer = build_tfidf_inverted_index(df)

query = "modern seasonal cuisine"
results = search_query(
    query=query,
    tfidf_matrix=tfidf_matrix,
    vectorizer=vectorizer,
    df=df,
)

results

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def search_query_dscore(query, tfidf_matrix, vectorizer, df, top_k=5):
    """Return the similarity of ``query`` to every document as a 1-D array.

    Args:
        query: Free-text query. It goes through ``preprocess_text`` so that
            it is normalised the same way as the indexed descriptions.
        tfidf_matrix: Sparse document-term matrix, one row per document.
        vectorizer: Fitted vectorizer exposing ``transform``.
        df: Unused; kept so the signature mirrors ``search_query``.
        top_k: Unused; kept so the signature mirrors ``search_query``.

    Returns:
        Array with one score per row of ``tfidf_matrix``, in row order.
    """
    normalized_query = ' '.join(preprocess_text(query))
    query_tfidf = vectorizer.transform([normalized_query])

    # Dot product of each document row with the query vector.
    cosine_similarities = (tfidf_matrix @ query_tfidf.T).toarray().flatten()

    return cosine_similarities

def calculate_score(doc, query, vectorizer, tfidf_matrix, cuisine_preferences, facility_preferences, price_preferences):
    """Score one restaurant: description similarity plus preference bonuses.

    Args:
        doc: One DataFrame row (a Series). ``doc.name`` is used as the row
            number into ``tfidf_matrix`` - assumes a default 0..n-1 index.
        query: Free-text query. It goes through ``preprocess_text`` so that
            it is normalised the same way as the indexed descriptions.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Sparse document-term matrix, one row per document.
        cuisine_preferences: Cuisine names; +0.2 for each one found
            (case-insensitively) inside the row's "Cuisine Type".
        facility_preferences: Facility names; +0.1 for each one that equals
            (case-insensitively) an entry of the row's "facilitiesServices".
        price_preferences: Price symbols; +0.2 for each one equal to the
            row's "Price Range".

    Returns:
        The cosine similarity between query and description, plus bonuses.
    """
    score = 0

    # TF-IDF vector for the query
    query_tfidf = vectorizer.transform([' '.join(preprocess_text(query))])

    doc_index = doc.name  # Index of the document in the dataframe
    doc_vec = tfidf_matrix[doc_index]

    # description score
    description_score = cosine_similarity(query_tfidf, doc_vec)[0, 0]  # cosine similarity
    score += description_score

    # cuisine type score (substring match: "Italian" matches "Italian Contemporary")
    cuisine_text = str(doc['Cuisine Type']).lower()
    for cuisine in cuisine_preferences:
        if cuisine.lower() in cuisine_text:
            score += 0.2

    # facilities score; the column holds a list, or "" when nothing was scraped
    facilities = doc['facilitiesServices']
    if isinstance(facilities, str):
        facilities = [facilities] if facilities else []
    available_facilities = {facility.lower() for facility in facilities}
    for facility in facility_preferences:
        if facility.lower() in available_facilities:
            score += 0.1

    # price range score: compare whole symbols. A substring test would let
    # "$$" match "$$$$" and count one restaurant for several preferences.
    price_range = str(doc['Price Range']).strip()
    for price in price_preferences:
        if price.strip() == price_range:
            score += 0.2

    return score




In [17]:
import heapq

def ranked_restaurants(query, tfidf_matrix, vectorizer, df, top_k=5, cuisine_preferences=None, facility_preferences=None, price_preferences=None):
    """Return the ``top_k`` restaurants ranked by the personalised score.

    Args:
        query: Free-text query forwarded to ``calculate_score``.
        tfidf_matrix: Sparse document-term matrix forwarded to ``calculate_score``.
        vectorizer: Fitted vectorizer forwarded to ``calculate_score``.
        df: DataFrame holding the restaurant records.
        top_k: Maximum number of restaurants to return.
        cuisine_preferences: Optional list of preferred cuisines.
        facility_preferences: Optional list of preferred facilities.
        price_preferences: Optional list of preferred price symbols.

    Returns:
        A DataFrame with restaurantName, Address, Description, website and
        custom_score (rounded to three decimals), best score first. Equal
        scores are ordered by descending row label.
    """
    cuisine_preferences = cuisine_preferences or []
    facility_preferences = facility_preferences or []
    price_preferences = price_preferences or []

    def scored_rows():
        # The row position travels with the score so the winning rows can be
        # fetched positionally, whatever labels the DataFrame index carries.
        for position, (doc_id, doc) in enumerate(df.iterrows()):
            score = calculate_score(
                doc,  # current restaurant
                query,
                vectorizer,
                tfidf_matrix,
                cuisine_preferences,
                facility_preferences,
                price_preferences,
            )
            yield score, doc_id, position

    # nlargest keeps a bounded heap internally and returns entries sorted from
    # the largest tuple to the smallest.
    ranked_results = heapq.nlargest(top_k, scored_rows())

    # Format the final results
    results = []
    for score, _doc_id, position in ranked_results:
        row = df.iloc[position]
        results.append({
            "restaurantName": row["restaurantName"],
            "Address": row["Address"],
            "Description": row["Description"],
            "website": row["website"],
            "custom_score": round(score, 3)
        })

    # Naming the columns keeps the header even when nothing was selected.
    results_df2 = pd.DataFrame(
        results,
        columns=["restaurantName", "Address", "Description", "website", "custom_score"],
    )
    return results_df2


In [None]:
from IPython.display import display

# Search inputs for the personalised ranking.
query = "modern seasonal cuisine"
cuisine_preferences = ["Italian", "French"]
facility_preferences = ["Terrace", "Air conditioning"]
# NOTE(review): these are compared against the scraped "Price Range" text; if
# the site uses another currency symbol they will never match - confirm
# against df["Price Range"].unique().
price_preferences = ["$$", "$$$"]
top_k = 5

results_df = ranked_restaurants(
    query,
    tfidf_matrix,
    vectorizer,
    df,
    top_k=top_k,
    cuisine_preferences=cuisine_preferences,
    facility_preferences=facility_preferences,
    price_preferences=price_preferences,
)

# Use display for a tabular view
display(results_df)



**The new scoring function improves the results because it incorporates additional variables such as the type of cuisine, the services available and the price range. This allows us to obtain results that are more relevant to the user's preferences, which would otherwise have been ignored in the original scoring function. For example, a restaurant that meets preferences in terms of cuisine and price, but has a less detailed description, is now considered more relevant.**

In [55]:
import requests

def get_region(adres, timeout=10):
    """Look up the region ("state" field) of an address via Nominatim.

    Args:
        adres: Free-form address string to geocode.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``state`` value of the first search hit, or ``None`` when the
        address is blank, the response status is not 200, there are no hits,
        or the hit carries no ``state``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    # Nothing to geocode; avoid spending a request on a blank query.
    if not adres or not str(adres).strip():
        return None

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': adres,
        'format': 'json',
        'addressdetails': 1,
        'limit': 1
    }
    headers = {
        'User-Agent': 'sezermzgl@gmail.com'
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    json_data = response.json()
    if not json_data:
        return None

    address = json_data[0].get('address', {})
    return address.get('state')


In [None]:
# Start from an empty column so that re-running the cell gives the same result.
df['Region'] = None

# Only the first five restaurants are geocoded here.
for index, row in df.head(5).iterrows():
    adres = f"{row['Address']}, {row['Postal Code']}, {row['City']}, {row['Country']}"
    region = get_region(adres)

    df.at[index, 'Region'] = region
    print(f"Processed {row['restaurantName']}: Region = {region}")

    # Nominatim's public usage policy allows at most one request per second.
    time.sleep(1)


In [None]:
# Preview the first rows of df, including the Region column.
df.head()