
## **1. Data Collection**
### **1.1. Get the list of Michelin restaurants**

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from IPython.display import display

In [None]:
# Browser-like User-Agent sent with every HTTP request: it makes the request
# look like it comes from a real web browser, which is intended to keep the
# server from blocking the crawler.
_BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

headers = {'User-Agent': _BROWSER_USER_AGENT}

def guide_michelin(num_pages=100, timeout=30):
    """Collect the restaurant URLs listed in the Italian Michelin Guide.

    Requests the listing pages
    ``https://guide.michelin.com/en/it/restaurants/page/<n>`` for
    ``n = 1 .. num_pages`` using the module-level ``headers`` and gathers
    every ``<a href>`` found inside the restaurant-list container whose
    absolute URL contains ``/restaurant/``.

    Pages that cannot be fetched (network error or non-200 status) are
    reported with ``print`` and skipped; pages without the list container
    are skipped silently.

    Original author's note on this function: ``2037`` (presumably the
    number of links collected at the time -- unverified).

    Parameters
    ----------
    num_pages : int, optional
        Number of listing pages to visit. Defaults to 100, the value that
        used to be hard-coded.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a
        stalled connection cannot hang the crawl forever.

    Returns
    -------
    list of str
        Unique restaurant URLs, in order of first appearance.
    """
    base_url = "https://guide.michelin.com"
    links = []
    seen = set()  # constant-time duplicate check; `links` keeps the order

    for page in range(1, num_pages + 1):
        link = "{}/en/it/restaurants/page/{}".format(base_url, page)
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f"{e} \n {link}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        section = soup.find('div', class_="row restaurant__list-row js-restaurant__list_items")
        if section is None:
            continue

        for a_tag in section.find_all('a', href=True):
            href = base_url + a_tag['href']
            # Only restaurant detail pages are of interest, each one once.
            if "/restaurant/" in href and href not in seen:
                seen.add(href)
                links.append(href)

    return links

# Crawl the listing pages once and report how many distinct restaurant
# links were gathered.
url_set = guide_michelin()
link_count = len(url_set)
print(link_count)

In [3]:
# Persist the collected URLs to links.txt, one URL per line, so the crawl
# step can be run without repeating the listing scrape.
with open('links.txt', 'w') as links_file:
    links_file.write(''.join(url + '\n' for url in url_set))

### **1.2. Crawl Michelin restaurant pages**

In [4]:
# Download every URL listed in links.txt and store it as
# pages/page_<n>/document_<index>.html, where <index> is the position of the
# URL in links.txt and each page_<n> folder holds DOCS_PER_DIRECTORY documents.
DOCS_PER_DIRECTORY = 20  # number of consecutive documents grouped per folder
REQUEST_TIMEOUT = 30     # seconds to wait for a response before giving up

os.makedirs('pages', exist_ok=True)

with open('links.txt', 'r') as f:
    urls = f.read().splitlines()

failed_downloads = 0
for index, url in enumerate(urls):
    page_number = index // DOCS_PER_DIRECTORY + 1
    directory = os.path.join('pages', f'page_{page_number}')
    os.makedirs(directory, exist_ok=True)
    try:
        # Without a timeout a single stalled connection would block the
        # whole crawl indefinitely.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            file_path = os.path.join(directory, f'document_{index}.html')
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
        else:
            failed_downloads += 1
            print(f"Failed to retrieve {url}")
    except (requests.RequestException, OSError) as e:
        # Best effort: report the problem and move on to the next URL.
        failed_downloads += 1
        print(f"Error fetching {url}: {e}")

# Only claim success when every page was actually saved.
if failed_downloads:
    print(f"Finished with {failed_downloads} of {len(urls)} downloads failed.")
else:
    print("HTML documents saved successfully.")

In [5]:
# Build the path of every entry found under pages/ and show how many there are.
page_entries = os.listdir('pages')
dir_paths = [os.path.join('pages', entry) for entry in page_entries]
len(dir_paths)

100

### **1.3. Parse downloaded pages**

In [17]:
def extract_restaurant_details(content):
    """Extract the details of one restaurant from its parsed HTML page.

    Every field degrades to an empty value (``""`` or ``[]``) when the
    corresponding element is missing from the page, so an incomplete page
    never raises.

    Parameters
    ----------
    content : bs4.BeautifulSoup or bs4.Tag
        Parsed restaurant page; only ``find`` and ``find_all`` are used.

    Returns
    -------
    dict
        Keys ``restaurantName``, ``address``, ``city``, ``postalCode``,
        ``country``, ``priceRange``, ``cuisineType``, ``description``,
        ``phoneNumber`` and ``website`` map to strings;
        ``facilitiesServices`` and ``creditCards`` map to lists of strings.
    """

    def _text_or_blank(node):
        # ``find`` returns None for a missing element; map that to "".
        return node.get_text(strip=True) if node is not None else ""

    # Restaurant name
    name = _text_or_blank(content.find('h1', class_='data-sheet__title'))

    # The two "basic information" rows share the same CSS class: the first
    # holds the comma-separated location, the second price and cuisine.
    info_blocks = content.find_all("div", class_="data-sheet__block--text")
    first_row = _text_or_blank(info_blocks[0]) if len(info_blocks) > 0 else ""
    second_row = _text_or_blank(info_blocks[1]) if len(info_blocks) > 1 else ""

    # Location is read from the right: ..., city, postal code, country.
    # Whatever precedes those three parts is the street address.
    location_parts = [part.strip() for part in first_row.split(",")]
    address = " ".join(location_parts[:-3]) if len(location_parts) > 3 else ""
    city = location_parts[-3] if len(location_parts) > 2 else ""
    postalCode = location_parts[-2] if len(location_parts) > 1 else ""
    country = location_parts[-1] if location_parts else ""

    # Price range and cuisine type are separated by a middle dot.
    category_parts = [part.strip() for part in second_row.split("·")]
    priceRange = category_parts[0] if category_parts else ""
    cuisineType = category_parts[1] if len(category_parts) > 1 else ""

    # Description
    description = _text_or_blank(content.find("div", class_="data-sheet__description"))

    # Facilities and services: the <li> items of the first matching column
    columns = content.find_all("div", class_="col col-12 col-lg-6")
    facilitiesServices = (
        [li.get_text(strip=True) for li in columns[0].find_all("li")] if columns else []
    )

    # Credit cards: the card name is the part of the icon file name before
    # the first "-"; images without a data-src attribute are skipped.
    cards_div = content.find("div", class_="restaurant-details__services--info")
    creditCards = []
    if cards_div is not None:
        for img in cards_div.find_all("img"):
            icon_src = img.get("data-src")
            if icon_src:
                creditCards.append(os.path.basename(icon_src).split("-")[0])

    # Phone number
    phoneNumber = _text_or_blank(
        content.find("span", attrs={"x-ms-format-detection": "none"})
    )

    # Website
    website_div = content.find("div", class_="collapse__block-item link-item")
    website_link = (
        website_div.find("a", class_="link js-dtm-link") if website_div is not None else None
    )
    website = website_link.get("href", "") if website_link is not None else ""

    return {
        "restaurantName": name,
        "address": address,
        "city": city,
        "postalCode": postalCode,
        "country": country,
        "priceRange": priceRange,
        "cuisineType": cuisineType,
        "description": description,
        "facilitiesServices": facilitiesServices,
        "creditCards": creditCards,
        "phoneNumber": phoneNumber,
        "website": website
    }

# Parse every downloaded HTML file under pages/ into one record per
# restaurant and assemble the records into a DataFrame.
RESTAURANT_COLUMNS = [
    "restaurantName", "address", "city", "postalCode", "country",
    "priceRange", "cuisineType", "description", "facilitiesServices",
    "creditCards", "phoneNumber", "website",
]

# os.listdir returns entries in arbitrary order and may include stray files
# (not only the page_<n> folders), so keep directories only and sort them to
# make the row order reproducible between runs.
dir_paths = sorted(
    path
    for path in (os.path.join('pages', entry) for entry in os.listdir('pages'))
    if os.path.isdir(path)
)

data = []
for dir_path in dir_paths:  # not named `dir`, which would clobber the builtin
    for html_file in sorted(os.listdir(dir_path)):
        if not html_file.endswith(".html"):
            continue
        with open(os.path.join(dir_path, html_file), "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
            restaurant_details = extract_restaurant_details(soup)
        data.append(restaurant_details)

# Passing `columns` fixes the column order and also yields a correctly
# shaped empty frame when no HTML file was found.
df = pd.DataFrame(data, columns=RESTAURANT_COLUMNS)


In [20]:
# Show the parsed restaurants table using IPython's rich display
# (`display` is imported from IPython.display at the top of the notebook).
display(df)

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,San Lorenzo,piazza Sordini 6,Spoleto,06049,Italy,€€,"Seafood, Traditional Cuisine","Situated within the Clitunno hotel, this well-...","[Air conditioning, Terrace, Wheelchair access]","[amex, mastercard, visa]",+39 0743 223340,https://www.ristorantesanlorenzo.com/it/
1,Dalla Libera,via Farra 52,Sernaglia della Battaglia,31020,Italy,€,"Country cooking, Seasonal Cuisine","At this restaurant, an American-style barbecue...","[Car park, Interesting wine list, Terrace]","[amex, mastercard, visa]",+39 0438 966295,http://www.trattoriadallalibera.it
2,La Notizia 53,via Caravaggio 53/55,Naples,80126,Italy,€,Pizza,Although situated away from the tourist centre...,[Air conditioning],"[amex, maestrocard, mastercard, visa]",+39 081 714 2155,http://www.pizzarialanotizia.com
3,Madonnina del Pescatore,via Lungomare Italia 11,Marzocca,60019,Italy,€€€€,"Creative, Contemporary",It’s now forty years since the Madonnina opene...,"[Air conditioning, Great view, Interesting win...","[amex, mastercard, visa]",+39 071 698267,https://www.morenocedroni.it/
4,Da Vincenzo,viale Pasitea 172/178,Positano,84017,Italy,€€,"Campanian, Traditional Cuisine",It's never easy to find the right balance betw...,"[Air conditioning, Terrace]","[amex, mastercard, visa]",+39 089 875128,https://www.davincenzo.it/
...,...,...,...,...,...,...,...,...,...,...,...,...
1978,Foresta,via Litoranea 2,Marina di Pisa,56128,Italy,€€€,"Seafood, Classic Cuisine","Overlooking the Tyrrhenian Sea, all the tables...","[Air conditioning, Great view, Terrace, Wheelc...","[amex, mastercard, visa]",+39 050 35082,https://www.ristoranteforesta.com/
1979,Oberraindlhof,Raindl 49,Madonna di Senales,39020,Italy,€€,"Traditional Cuisine, Regional Cuisine",The family that run this restaurant occupying ...,"[Car park, Great view, Interesting wine list, ...","[amex, mastercard, visa]",+39 0473 679131,https://www.oberraindlhof.com/it/buongustaio
1980,Antica Trattoria Gianna,via Maggiore 12,Recorfano,26030,Italy,€,"Lombardian, Country cooking","A delightful family trattoria offering simple,...","[Air conditioning, Terrace]","[amex, dinersclub, mastercard, visa]",+39 0375 98351,
1981,Cavallino,corso Romita 83,Tortona,15057,Italy,€€€,Modern Cuisine,Three talented young entrepreneurs who are pas...,"[Air conditioning, Car park, Interesting wine ...","[amex, mastercard, visa]",+39 0131 862308,http://www.cavallino-tortona.it


In [19]:
# Write one single-row TSV file per DataFrame row:
# tsv_files/restaurant_<row label>.tsv (header line + one data line).
TSV_DIR = 'tsv_files'

if not os.path.exists(TSV_DIR):
    os.makedirs(TSV_DIR)

for row_label, record in df.iterrows():
    target_path = os.path.join(TSV_DIR, f'restaurant_{row_label}.tsv')
    # Wrapping the row Series in a list gives a one-row frame whose columns
    # are the original column names.
    pd.DataFrame([record]).to_csv(target_path, sep='\t', index=False)
    