In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep

## Scraping

In [4]:
def scrape_page(page_n):
    """Scrape one page of the Quandoo Helsinki listing into a DataFrame.

    Parameters
    ----------
    page_n : int
        Page number substituted into the listing URL's ``page`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per merchant card found on the page, with the columns
        'Name', 'Location', 'Cuisine', 'Meals', 'Price Level (out of 4)',
        'Rating (out of 6)', 'Review Count' and 'Page URL'. Meals, price
        level, rating and review count are None when the card lacks the
        corresponding element.

    Raises
    ------
    AttributeError, TypeError
        If a card is missing its name, location, cuisine or link element
        (these are treated as mandatory).
    """
    url = f"https://www.quandoo.fi/en/helsinki?districtFilter=3637&bookable=true&onlySpecialOffers=false&page={page_n}"
    # Without a timeout a stalled connection would block the scraping loop forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    rest_cards = soup.find_all(attrs={"data-qa": "merchant-card"})

    rest_names = [card.find('h3').text.strip() for card in rest_cards]

    rest_locations = [card.find(attrs={"data-qa": "merchant-location"}).text.strip() for card in rest_cards]
    # Strip the "Located at ... area" wrapper. Both parts are anchored to the
    # ends of the text so a district name that merely contains "area" is not
    # mangled, and no stray whitespace is left behind.
    rest_locations = [
        re.sub(r'\s*area$', '', re.sub(r'^Located at\s*', '', location)).strip()
        for location in rest_locations
    ]

    rest_cuisines = [card.find(attrs={"data-qa": "merchant-card-cuisine"}).text.strip() for card in rest_cards]
    # Drop the redundant " Restaurant" word from the cuisine label.
    rest_cuisines = [cuisine.replace(" Restaurant", "") for cuisine in rest_cuisines]

    rest_meals_boxes = [card.find(attrs={"data-qa": "merchant-meal"}) for card in rest_cards]
    rest_meals = [re.sub(r'^\s*Meals:\s*', '', box.text.strip()) if box else None for box in rest_meals_boxes]

    # The score text has the form "<score>/<max>"; only the part before the slash is kept.
    rest_rating_boxes = [card.find(attrs={"data-qa": "reviews-score"}) for card in rest_cards]
    rest_ratings = [float(box.text.split('/')[0].strip()) if box else None for box in rest_rating_boxes]

    # NOTE(review): this selector and the price one below rely on generated CSS
    # class names; the sampled rows shown in this notebook have an empty review
    # count and a price level of 0, so the selectors may be stale - verify.
    rest_review_boxes = [card.find(class_="sc-1atis9w-3 dfyExP") for card in rest_cards]
    rest_review_counts = [int(box.text.split()[0].strip()) if box else None for box in rest_review_boxes]

    rest_price_boxes = [card.find(class_=re.compile(r'.*price-indicator')) for card in rest_cards]
    # A card without a price indicator yields None instead of raising on None.find_all.
    rest_price_levels = [
        len(box.find_all(class_=re.compile(r'.*oGCHK'))) if box else None
        for box in rest_price_boxes
    ]

    rest_page_urls = [card.find('a')['href'] for card in rest_cards]

    df = pd.DataFrame({'Name': rest_names,
                    'Location': rest_locations,
                    'Cuisine': rest_cuisines,
                    'Meals': rest_meals,
                    'Price Level (out of 4)': rest_price_levels,
                    'Rating (out of 6)': rest_ratings,
                    'Review Count': rest_review_counts,
                    'Page URL': rest_page_urls
                    })

    return df

In [4]:
# Scrape listing pages 1-14. The per-page frames are collected first and
# concatenated once, which avoids re-copying the growing table on every page.
page_frames = [scrape_page(page_n) for page_n in range(1, 15)]
restaurant_data = pd.concat(page_frames, ignore_index=True)

restaurant_data

NameError: name 'scrape_page' is not defined

In [6]:
# Persist the scraped listing as a tab-separated file (the DataFrame index is written as the first column).
restaurant_data.to_csv("restaurant_data.csv", sep="\t")

## Review Scraping

In [2]:
# Reload the scraped listing from disk (first column is the saved index) and preview ten random rows.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
109,Ravintola Piilo,Hietalahti,Italian,"Breakfast, Lunch, Brunch, Dinner",0,4.7,,/en/place/ravintola-piilo-91693/about
335,Bistro Telakka,Lauttasaari,International,"Lunch, Dessert, Dinner",0,5.1,,/en/place/bistro-telakka-92514/menu
64,Kissakahvila Helkatti,Kamppi,Eat & Drink,"Lunch, Cake & Coffee",0,5.0,,/en/place/kissakahvila-helkatti-106475/menu
239,Vietologie,Töölö,Vietnamese,"Lunch, Dinner",0,,,/en/place/vietologie-109409/menu
21,Ravintola Muru,City Centre,French,Dinner,0,5.4,,/en/place/ravintola-muru-9646/menu
156,Bistro Palo,Malmi,International,"Lunch, Dinner",0,5.0,,/en/place/bistro-palo-96188/menu
217,Aito Fresh,City Centre,Asian Fusion,"Lunch, Dinner",0,4.7,,/en/place/aito-fresh-104403/menu
277,Ravintola Rara,Pikku Huopalahti,Nepalese,"Lunch, Dinner",0,5.7,,/en/place/ravintola-rara-97770/menu
233,Chao Phraya Helsinki - Thai Restaurant,Hietalahti,Thai,"Lunch, Dinner",0,4.0,,/en/place/chao-phraya-helsinki-thai-restaurant...
232,Amex Exclusive Lunch: Pastis,Kaartinkaupunki,French,,0,5.0,,/en/place/amex-exclusive-lunch-pastis-104118/a...


In [4]:
def get_reviews_per_rest(rest_name, rel_path):
    """Collect the review scores and texts of one restaurant.

    Parameters
    ----------
    rest_name : str
        Restaurant name, copied into the 'Restaurant' column of the result.
    rel_path : str
        Site-relative page path as stored in the 'Page URL' column; its last
        segment is replaced by ``reviews``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Restaurant', 'Review Score' and 'Review Text', one row per
        review block found across all review pages. A review block without a
        score element gets a None score, and one without a description
        element gets an empty text, so both columns always stay aligned.
    """
    review_texts, review_scores = [], []
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/reviews"
    # Without a timeout a stalled connection would block the whole scraping run.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The text of the second-to-last pagination button is used as the page
    # count; with three or fewer buttons a single page is assumed.
    pagination_btns = soup.find_all('button', attrs={"data-qa": "horizontal-filter-button"})
    n_pages = pagination_btns[-2].text if len(pagination_btns) > 3 else None
    n_pages = int(n_pages) if n_pages else 1

    page_url += "?reviewPage="
    for i in range(1, n_pages + 1):
        response = requests.get(page_url + str(i), timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Score and text are read from the same block in one pass so that a
        # review lacking either element cannot shift the two lists apart.
        for block in soup.find_all('div', attrs={"data-name": "shared-review"}):
            score_tag = block.find('span', attrs={"data-qa": "review-score"})
            text_tag = block.find('p', attrs={"data-qa": "review-description"})
            # Score text has the form "<score>/<max>".
            score = int(score_tag.text.split('/')[0]) if score_tag is not None else None
            review_scores.append(score)
            review_texts.append(text_tag.text if text_tag is not None else "")
    print(rest_name, "–", len(review_scores))
    return pd.DataFrame({'Restaurant': rest_name,
                         'Review Score': review_scores,
                         'Review Text': review_texts})

In [None]:
# Scrape the reviews of every restaurant, then concatenate once; growing the
# table with concat inside the loop would re-copy all previous rows each time.
review_frames = [
    get_reviews_per_rest(row['Name'], row['Page URL'])
    for _, row in data.iterrows()
]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
review_data = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

review_data.to_csv("review_data.csv", sep="\t")

## Menu highlight Scraping

In [11]:
# Reload the scraped listing from disk (first column is the saved index) and preview ten random rows.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
164,Pho Nokis,Kamppi,Vietnamese,"Lunch, Dinner",0,5.5,,/en/place/pho-nokis-100857/menu
236,Mashiro Töölö,Töölö,Sushi,"Buffet, Dinner",0,5.0,,/en/place/mashiro-toolo-87853/about
286,Satama Bar & Bistro Herttoniemi,Herttoniemi,International,"Lunch, Dessert, Dinner",0,4.5,,/en/place/satama-bar-bistro-herttoniemi-108342...
122,Relove Freda,Punavuori,European,"Lunch, Dessert, Brunch",0,5.4,,/en/place/relove-freda-95229/menu
289,Akhanda Nepalilainen Ravintola,Pitäjänmäki,Nepalese,"Lunch, Dinner",0,5.0,,/en/place/akhanda-nepalilainen-ravintola-10388...
72,Mamma Rosa,Töölö,International,"Lunch, Dinner",0,5.0,,/en/place/mamma-rosa-24808/menu
171,Ravintola Lukla,Töölö,Nepalese,"Lunch, Dinner",0,5.5,,/en/place/ravintola-lukla-96005/menu
166,Lopez Tacos Kamppi,Kamppi,Mexican,"Lunch, Dinner",0,4.8,,/en/place/lopez-tacos-kamppi-105479/menu
55,The Tart,Kaartinkaupunki,International,"Lunch, Dessert, Dinner",0,5.2,,/en/place/the-tart-108248/menu
337,Black Sea Kitchen,Kaartinkaupunki,Georgian,"Lunch, Dinner",0,,,/en/place/black-sea-kitchen-109077/about


In [84]:
def get_menu_highlights(rest_name,rel_path):
    """Scrape the menu page of one restaurant into a table of dishes.

    Parameters
    ----------
    rest_name : str
        Restaurant name, copied into the 'Restaurant' column.
    rel_path : str
        Site-relative page path as stored in the 'Page URL' column; its last
        segment is replaced by ``menu``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Restaurant', 'Food restrictions' and 'Menu', one row per
        named dish. 'Menu' has the form "<category> | <dish>" with
        " : <description>" appended when a description exists. The three
        columns are present even when no dish was found.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/menu"
    # Without a timeout a stalled connection would block the whole scraping run.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The restriction tags are page-level, so every dish row shares the same list.
    food_tags = extract_food_restriction_tags(soup)

    items = []

    sections = soup.find_all("div", attrs={"data-name": "menu-section"})
    for section in sections:
        cat_tag = section.select_one('h4[data-qa^="menu-category-name"]')
        category = cat_tag.get_text(strip=True) if cat_tag else ""

        for dish_div in section.select('div[data-qa*="item-"]'):
            name_tag = dish_div.select_one("h5")
            desc_tag = dish_div.select_one("p[data-qa*='item-description']")

            name = name_tag.get_text(strip=True) if name_tag else ""
            desc = desc_tag.get_text(strip=True) if desc_tag else ""

            # Matched elements without a dish name are skipped.
            if not name:
                continue

            menu_text = f"{category} | {name}"
            if desc:
                menu_text += f" : {desc}"

            items.append({
                "Restaurant": rest_name,
                "Food restrictions": food_tags,
                "Menu": menu_text
            })

    # Explicit columns keep the schema stable for restaurants with an empty menu.
    return pd.DataFrame(items, columns=["Restaurant", "Food restrictions", "Menu"])

def extract_food_restriction_tags(soup):
    """Return the food-restriction labels listed on a parsed menu page.

    Looks up the ``div`` marked ``data-qa="food-restriction-tags"`` and
    collects the stripped text of each ``p`` inside it, skipping empty
    entries and entries whose text starts with "includes" (case-insensitive).
    Returns an empty list when the container is absent.
    """
    container = soup.find("div", attrs={"data-qa": "food-restriction-tags"})
    if not container:
        return []

    labels = (paragraph.text.strip() for paragraph in container.find_all("p"))
    return [
        label
        for label in labels
        if label and not label.lower().startswith("includes")
    ]

In [None]:
# Scrape every restaurant's menu page; each call yields one frame of dishes.
all_menus = [
    get_menu_highlights(row["Name"], row["Page URL"])
    for _, row in data.iterrows()
]

menu_df = pd.concat(all_menus, ignore_index=True)

# For readability of the exported file, the restaurant name and its
# restriction tags are kept only on the first row of each consecutive run;
# rows that repeat the previous row's restaurant get those two cells blanked.
restaurant_col = menu_df['Restaurant']
repeats_previous = restaurant_col.eq(restaurant_col.shift(1))
clean_view = menu_df.copy()
clean_view.loc[repeats_previous, ['Restaurant', 'Food restrictions']] = ''

clean_view.to_csv("menu_highlights.csv", sep="\t", index=False)



## Restaurants' pictures scraping

In [5]:
# Reload the scraped listing from disk (first column is the saved index) and preview ten random rows.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
27,Ravintola Lehtovaara,Töölö,International,"Lunch, Dinner",0,5.4,,/en/place/ravintola-lehtovaara-11619/menu
42,Alfons Pizza,Ullanlinna,Pizza,Dinner,0,5.3,,/en/place/alfons-pizza-102482/menu
265,Ravintola Veturitallit,Pasila,European,"Dessert, Dinner",0,4.3,,/en/place/ravintola-veturitallit-106147/about
112,Bröd Punavuori,Punavuori,Scandinavian,"Breakfast, Lunch, Dessert, Dinner",0,4.9,,/en/place/brod-punavuori-63308/menu
180,La Galleria,Kruununhaka,Pizza,Dinner,0,5.0,,/en/place/la-galleria-109402/about
29,Piccola Trattoria Kalasatama,Kalasatama,Italian,"Lunch, Dinner",0,5.5,,/en/place/piccola-trattoria-kalasatama-100346/...
0,Luovuus kukkii kaaoksesta,Kaartinkaupunki,International,Dinner,0,5.8,,/en/place/luovuus-kukkii-kaaoksesta-90397/menu
313,Ravintola Makasiini - Grand Marina,Katajanokka,Scandinavian,"Breakfast, Dessert, Dinner",0,4.5,,/en/place/ravintola-makasiini-scandic-grand-ma...
58,Ravintola Santa Fé Helsinki,City Centre,Mexican,"Lunch, Dinner",0,5.1,,/en/place/ravintola-santa-fe-30462/menu
171,Ravintola Lukla,Töölö,Nepalese,"Lunch, Dinner",0,5.5,,/en/place/ravintola-lukla-96005/menu


In [None]:
from urllib.parse import urlparse, urljoin
import json
    
def extract_restaurant_images(rest_name,rel_path):
    """Collect the image URLs published on a restaurant's photos page.

    The URLs are read from the page's ``application/ld+json`` script blocks.
    Only ``ImageObject`` entries whose ``contentURL`` contains
    "qul.imgix.net" are kept, with any query string removed.

    Parameters
    ----------
    rest_name : str
        Restaurant name, echoed back in the result.
    rel_path : str
        Site-relative page path as stored in the 'Page URL' column; its last
        segment is replaced by ``photos``.

    Returns
    -------
    dict
        ``{"restaurant": rest_name, "images": [url, ...]}`` with duplicates
        removed and first-seen order preserved.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/photos#content"
    # Without a timeout a stalled connection would block the whole scraping run.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    images = []

    for script in soup.find_all("script", type="application/ld+json"):
        raw_json = script.string
        if not raw_json:
            # Nothing to parse in this tag; skip it quietly rather than
            # reporting a JSON problem for it.
            continue
        try:
            # Named `payload` so it does not reuse the name of the notebook's
            # `data` table.
            payload = json.loads(raw_json)

            if isinstance(payload, dict):
                items = payload.get("itemListElement", [])
            elif isinstance(payload, list):
                items = payload
            else:
                continue

            for entry in items:
                if not isinstance(entry, dict):
                    continue
                img_data = None

                # Pattern 1: {"@type": "ListItem", "item": {"@type": "ImageObject", ...}}
                if "item" in entry:
                    item_content = entry["item"]
                    if isinstance(item_content, dict) and item_content.get("@type") == "ImageObject":
                        img_data = item_content

                # Pattern 2: direct ImageObject in the list
                elif entry.get("@type") == "ImageObject":
                    img_data = entry

                if img_data:
                    url = img_data.get("contentURL")
                    if url and "qul.imgix.net" in url:
                        # Drop the query string so the same image with
                        # different parameters collapses to a single URL.
                        images.append(url.split("?")[0])

        except (json.JSONDecodeError, TypeError, AttributeError) as e:
            # Best effort: a malformed block is reported and the next one is tried.
            print(f"JSON parsing issue in one script: {e}")
            continue

    # dict keys are unique and keep insertion order, so this de-duplicates
    # while preserving the order in which the images were first seen.
    unique_images = list(dict.fromkeys(images))

    return {
        "restaurant": rest_name,
        "images": unique_images,
    }


In [None]:
# One record per restaurant: its name and the scraped image URLs joined
# with ", " (an empty string when no image was found).
all_results = [
    {
        "Restaurant": name,
        "Images": ", ".join(extract_restaurant_images(name, rel_path)["images"]),
    }
    for name, rel_path in zip(data["Name"], data["Page URL"])
]

df_pictures = pd.DataFrame(all_results)
df_pictures.to_csv("restaurant_pictures.csv", sep="\t", index=False)

## Restaurants' addresses Scraping

In [45]:
# Reload the scraped listing from disk (first column is the saved index) and preview ten random rows.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
data.sample(10)

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL
110,The Tower - Wine & Craft Beer,Pasila,Scandinavian,"Lunch, Dinner",0,4.7,,/en/place/the-tower-wine-craft-beer-85892/menu
30,Oishi 18 Katajanokka,Katajanokka,Sushi,"Lunch, Dinner",0,5.3,,/en/place/oishi-18-katajanokka-95412/menu
215,Harbour Tap & Taste,Kalasatama,Finnish,"Lunch, Dinner, Sunday lunch",0,5.8,,/en/place/harbour-tap-taste-108467/menu
24,Lappi Ravintola,City Centre,Finnish,"Dessert, Dinner",0,5.3,,/en/place/lappi-ravintola-9753/menu
45,Lie Mi Kallio,Kallio,Vietnamese,"Lunch, Dinner",0,5.2,,/en/place/lie-mi-kallio-64811/about
88,Relove Stockmann Helsinki,City Centre,European,"Breakfast, Lunch, Dessert, Brunch",0,4.9,,/en/place/relove-stockmann-helsinki-95230/menu
330,Wave Of Flavors,Vallila,Portuguese,"Lunch, Dessert, Dinner",0,5.1,,/en/place/waves-of-flavors-105468/menu
338,Mad Finn Brewing Co. Taproom Helsinki,Sompasaari,Pizza,Dinner,0,6.0,,/en/place/mad-finn-brewing-co-taproom-helsinki...
341,m/s King – Royal Line,Vuosaari,Scandinavian,"Buffet, Dinner",0,5.0,,/en/place/ms-king-royal-line-102408/menu
131,Casa Haga,Haaga,Spanish,"Tapas, Dessert, Dinner",0,4.7,,/en/place/casa-haga-100321/menu


In [None]:
def get_restaurant_address(rest_name,rel_path):
    """Scrape the street address from a restaurant's "about" page.

    Parameters
    ----------
    rest_name : str
        Restaurant name, echoed back in the result.
    rel_path : str
        Site-relative page path as stored in the 'Page URL' column; its last
        segment is replaced by ``about``.

    Returns
    -------
    dict
        ``{"Restaurant": rest_name, "Address": address}`` where ``address``
        is the comma-joined text of the spans of the first matching
        paragraph, or None when no paragraph looks like an address. A dict
        is returned in both cases so callers can always build a table row.
    """
    page_url = "https://www.quandoo.fi" + '/'.join(rel_path.split('/')[:-1]) + "/about#content"
    # Without a timeout a stalled connection would block the whole scraping run.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the paragraph is located through generated CSS class
    # names, which may change when the site is rebuilt - verify if results
    # come back empty.
    for p_tag in soup.find_all('p', class_='sc-bdnylu jqzuJW'):
        text_content = p_tag.get_text()
        # Heuristic: the address paragraph mentions the city or the 00100 postcode.
        if '00100' in text_content or 'Helsinki' in text_content:
            address_lines = [span.get_text(strip=True) for span in p_tag.find_all('span')]
            return {
                "Restaurant": rest_name,
                "Address": ', '.join(address_lines)
            }

    # No candidate paragraph matched.
    return {
        "Restaurant": rest_name,
        "Address": None
    }

  

In [None]:
all_addresses = []

for _, row in data.iterrows():
    rest_name = row["Name"]
    page_url  = row["Page URL"]
    result = get_restaurant_address(rest_name, page_url)
    # Keep exactly one row per restaurant even if the lookup produced nothing:
    # a None entry cannot be turned into a table row and would leave the
    # restaurant without a match in the later address join.
    if result is None:
        result = {"Restaurant": rest_name, "Address": None}
    all_addresses.append(result)

# Explicit columns keep the header stable even for an empty result list.
df = pd.DataFrame(all_addresses, columns=["Restaurant", "Address"])
df.to_csv("restaurant_addresses.csv", sep="\t", index=False)

## Translations

In [2]:
from data_processing import translate_batch, translate_list

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nesterenkojul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1638.51it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [None]:
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
reviews = pd.read_csv("review_data.csv", sep="\t", index_col=0)

# Maps each restaurant name to all of its translated reviews joined by newlines.
translated_review_data = {}
for name in data["Name"]:
    rest_reviews = reviews.loc[reviews["Restaurant"] == name, "Review Text"]
    translated_review_data[name] = '\n'.join(translate_batch(rest_reviews))

In [None]:
# One row per restaurant: its name and the newline-joined translated reviews.
translated_reviews_table = pd.DataFrame(
    list(translated_review_data.items()),
    columns=["Restaurant", "Reviews"],
)
translated_reviews_table.to_csv("translated_review_data.csv", sep="\t")

In [3]:
reviews = pd.read_csv("review_data.csv", sep="\t", index_col=0)
# Flatten line breaks inside each review. The vectorised .str accessor leaves
# missing texts (NaN after the CSV round trip) untouched, whereas calling
# str.replace on them through apply would raise AttributeError.
reviews['Review Text'] = reviews['Review Text'].str.replace('\n', ' ', regex=False)
reviews['Review Text Eng'] = translate_list(reviews['Review Text'].values)
reviews

Unnamed: 0,Restaurant,Review Score,Review Text,Review Text Eng
0,Luovuus kukkii kaaoksesta,6.0,The place was captivating: the flavors were de...,The place was captivating: the flavors were de...
1,Luovuus kukkii kaaoksesta,6.0,Luovuus Kukkii Kaaoksesta is a perfect mix of ...,Creativity blooms from chaos is a perfect mix ...
2,Luovuus kukkii kaaoksesta,6.0,The restaurant has a great atmosphere and serv...,The restaurant has a great atmosphere and serv...
3,Luovuus kukkii kaaoksesta,6.0,Olimme ensimmäistä kertaa tässä ravintolassa j...,It was our first time in this restaurant and i...
4,Luovuus kukkii kaaoksesta,6.0,Kaupungin paras isänpäiväillallinen tänäkin vu...,"Best father's Day dinner in town this year, to..."
...,...,...,...,...
58770,m/s King – Royal Line,5.0,Parasta että risteily lähti vuosaaresta. Ruoka...,Best the cruise left Vuosaari. The food was go...
58771,m/s King – Royal Line,5.0,"Kiitos, ylitti reissu odotukset. Myös japanila...","Thank you, exceeded your trip expectations. My..."
58772,m/s King – Royal Line,5.0,Ihana kesäinen menu! Todella maukasta kalaa ja...,Lovely summer menu! Really tasty fish and the ...
58773,m/s King – Royal Line,5.0,Hyvä ja runsas buffet pöytä; maisemat oli hyvä...,A good and hearty buffet table; the scenery wa...


In [4]:
# Save the per-review table, including the new 'Review Text Eng' column, as a tab-separated file.
reviews.to_csv("translated_review_data.csv", sep="\t")

In [5]:
# Forward-filling restores the restaurant name and restriction tags on the
# rows where the export blanked them; assigning the result avoids mutating
# the loaded frame in place.
menus = pd.read_csv("menu_highlights.csv", sep="\t").ffill()
# Flatten line breaks; .str.replace tolerates missing values where a plain
# str.replace call through apply would raise.
menus['Menu'] = menus['Menu'].str.replace('\n', ' ', regex=False)
menus['Menu Eng'] = translate_list(menus['Menu'].values)
menus

Unnamed: 0,Restaurant,Food restrictions,Menu,Menu Eng
0,Luovuus kukkii kaaoksesta,"['Gluten-free', 'Vegan', 'Vegetarian']",Mains | Päivän menu : Valitse 4 annosta listal...,Mains / menu of the day: choose 4 servings fro...
1,Ravintola MyStuu,"['Gluten-free', 'Vegan', 'Vegetarian']",Starters | Pieni lämmin Gruyère -leipä,Starters / small warm Gruyère bread
2,Ravintola MyStuu,"['Gluten-free', 'Vegan', 'Vegetarian']",Starters | Makumatka Alpeille : sveitsiläiset ...,Starters | tasting trip to the Alps: Swiss che...
3,Ravintola MyStuu,"['Gluten-free', 'Vegan', 'Vegetarian']","Starters | Pekoni Flammkuchen : creme fraiche,...","Starters | Bacon Flammkuchen: creme fraiche, b..."
4,Ravintola MyStuu,"['Gluten-free', 'Vegan', 'Vegetarian']",Fondue | Juustofondue : Autenttinen sveitsiläi...,Fondue / cheese fondue: authentic Swiss cheese...
...,...,...,...,...
1777,Mad Finn Brewing Co. Taproom Helsinki,"['Gluten-free', 'Vegetarian']",Pizza | Cheezy Pepperoni : FIN Valkoinen juust...,Pizza | Cheezy Pepperoni: FIN white cheese sau...
1778,Kahvila Mutteri,"['Vegan', 'Vegetarian']",Mains | Mutterin Wine & Paint : Kuukausittaine...,Mains / nut Wine & Paint : a monthly painting ...
1779,Kahvila Mutteri,"['Vegan', 'Vegetarian']",Wine | Mutterin Wine Tastings : Mutterin viini...,Wine / nut Wine Tastings: nut wine tastings ar...
1780,m/s King – Royal Line,"['Gluten-free', 'Vegan', 'Vegetarian']",Chef's Recommendations | Noutopöytä : Aikuinen...,Chef's Recommendations / Buffet: adult 30€chil...


In [6]:
# Save the menu table, including the new 'Menu Eng' column, as a tab-separated file.
menus.to_csv("translated_menu_data.csv", sep="\t")

## Data enrichment

In [None]:
# Load the three scraped tables that are merged in this section: the listing
# (saved with its index as first column), the addresses and the picture URLs.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)
addresses = pd.read_csv("restaurant_addresses.csv", sep="\t")
images = pd.read_csv("restaurant_pictures.csv", sep="\t")

In [10]:
# Look up each restaurant's address by name. Building the lookup once avoids
# rescanning the address table for every row; the first entry wins for
# duplicated names, and a restaurant with no scraped address gets NaN instead
# of raising IndexError.
address_by_name = addresses.drop_duplicates("Restaurant").set_index("Restaurant")["Address"]
data['Address'] = data['Name'].map(address_by_name)

In [14]:
def get_first_img(restaurant):
    """Return the first scraped image URL of ``restaurant``, or None.

    Reads the notebook-level ``images`` table (columns 'Restaurant' and
    'Images', the latter a comma-separated string of URLs). None is returned
    when the restaurant is absent from the table or has no URL string.
    """
    matches = images.loc[images.Restaurant == restaurant, "Images"]
    if matches.empty:
        # Restaurant missing from the pictures table: report "no image" instead of raising IndexError.
        return None
    urls = matches.values[0]
    # An empty image field is not a string once loaded from the CSV (presumably NaN).
    if not isinstance(urls, str):
        return None
    return urls.split(',')[0]
data['Image URL'] = data['Name'].apply(get_first_img)

In [16]:
# Count the scraped reviews per restaurant in a single pass over the review
# table instead of filtering it once per restaurant; names without any
# review get 0.
reviews_per_restaurant = reviews['Restaurant'].value_counts()
data['Review Count'] = data['Name'].map(reviews_per_restaurant).fillna(0).astype('int64')

In [17]:
# Display the enriched listing (now with the Address, Image URL and recomputed Review Count columns).
data

Unnamed: 0,Name,Location,Cuisine,Meals,Price Level (out of 4),Rating (out of 6),Review Count,Page URL,Address,Image URL
0,Luovuus kukkii kaaoksesta,Kaartinkaupunki,International,Dinner,0,5.8,475,/en/place/luovuus-kukkii-kaaoksesta-90397/menu,"Pieni Roobertinkatu 13, 00130 Helsinki",https://qul.imgix.net/0685d1bd-cc11-4655-a99a-...
1,Ravintola MyStuu,Punavuori,Swiss,"Lunch, Dessert, Dinner",0,5.6,295,/en/place/ravintola-my-stuu-98898/menu,"Uudenmaankatu 13, 00120 Helsinki",https://qul.imgix.net/8e785a24-7574-4ca0-9bdc-...
2,Gaucho,City Centre,Brazilian,Dinner,0,5.6,1135,/en/place/gaucho-105125/menu,"Aleksanterinkatu 9, 00100 Helsinki",https://qul.imgix.net/2a300fb4-e74e-4d52-b994-...
3,Finlandia Caviar,City Centre,Gourmet,"Lunch, Dinner",0,5.6,144,/en/place/finlandia-caviar-15896/menu,"Eteläranta 20, 00130 Helsinki",https://qul.imgix.net/d0db7c0a-9447-4191-a835-...
4,Restaurant Armenian House,Kamppi,International,"Lunch, Dinner",0,5.6,318,/en/place/armenian-house-55148/menu,"Lönnrotinkatu 27, 00180 Helsinki",https://qul.imgix.net/1c51565b-266f-4f42-90bc-...
...,...,...,...,...,...,...,...,...,...,...
337,Black Sea Kitchen,Kaartinkaupunki,Georgian,"Lunch, Dinner",0,,0,/en/place/black-sea-kitchen-109077/about,"Eteläinen Makasiinikatu 4, 00130 Helsinki",https://qul.imgix.net/7ff8b6cf-ecff-46b2-8c60-...
338,Mad Finn Brewing Co. Taproom Helsinki,Sompasaari,Pizza,Dinner,0,6.0,0,/en/place/mad-finn-brewing-co-taproom-helsinki...,"Sompasaarenlaituri 12, 00540 Helsinki",https://qul.imgix.net/ed95a73a-4256-4b25-b2e7-...
339,Kahvila Mutteri,Lauttasaari,Dessert,Cake & Coffee,0,,0,/en/place/kahvila-mutteri-100773/menu,"Lauttasaarentie 2, 00200 Helsinki",https://qul.imgix.net/2df94a67-0291-4182-aa7f-...
340,Merisali - Hilton Kalastajatorppa,Munkkiniemi,Scandinavian,"Buffet, Dinner",0,5.0,0,/en/place/merisali-hilton-kalastajatorppa-9200...,"Kalastajatorpantie 1, 00330 Helsinki",https://qul.imgix.net/9ff5b034-a390-4bb9-8388-...


In [18]:
# Overwrite the scraped listing file with the enriched table, so later reads of
# "restaurant_data.csv" include the Address and Image URL columns.
data.to_csv("restaurant_data.csv", sep="\t")

#### General allergy score

In [50]:
import pandas as pd
import data_processing as dp

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nesterenkojul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nesterenkojul/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1522.70it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [51]:
# Build the working structures through the project's data_processing module. The call
# returns five objects, unpacked here; their exact contents are defined in data_processing,
# not in this notebook (presumably the restaurant table, per-restaurant review and menu
# texts, and their embeddings - confirm against that module).
data, review_dict, menu_dict, embed_review_dict, embed_menu_dict = dp.initialise_index()

In [52]:
# Score every restaurant from its review texts and the matching embeddings.
rest_allergy_scores = {
    name: dp.general_allergy_score(rest_reviews, embed_review_dict[name])
    for name, rest_reviews in review_dict.items()
}

In [None]:
# Attach each restaurant's score by name; a name missing from
# rest_allergy_scores raises KeyError.
allergy_score_by_row = data['Name'].map(lambda name: rest_allergy_scores[name])
data['General Allergy Score'] = allergy_score_by_row
# NOTE(review): this writes under "data/", while the earlier cells of this
# notebook read and write "restaurant_data.csv" without that prefix - confirm
# which location is current.
data.to_csv("data/restaurant_data.csv", sep="\t")

## Embedding storage

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import json

model = SentenceTransformer('all-MiniLM-L6-v2')

data = pd.read_csv("data/restaurant_data.csv", sep="\t", index_col=0)
reviews = pd.read_csv("data/translated_review_data.csv", sep="\t", index_col=0, lineterminator='\n')
menus = pd.read_csv("data/translated_menu_data.csv", sep="\t", index_col=0)

# Per restaurant name: the English review texts and menu lines, plus the
# output of model.encode for each of them.
review_dict, embed_review_dict = {}, {}
menu_dict, embed_menu_dict = {}, {}
for name in data["Name"]:
    rest_reviews = reviews.loc[reviews["Restaurant"] == name, "Review Text Eng"].values
    rest_menu = menus.loc[menus["Restaurant"] == name, "Menu Eng"].values

    review_dict[name] = rest_reviews
    menu_dict[name] = rest_menu
    embed_review_dict[name] = model.encode(rest_reviews)
    embed_menu_dict[name] = model.encode(rest_menu)

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1928.16it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [17]:
# Writing to file
def dict_to_json_file(d, path):
    """Write a dict of arrays to ``path`` as indented JSON.

    Parameters
    ----------
    d : dict
        Maps keys to objects exposing ``tolist()`` (NumPy arrays in this
        notebook). The dict is not modified: the list conversion is done on
        a copy, so the caller's arrays remain arrays after saving.
    path : str
        Destination file, written as UTF-8.
    """
    serialisable = {key: val.tolist() for key, val in d.items()}
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(serialisable, f, separators=(',', ':'), indent=4)

#dict_to_json_file(embed_review_dict, 'data/embed_review_dict.txt')
#dict_to_json_file(embed_menu_dict, 'data/embed_menu_dict.json')


def save_review_embeds(d):
    """Write each restaurant's review embeddings to its own JSON text file.

    Files are created as ``data/embed_review_dict/<key>.txt`` with every "/"
    in the key replaced by "_" so the key is a valid file name. The content
    is the embedding converted to nested lists and serialised as JSON, which
    keeps every value; ``str()`` of a large array abbreviates it with "..."
    and cannot be parsed back.

    Parameters
    ----------
    d : dict
        Maps restaurant names to array-like embeddings.
    """
    import os  # local import: only needed here to create the output folder

    out_dir = 'data/embed_review_dict'
    os.makedirs(out_dir, exist_ok=True)
    for key, val in d.items():
        filename = key.replace("/", "_")
        with open(f'{out_dir}/{filename}.txt', 'w', encoding='utf-8') as f:
            # np.asarray accepts both arrays and already-converted lists.
            json.dump(np.asarray(val).tolist(), f)

# Write one text file per restaurant under data/embed_review_dict/.
save_review_embeds(embed_review_dict)

In [11]:
# Reading from file
def json_file_to_dict(path):
    """Load a JSON object from ``path`` and return it with NumPy array values.

    The file is read as UTF-8; every value of the top-level object is passed
    through ``np.asarray``, keys are kept as they are.
    """
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    return {key: np.asarray(val) for key, val in raw.items()}

# Load both embedding dictionaries back from JSON and show how many entries each holds.
# NOTE(review): these paths end in .json, while the commented-out writer call in the
# previous cell targets 'data/embed_review_dict.txt' for the review embeddings -
# confirm which file name is current.
embed_review_dict_readed = json_file_to_dict('data/embed_review_dict.json')
embed_menu_dict_readed = json_file_to_dict('data/embed_menu_dict.json')

len(embed_review_dict_readed), len(embed_menu_dict_readed)

(342, 342)

In [None]:
from os import listdir

def load_review_embeds():
    """Load the per-restaurant review embeddings from data/embed_review_dict.

    Every ``*.txt`` file in the folder is parsed as JSON and converted to a
    NumPy array. Other directory entries (for example hidden files created
    by the operating system) are ignored instead of being parsed.

    Returns
    -------
    dict
        Maps restaurant names to arrays. The name is the file name without
        its ".txt" suffix and with "_" turned back into "/".
        NOTE(review): this inverse mapping is lossy for restaurant names
        that genuinely contain an underscore.
    """
    suffix = ".txt"
    dict_readed = {}
    for filename in listdir("data/embed_review_dict"):
        if not filename.endswith(suffix):
            continue
        with open("data/embed_review_dict/" + filename, 'r', encoding='utf-8') as f:
            val = json.load(f)
        # Only the trailing extension is removed, so a ".txt" inside the name survives.
        key = filename[:-len(suffix)].replace("_", "/")
        dict_readed[key] = np.asarray(val)
    return dict_readed

{'Ristorante Il Trio': array([[-0.06644398,  0.13538219,  0.00315978, ...,  0.08065995,
         -0.09039234, -0.05017299],
        [-0.08968529,  0.08835264,  0.03601323, ...,  0.03945779,
         -0.13139504, -0.00681376],
        [-0.00179293,  0.04993234,  0.03869157, ...,  0.05293408,
         -0.09259934, -0.02182556],
        ...,
        [ 0.04042617,  0.04200339,  0.10376553, ...,  0.01063292,
         -0.09435593,  0.00486458],
        [-0.05547674, -0.05776194,  0.05227692, ..., -0.01472125,
         -0.08331089, -0.02679426],
        [ 0.00217182,  0.03239612,  0.01968956, ..., -0.01252618,
         -0.19028094,  0.02764027]], shape=(379, 384)),
 'Luovuus kukkii kaaoksesta': array([[-0.01511008, -0.00349099,  0.04976156, ...,  0.04222299,
         -0.10892852, -0.0237784 ],
        [ 0.00027754,  0.01244679,  0.0293059 , ..., -0.004373  ,
         -0.02271104, -0.02784893],
        [ 0.01105082,  0.05791782,  0.02364988, ...,  0.05526556,
         -0.11717245, -0.04293149]

## Dataviz tests (to be deleted)

In [38]:
import math
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.layouts import row
from bokeh.embed import components
from bokeh.resources import CDN

# NOTE(review): plot_stats below reads a 'General Allergy Score' column, which this notebook
# writes to "data/restaurant_data.csv"; confirm that this un-prefixed path holds the same file.
data = pd.read_csv("restaurant_data.csv", sep="\t", index_col=0)

In [62]:
def plot_stats(data):
    """Show two Bokeh charts summarising the restaurant table and print their embed div.

    Left chart: scatter of rating against review count (missing ratings are
    plotted as 0). Right chart: bar chart of the rescaled General Allergy
    Scores counted in three equal-width bins over [-1, 1], labelled
    Risky / Neutral / Safe. The row of figures is displayed with ``show`` and
    the ``<div>`` returned by ``components`` is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Rating (out of 6)', 'Review Count' and
        'General Allergy Score'.
    """
    gas_threshold = 0.25
    bar_colour = "#6CAF61"

    def recalculate_gas(score):
        # The literal 'Neutral' is placed at the centre of the scale.
        if score == 'Neutral':
            return 0
        value = float(score)
        # NOTE(review): the two branches do not meet at the threshold and the
        # lower one can produce values outside [-1, 1], which the histogram
        # below then ignores - confirm this is the intended rescaling.
        if value <= gas_threshold:
            return -2 * (value + gas_threshold) - 0.5
        return (value - gas_threshold) * 2  - 0.5

    def rating_or_zero(rating):
        return 0 if math.isnan(rating) else rating

    ratings = data['Rating (out of 6)'].apply(rating_or_zero).values
    review_counts = data['Review Count'].values
    allergy_scores = data['General Allergy Score'].apply(recalculate_gas).values

    counts, bin_edges = np.histogram(allergy_scores, bins=3, range=(-1, 1))
    bin_names = ['Risky', 'Neutral', 'Safe']
    # Categorical x labels: "<lower edge> – <upper edge>" over the bin's name.
    bins = [
        f"{round(lower, 1)} – {round(upper, 1)}\n{label}"
        for lower, upper, label in zip(bin_edges[:-1], bin_edges[1:], bin_names)
    ]

    p1 = figure(
        height=300,
        title="Rating VS Number of reviews ",
        toolbar_location=None,
    )
    p2 = figure(
        x_range=bins,
        height=300,
        title="Distribution of General Allergy Scores",
        toolbar_location=None,
    )

    # Scatter panel.
    p1.scatter(x=ratings, y=review_counts, color=bar_colour, size=10, alpha=0.3)
    p1.background_fill_alpha = 0
    p1.y_range.start = 0
    p1.sizing_mode = "stretch_width"

    # Histogram panel.
    p2.vbar(x=bins, top=counts, width=0.9, color=bar_colour, alpha=0.3)
    p2.xgrid.grid_line_color = None
    p2.background_fill_alpha = 0
    p2.y_range.start = 0
    p2.sizing_mode = "stretch_width"

    viz_row = row(p1, p2, sizing_mode="stretch_width")
    show(viz_row)
    script, div = components(viz_row)
    print(div)
    #return script, div, CDN.render()

# Render both charts for the loaded restaurant table.
plot_stats(data=data)

<div id="fe8f886c-dc03-4d82-9a71-3879b1f84e9e" data-root-id="p2935" style="display: contents;"></div>
