In [1]:
import json

from time import sleep
from random import randint

import pandas as pd
import numpy as np

# Our generated code
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2

In [2]:
from web_crawler.scraper_migros import MigrosScraper
from web_crawler.scraper_cora import CoraScraper
from web_crawler.scraper_coop import CoopScraper

# Scrape prices from web

In [None]:
products = pd.read_pickle("./data/products_pd.pickle")

In [None]:
products.loc[products["stores"].apply({"Migros"}.issubset)]

In [None]:
products.product_name = products.product_name.fillna("")
products.brands = products.brands.fillna("")

migros = MigrosScraper()
# cora = CoraScraper()

counter = 0;

with open('./data/scraped_products_migros.json', mode='w', encoding='utf-8') as f:

    for i, row in products.iterrows():
        if "Migros" in row.stores:
            counter = counter +1
            query = row['product_name']
            try:
                product_dict = migros.search(query)
                product_dict['code'] = row.name
                product_dict['product_name'] = row.product_name
            except Exception as err:
                print(err)
                continue
                
            f.write(json.dumps(product_dict)) # use `json.loads` to do the reverse
            sleep(randint(1,5))

            
        elif ("Cora" in row.stores) & (1==0): # skip for this run
            query = row['product_name']
            print(query.title())
            try:
                product_dict = cora.search(query)
                product_dict['code'] = row.name
                product_dict['product_name'] = row.product_name
            except Exception as err:
                print(err)
                continue
                
            f.write(json.dumps(product_dict) + ',') # use `json.loads` to do the reverse
            
            # We don't wanna get banned from the server
            sleep(randint(1,10))
            

# Cleanse data

In [34]:
from fuzzyset import FuzzySet

def compute_match_score(product_name_list, store_name_list):
    
    if product_name_list and store_name_list:
        name_fs = FuzzySet(store_name_list)
        name_score = [0 if name_fs.get(i)==None else name_fs.get(i)[0][0] for i in product_name_list]
        return sum(name_score) / len(name_score)  
    else:
        return 0

## Migros data

In [14]:
with open('./data/scraped_products_migros.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
    
migros_products = pd.DataFrame.from_dict(data)

# If it has no quantity, it is not a food item, so drop it
migros_products = migros_products[migros_products['store_quantity'] != ""]

print(migros_products.shape)

(837, 7)


In [15]:
# Make prices numeric (2 price formats)
migros_products.store_price = pd.to_numeric(migros_products['store_price']\
                                            .apply(lambda x: x.split('.–')[0]),
                                            errors="raise")

In [16]:
migros_products.store_price.iloc[421]

1.8

Here comes the dirty part of the Migros scraped data. We need to know the quantity of the product, to be able to scale the price, however, it is embedded in text. I try to make clear what is to be done with my code comments, but I guess it would be helpful if you take a look at the data (above) first.

In [17]:
# Make the quantity of items sold numeric
migros_products['amount'] = pd.to_numeric(migros_products['store_quantity']\
                                            .apply(lambda x: x.split('x')[0]),
                                            errors="coerce").fillna(1.0)

In [18]:
migros_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,amount
1,0039047001824,Chocolate Chip Shortbread,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Walkers Chocolate Chip Shortbread,5.00,175g,1.0
2,00840293,maizena,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Patissier Maisstärke,2.00,300g,1.0
3,08520223,thon rosé,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic MSC Weisser Thon,2.60,155g,1.0
4,10013117,Bifidus Drink,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Bifidus Probiotic Drink Pfirsich,6.40,8 x 100ml,8.0
5,12051018,Moutarde douce,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Zweifel Chips Moutarde,3.95,175g,1.0
6,2000000014607,Galettes de riz complet au chocolat au lait,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Petit Beurre Chocolat au Lait,4.20,4 x 50g,4.0
7,2000000029412,Petit Beurre,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Petit Beurre,1.45,230g,1.0
8,2000000031297,Fromage frais aux herbes,"[Startseite, Sortiment, Supermarkt, Beauty, Kö...",CHF,Herbs Kräuter Bad Vitalizing,5.50,400ml,1.0
9,2000000043478,Saray Baharatlı Pizza Kraker,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Budget Pizza Margherita,2.70,3 x 320g,3.0
10,2110103000000,Le Gruyère doux,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Heidi Le Gruyère mittelreif,2.25,100 g,1.0


In [29]:
# Extract the quantity per item

# Remove space between number and g or ml unit
migros_products['store_quantity'] = migros_products['store_quantity'].apply(lambda x: "".join(x.split()))

# Extract quantity per item. The number is bound between 1 and 1000, since the unit measure are metric.
migros_products['item_quantity'] = pd.to_numeric(migros_products['store_quantity'].str.extract(r'([0-9]{1,3})[gmlk]', expand=False), errors='coerce')

In [30]:
def compute_migros_price_per_100(row):
        
    quantity = row.amount * row.item_quantity
    if row.store_unit == 'g' or row.store_unit == 'm':
        # Unit in gramm / milliliter -> correct unit
        quantity *= 1        
    elif row.store_unit == 'k' or row.store_unit == 'l':

        # Unit in kilogramm / liter -> convert unit
        quantity *= 1000

    # Price as multiples of 100 gramms / milliliters
    return row.store_price / quantity * 100

In [31]:
migros_products['store_unit'] = migros_products['store_quantity'].str.extract(r'[\d]([gmlk])')

In [32]:

migros_products['price_per_100'] = migros_products.apply(compute_migros_price_per_100, axis=1)

The webcrawl was automated, so oftentimes we will have selected unrelated items with our search queries. In the following, we quantify our confidence into a item referring to the product in the Open Food Facts database. Currently, we are doing this only by comparing the name strings. We would be safer, if we could also compare the categories. However, they are oftentimes in German/French, while the database is in English and the google translate python API has a bug, and the pull request is still pending (https://github.com/ssut/py-googletrans/pull/78). We are going to improve our matching method, once this issue is fixed.

In [35]:
migros_products['match_scores'] \
    = migros_products.apply(lambda row: compute_match_score(row.product_name.lower().split(),
                                                            row.store_name.lower().split()), 
                            axis=1)

In [36]:
migros_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,amount,item_quantity,store_unit,price_per_100,match_scores
1,0039047001824,Chocolate Chip Shortbread,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Walkers Chocolate Chip Shortbread,5.00,175g,1.0,175.0,g,2.857143,1.000000
2,00840293,maizena,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Patissier Maisstärke,2.00,300g,1.0,300.0,g,0.666667,0.300000
3,08520223,thon rosé,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic MSC Weisser Thon,2.60,155g,1.0,155.0,g,1.677419,0.500000
4,10013117,Bifidus Drink,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Bifidus Probiotic Drink Pfirsich,6.40,8x100ml,8.0,100.0,m,0.800000,1.000000
5,12051018,Moutarde douce,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Zweifel Chips Moutarde,3.95,175g,1.0,175.0,g,2.257143,0.687500
6,2000000014607,Galettes de riz complet au chocolat au lait,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Petit Beurre Chocolat au Lait,4.20,4x50g,4.0,50.0,g,2.100000,0.614583
7,2000000029412,Petit Beurre,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Petit Beurre,1.45,230g,1.0,230.0,g,0.630435,1.000000
8,2000000031297,Fromage frais aux herbes,"[Startseite, Sortiment, Supermarkt, Beauty, Kö...",CHF,Herbs Kräuter Bad Vitalizing,5.50,400ml,1.0,400.0,m,1.375000,0.258333
9,2000000043478,Saray Baharatlı Pizza Kraker,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Budget Pizza Margherita,2.70,3x320g,3.0,320.0,g,0.281250,0.450000
10,2110103000000,Le Gruyère doux,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Heidi Le Gruyère mittelreif,2.25,100g,1.0,100.0,g,2.250000,0.666667


## Monoprix data

In [37]:
with open('./data/scraped_products_monoprix.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
monoprix_products = pd.DataFrame.from_dict(data)

monoprix_products = monoprix_products[monoprix_products['store_currency']=='EURO']

In [38]:
# Make price numeric
monoprix_products.store_price = monoprix_products.store_price.apply(lambda x: float(x[0]))

In [39]:
def monoprix_price_per_100(store_quantity_str):
    tokens = str(store_quantity_str).split()
    if "litre" in tokens or "kg" in tokens:
        return float(tokens[-2])/100
    else:
        return np.nan

In [40]:
# Compute price per 100gr/ml
monoprix_products['price_per_100'] = monoprix_products.store_quantity.apply(monoprix_price_per_100)

In [41]:
# Compute confidence that the scraped item is the same as in the open food facts database
monoprix_products['match_scores'] = monoprix_products.apply(lambda row: compute_match_score(row.product_name.lower().split(),
                                                               row.store_name.lower().split('-')), 
                                                           axis=1)

In [42]:
monoprix_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,price_per_100,match_scores
0,0013935620236,Boisson au soja,"[Produits frais, Le végétal]",EURO,bjorg-boisson-soja-nature-1l,1.43,Prix au litre : 1.43 €,0.0143,0.666667
1,0014113912853,Pistaches XXL grillées salées,"[Epicerie salée, Diététique]",EURO,biothentic-pistaches-grillees-salees-biologiqu...,5.58,Prix au kg : 37.20 €,0.3720,0.677083
4,0041500007229,Classic Yellow,"[Epicerie salée, Condiments, Huile et Vinaigre]",EURO,heinz-yellow-mustard-classic-445g,2.56,Prix au kg : 5.75 €,0.0575,1.000000
5,0072417146831,Z'animo,"[Loisirs, Jouets]",EURO,playmobil-dj-z,3.99,,,0.222222
6,0074603005212,Noodle Soup,"[Epicerie salée, Plats cuisinés]",EURO,lustrucu-noodle-boeuf-sachet-83g,0.93,Prix au kg : 11.20 €,0.1120,0.583333
8,0205543009934,Haché Porc et Boeuf,"[Le marché, Boucherie, Volaille]",EURO,boulette-provencales-boeuf-porc-x15-375g-bigard,4.15,Prix au kg : 11.07 €,0.1107,0.562500
9,0215579090462,Dinde de Fête,"[Loisirs, Jouets]",EURO,robe-de-princesse-5-7-ans,16.99,,,0.550000
10,0217107083373,Canette fermière de Loué,"[Produits frais, Lait et oeufs]",EURO,loue-6-oeufs-fermiers-de-poules-elevees-en-ple...,3.12,Prix à l'unité : 0.52 €,,0.660714
11,0217463045466,Cerf,"[Epicerie salée, Pâtés, Terrines et Foie gras]",EURO,cora-terrine-de-cerf-au-cognac-180g,1.27,Prix au kg : 7.06 €,0.0706,1.000000
12,0217994094988,Poularde,"[Epicerie sucrée, Biscuits, Pâtisserie]",EURO,mere-poulard-palets-du-mont-saint-michel-125g,1.27,,,0.875000


# Export prices

In [45]:
prices = migros_products[['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']]


In [46]:
prices = pd.concat([prices, 
                    monoprix_products[['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']]
                   ])

In [48]:
# Only export items with a credible match score (threhsold found from trial and error)
prices_filtered = prices[prices['match_scores']>0.55]
prices_filtered

Unnamed: 0,code,product_name,store_currency,price_per_100,match_scores
1,0039047001824,Chocolate Chip Shortbread,CHF,2.857143,1.000000
4,10013117,Bifidus Drink,CHF,0.800000,1.000000
5,12051018,Moutarde douce,CHF,2.257143,0.687500
6,2000000014607,Galettes de riz complet au chocolat au lait,CHF,2.100000,0.614583
7,2000000029412,Petit Beurre,CHF,0.630435,1.000000
10,2110103000000,Le Gruyère doux,CHF,2.250000,0.666667
11,2114296007556,Saucisson vaudois,CHF,1.800000,0.666667
12,2114413003201,delikatess fleischkase,CHF,1.304348,0.954545
14,2126744000002,Bünder Bergkäse,CHF,2.450000,0.687500
15,2130270006407,"Saucisse aux choux vaudoise,",CHF,1.400000,0.861111


In [50]:
file_name = './data/prices.csv'
prices_filtered.set_index('code').to_csv(file_name)

# Testbed for new website

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup

In [None]:
BASE_URL = "https://search.migros.ch/de/q:"

driver = webdriver.Firefox(executable_path='/home/kingkolibri/Programs/geckodriver')
driver.implicitly_wait(2)

In [None]:
query = "Chocolate Chip Shortbread"

driver.get(BASE_URL + query )




In [None]:
# Find object on search results page
python_button = driver.find_element_by_css_selector(
    '.mui-list-products-wide > li:nth-child(1) > a:nth-child(1)')  # First article in grid view
python_button.click()  # click link

# Hand the page source to Beautiful Soup
soup = BeautifulSoup(driver.page_source, 'lxml')

In [None]:
''.join(soup.select_one('.sidebar-product-name').text).strip()

In [None]:
soup.select_one('.current-price').text

In [None]:
''.join(soup.select_one('p.sidebar-subtext').text).strip()

In [None]:
soup.select_one('.mui-breadcrumb').text.split()

In [None]:
element = soup.select_one('#title')

In [None]:
'-'.join([i.text for i in soup.select_one('#title').findAll('span')[:-2]])