In [1]:
import json

from time import sleep
from random import randint

import pandas as pd
import numpy as np

# Our generated code
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2

In [2]:
from web_crawler.scraper_migros import MigrosScraper
from web_crawler.scraper_cora import CoraScraper
from web_crawler.scraper_coop import CoopScraper
from web_crawler.scraper_kaufland import KauflandScraper

# Scrape prices from web

In [None]:
products = pd.read_pickle("./data/products_pd.pickle")
products.dtypes

In [None]:
carbon_footprint_csv = "./data/carbon_footprint.csv"
products = pd.read_csv(carbon_footprint_csv,
                            delimiter=",")

In [None]:
products.Title = products.Title.fillna("")
# products.brands = products.brands.fillna("")

kaufland = KauflandScraper()
# migros = MigrosScraper()
# cora = CoraScraper()

counter = 0;

with open('./data/scraped_products_eaternity.json', mode='w', encoding='utf-8') as f:

    for i, row in products.iterrows():
        counter = counter +1
        query = row['Title']
        try:
            product_dict = kaufland.search(query)
            product_dict['code'] = str(row.ID)
            product_dict['product_name'] = row.Title
        except Exception as err:
            print(err)
            continue
            
        print(product_dict)

        f.write(json.dumps(product_dict)) # use `json.loads` to do the reverse
        sleep(randint(1,5))

            
        # We don't wanna get banned from the server
        sleep(randint(1,10))
            

# Cleanse data

In [3]:
from fuzzyset import FuzzySet

def compute_match_score(product_name_list, store_name_list):
    
    if product_name_list and store_name_list:
        name_fs = FuzzySet(store_name_list)
        name_score = [0 if name_fs.get(i)==None else name_fs.get(i)[0][0] for i in product_name_list]
        return sum(name_score) / len(name_score)  
    else:
        return 0

## Migros data

In [4]:
with open('./data/scraped_products_migros_carbon.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
    
migros_products = pd.DataFrame.from_dict(data)

# If it has no quantity, it is not a food item, so drop it
migros_products = migros_products[migros_products['store_quantity'] != ""]

print(migros_products.dtypes)

code                object
product_name        object
store_categories    object
store_currency      object
store_name          object
store_price         object
store_quantity      object
dtype: object


In [5]:
# Make prices numeric (2 price formats)
migros_products.store_price = pd.to_numeric(migros_products['store_price']\
                                            .apply(lambda x: x.split('.–')[0]),
                                            errors="raise")

Here comes the dirty part of the Migros scraped data. We need to know the quantity of the product, to be able to scale the price, however, it is embedded in text. I try to make clear what is to be done with my code comments, but I guess it would be helpful if you take a look at the data (above) first.

In [6]:
# Make the quantity of items sold numeric
migros_products['amount'] = pd.to_numeric(migros_products['store_quantity']\
                                            .apply(lambda x: x.split('x')[0]),
                                            errors="coerce").fillna(1.0)

In [7]:
migros_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,amount
1,5583,Yogurt Mandarine,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Mandarine,0.55,180g,1.0
2,5585,Yogourt Rhubarbe/Vanille,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Larry's Strawberry-Rhubarb Pastilles,3.3,2 x 23g,2.0
3,5663,Ice Coffee,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Ice Coffee,1.95,165ml,1.0
4,43662,Risotto S. Andrea M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Risotto S.Andrea,2.55,1kg,1.0
5,43663,Riz Carolina Parboiled M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Carolina Trockenreis,2.25,1kg,1.0
6,43674,Riz BASMATI,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Subito Riz Casimir,2.9,170g,1.0
7,43679,Tortelloni ricotta e spinaci,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Tortelloni ricotta e spinaci,3.7,250g,1.0
8,43703,Yogourt fraise BIO,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Bio Joghurt Brombeere-Vanille,0.9,150g,1.0
9,43710,Yogourt Chocolat ferme M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Kokos/Schokolade,0.55,180g,1.0
10,43711,Yogourt Mocca ferme M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Heidelbeer,0.55,180g,1.0


In [8]:
# Extract the quantity per item

# Remove space between number and g or ml unit
migros_products['store_quantity'] = migros_products['store_quantity'].apply(lambda x: "".join(x.split()))

# Extract quantity per item. The number is bound between 1 and 1000, since the unit measure are metric.
migros_products['item_quantity'] = pd.to_numeric(migros_products['store_quantity'].str.extract(r'([0-9]{1,3})[gmlk]', expand=False), errors='coerce')

In [9]:
def compute_migros_price_per_100(row):
        
    quantity = row.amount * row.item_quantity
    if row.store_unit == 'g' or row.store_unit == 'm':
        # Unit in gramm / milliliter -> correct unit
        quantity *= 1        
    elif row.store_unit == 'k' or row.store_unit == 'l':

        # Unit in kilogramm / liter -> convert unit
        quantity *= 1000

    # Price as multiples of 100 gramms / milliliters
    return row.store_price / quantity * 100

In [10]:
migros_products['store_unit'] = migros_products['store_quantity'].str.extract(r'[\d]([gmlk])')

In [11]:

migros_products['price_per_100g'] = migros_products.apply(compute_migros_price_per_100, axis=1)

The webcrawl was automated, so oftentimes we will have selected unrelated items with our search queries. In the following, we quantify our confidence into a item referring to the product in the Open Food Facts database. Currently, we are doing this only by comparing the name strings. We would be safer, if we could also compare the categories. However, they are oftentimes in German/French, while the database is in English and the google translate python API has a bug, and the pull request is still pending (https://github.com/ssut/py-googletrans/pull/78). We are going to improve our matching method, once this issue is fixed.

In [12]:
migros_products['match_scores'] \
    = migros_products.apply(lambda row: compute_match_score(row.product_name.lower().split(),
                                                            row.store_name.lower().split()), 
                            axis=1)

In [13]:
migros_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,amount,item_quantity,store_unit,price_per_100,match_scores
1,5583,Yogurt Mandarine,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Mandarine,0.55,180g,1.0,180,g,0.305556,0.857143
2,5585,Yogourt Rhubarbe/Vanille,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Larry's Strawberry-Rhubarb Pastilles,3.3,2x23g,2.0,23,g,7.173913,0.125
3,5663,Ice Coffee,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Ice Coffee,1.95,165ml,1.0,165,m,1.181818,1.0
4,43662,Risotto S. Andrea M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Risotto S.Andrea,2.55,1kg,1.0,1,k,0.255,0.75
5,43663,Riz Carolina Parboiled M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Carolina Trockenreis,2.25,1kg,1.0,1,k,0.225,0.583333
6,43674,Riz BASMATI,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Subito Riz Casimir,2.9,170g,1.0,170,g,1.705882,0.642857
7,43679,Tortelloni ricotta e spinaci,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Tortelloni ricotta e spinaci,3.7,250g,1.0,250,g,1.48,1.0
8,43703,Yogourt fraise BIO,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,Bio Joghurt Brombeere-Vanille,0.9,150g,1.0,150,g,0.6,0.64986
9,43710,Yogourt Chocolat ferme M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Kokos/Schokolade,0.55,180g,1.0,180,g,0.305556,0.537946
10,43711,Yogourt Mocca ferme M-Classic,"[Startseite, Sortiment, Supermarkt, Lebensmitt...",CHF,M-Classic Joghurt Heidelbeer,0.55,180g,1.0,180,g,0.305556,0.561905


## Monoprix data

In [14]:
with open('./data/scraped_products_monoprix.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
monoprix_products = pd.DataFrame.from_dict(data)

monoprix_products = monoprix_products[monoprix_products['store_currency']=='EURO']

In [15]:
# Make price numeric
monoprix_products.store_price = monoprix_products.store_price.apply(lambda x: float(x[0]))

In [16]:
def monoprix_price_per_100(store_quantity_str):
    tokens = str(store_quantity_str).split()
    if "litre" in tokens or "kg" in tokens:
        return float(tokens[-2])/100
    else:
        return np.nan

In [17]:
# Compute price per 100gr/ml
monoprix_products['price_per_100g'] = monoprix_products.store_quantity.apply(monoprix_price_per_100)

In [18]:
# Compute confidence that the scraped item is the same as in the open food facts database
monoprix_products['match_scores'] = monoprix_products.apply(lambda row: compute_match_score(row.product_name.lower().split(),
                                                               row.store_name.lower().split('-')), 
                                                           axis=1)

In [19]:
monoprix_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,price_per_100,match_scores
0,0013935620236,Boisson au soja,"[Produits frais, Le végétal]",EURO,bjorg-boisson-soja-nature-1l,1.43,Prix au litre : 1.43 €,0.0143,0.666667
1,0014113912853,Pistaches XXL grillées salées,"[Epicerie salée, Diététique]",EURO,biothentic-pistaches-grillees-salees-biologiqu...,5.58,Prix au kg : 37.20 €,0.3720,0.677083
4,0041500007229,Classic Yellow,"[Epicerie salée, Condiments, Huile et Vinaigre]",EURO,heinz-yellow-mustard-classic-445g,2.56,Prix au kg : 5.75 €,0.0575,1.000000
5,0072417146831,Z'animo,"[Loisirs, Jouets]",EURO,playmobil-dj-z,3.99,,,0.222222
6,0074603005212,Noodle Soup,"[Epicerie salée, Plats cuisinés]",EURO,lustrucu-noodle-boeuf-sachet-83g,0.93,Prix au kg : 11.20 €,0.1120,0.583333
8,0205543009934,Haché Porc et Boeuf,"[Le marché, Boucherie, Volaille]",EURO,boulette-provencales-boeuf-porc-x15-375g-bigard,4.15,Prix au kg : 11.07 €,0.1107,0.562500
9,0215579090462,Dinde de Fête,"[Loisirs, Jouets]",EURO,robe-de-princesse-5-7-ans,16.99,,,0.550000
10,0217107083373,Canette fermière de Loué,"[Produits frais, Lait et oeufs]",EURO,loue-6-oeufs-fermiers-de-poules-elevees-en-ple...,3.12,Prix à l'unité : 0.52 €,,0.660714
11,0217463045466,Cerf,"[Epicerie salée, Pâtés, Terrines et Foie gras]",EURO,cora-terrine-de-cerf-au-cognac-180g,1.27,Prix au kg : 7.06 €,0.0706,1.000000
12,0217994094988,Poularde,"[Epicerie sucrée, Biscuits, Pâtisserie]",EURO,mere-poulard-palets-du-mont-saint-michel-125g,1.27,,,0.875000


## Kaufland products

In [20]:
with open('./data/scraped_products_eaternity.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
kaufland_products = pd.DataFrame.from_dict(data)

In [21]:
# Drop all products that do not have a quantity, since in these are no foods
kaufland_products = kaufland_products[kaufland_products['store_quantity'] != ""]


In [22]:
# Make price numeric
kaufland_products.store_price = kaufland_products.store_price.apply(lambda x: float(x[0]))

In [23]:
def compute_kaufland_price_per_100(row):
        
    tokens = row['store_quantity'].split()
    if len(tokens) > 4 and tokens[2] == '=':
        price = float(tokens[3].replace(',','.'))
        if tokens[1] == 'g' or tokens[1] == 'ML':
            quantity = float(tokens[0]) / 100
            price_per_100 = price / quantity

        elif tokens[1] == 'KG' or tokens[1] == 'L':
            quantity = float(tokens[0]) * 10 # *(1000 / 100)
            price_per_100 = price / quantity
            
        else:
            price_per_100 = np.nan

        
    else:
        price_per_100 = np.nan
                
    return price_per_100

In [24]:
# Compute price per 100g
kaufland_products['price_per_100g'] = kaufland_products.apply(compute_kaufland_price_per_100, axis=1)

In [25]:
# Remove quantity specifier from store name
kaufland_products['store_name'] = kaufland_products.store_name.apply(lambda x: " ".join(x.split(" ")[0:-1]))

In [26]:
# Compute confidence that the scraped item is the same as in the open food facts database
kaufland_products['match_scores'] = kaufland_products.apply(lambda row: compute_match_score(row.product_name.lower().split(),
                                                               row.store_name.lower().split()), 
                                                           axis=1)

In [27]:
# Prefilter products by score
kaufland_products = kaufland_products[kaufland_products['match_scores']>0.1].dropna()

In [28]:
kaufland_products

Unnamed: 0,code,product_name,store_categories,store_currency,store_name,store_price,store_quantity,price_per_100,match_scores
0,4300175162708,K Classic - Junger Gemüsemais,"[Konserven, &, Feinkost]",EURO,Junger Gemüsemais,0.0,"1 KG = 1,72 €",0.172,0.400000
1,4388840231829,ja! Gemüsemais,"[Konserven, &, Feinkost]",EURO,Gemüsemais,1.0,"100 g = 0,57 €",0.570,0.500000
2,4003994111000,Kelloggs Cornflakes Die Originalen 375 g,[Frühstück],EURO,Cornflakes,2.0,"1 KG = 6,64 €",0.664,0.250000
3,4005009100542,Tortilla Chips Meersalz,"[Süßes, &, Salziges]",EURO,Tortilla Chips Tex Barbecue,0.0,"1 KG = 2,97 €",0.297,0.666667
4,4009790005628,Meienburg Sonnenblumenkerne,"[Obst, &, Gemüse]",EURO,Sonnenblumenkerne,1.0,"1 KG = 2,76 €",0.276,0.647059
5,7610169013310,Migros Engagement Bio Pumpernickel,[Getränke],EURO,Erfrischungsgetränk Holunder-Cranberry,0.0,"1 L = 0,98 € zzgl. 0.25 Pfand",0.098,0.134503
6,4305399028052,Naturkind - Pumpernickel,[Frühstück],EURO,Kirsch-Pumpernickel-Schnitte,2.0,"1 KG = 6,92 €",0.692,0.190476
7,8690777653008,Sera Ajvar,"[Konserven, &, Feinkost]",EURO,Ajvar mild,2.0,"100 g = 1,15 €",1.150,0.500000
8,4009249020240,Lieken Urkorn – Kleines Kerni mit Sonnenblumen...,[Frühstück],EURO,Beeren Urkorn Müsli,4.0,"1 KG = 15,24 €",1.524,0.338095
9,4071800000824,Harry - Das volle Korn,[Frühstück],EURO,Bio Volles Korn Brot,1.0,"1 KG = 3,18 €",0.318,0.400000


# Export prices

In [29]:
prices = pd.concat([migros_products[['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']], 
                    monoprix_products[['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']],
                    kaufland_products[['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']]
                   ])

In [30]:
prices

Unnamed: 0,code,product_name,store_currency,price_per_100,match_scores
1,5583,Yogurt Mandarine,CHF,0.305556,0.857143
2,5585,Yogourt Rhubarbe/Vanille,CHF,7.173913,0.125000
3,5663,Ice Coffee,CHF,1.181818,1.000000
4,43662,Risotto S. Andrea M-Classic,CHF,0.255000,0.750000
5,43663,Riz Carolina Parboiled M-Classic,CHF,0.225000,0.583333
6,43674,Riz BASMATI,CHF,1.705882,0.642857
7,43679,Tortelloni ricotta e spinaci,CHF,1.480000,1.000000
8,43703,Yogourt fraise BIO,CHF,0.600000,0.649860
9,43710,Yogourt Chocolat ferme M-Classic,CHF,0.305556,0.537946
10,43711,Yogourt Mocca ferme M-Classic,CHF,0.305556,0.561905


In [31]:
# Only export items with a credible match score (threhsold found from trial and error)
prices_filtered = prices[prices['match_scores']>0.0]
prices_filtered.dtypes

code               object
product_name       object
store_currency     object
price_per_100     float64
match_scores      float64
dtype: object

In [32]:
file_name = './data/prices.csv'
prices_filtered.set_index('code').to_csv(file_name)

# Testbed for new website

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup

In [None]:
BASE_URL = "https://www.kaufland.de/suche.assortmentSearch.html?q="

driver = webdriver.Firefox(executable_path='/home/kingkolibri/Programs/geckodriver')
driver.implicitly_wait(2)

In [None]:
query = "Sahne"

driver.get(BASE_URL + query )




In [None]:
# Find object on search results page
python_button = driver.find_element_by_css_selector(
    'div.t-search-result__list-item:nth-child(1) > div:nth-child(1) > a:nth-child(1)')  # First article in grid view
try:
    python_button.click()  # click link
except Exception:
    print(Exception.with_traceback)
    # Accept the cookies
    python_button = driver.find_element_by_css_selector(
    '.a-button--outlined > button:nth-child(1)')  # First article in grid view
    python_button.click()  # click link
    
    # Retry element
    python_button = driver.find_element_by_css_selector(
    'div.t-search-result__list-item:nth-child(1) > div:nth-child(1) > a:nth-child(1)')  # First article in grid view
    python_button.click()  # click link

    
# Hand the page source to Beautiful Soup
soup = BeautifulSoup(driver.page_source, 'lxml')

In [None]:
# Product name
''.join(soup.select_one('.t-assortment-detail__title').text).strip()

In [None]:
# Store price
soup.select_one('.a-pricetag__price').text

In [None]:
# Quantity
''.join(soup.select_one('.t-assortment-detail__basic-price').text).strip()

In [None]:
# Category in store
soup.select_one('.m-offer-categories__title').text