In [None]:
import json

from time import sleep
from random import randint

import pandas as pd
import numpy as np

# Our generated code
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project packages imported below resolve - confirm layout).
nb_dir = os.path.dirname(os.getcwd())
if not any(entry == nb_dir for entry in sys.path):
    sys.path.append(nb_dir)
    
%load_ext autoreload
%autoreload 2

In [None]:
from web_crawler.scraper_migros import MigrosScraper
from web_crawler.scraper_cora import CoraScraper
from web_crawler.scraper_coop import CoopScraper
from web_crawler.scraper_kaufland import KauflandScraper

# Scrape prices from web

In [None]:
# Load the pickled product table and display its column dtypes.
# NOTE(review): `products` is reassigned from the carbon-footprint CSV in the
# next cell, so this frame only feeds the dtype display here.
# Unpickling can execute arbitrary code - only load files you trust.
products = pd.read_pickle("./data/products_pd.pickle")
products.dtypes

In [None]:
# Read the comma-separated carbon-footprint product list into `products`.
carbon_footprint_csv = "./data/carbon_footprint.csv"
products = pd.read_csv(carbon_footprint_csv, sep=",")

In [None]:
products.Title = products.Title.fillna("")
# products.brands = products.brands.fillna("")

kaufland = KauflandScraper()
# migros = MigrosScraper()
# cora = CoraScraper()

counter = 0

# Successful results are collected here and written as ONE JSON array.
# Writing each dict back-to-back produces a file that `json.load` (used when
# this file is read again further down) cannot parse.
scraped_products = []

try:
    for i, row in products.iterrows():
        counter += 1
        query = row['Title']
        try:
            product_dict = kaufland.search(query)
            product_dict['code'] = str(row.ID)
            product_dict['product_name'] = row.Title
        except Exception as err:
            # Best effort: report the failure and carry on with the next product
            print(err)
        else:
            print(product_dict)
            scraped_products.append(product_dict)

        # We don't wanna get banned from the server: pause after every
        # query, whether it succeeded or failed
        sleep(randint(1, 10))
finally:
    # Persist whatever was collected, even if the loop was interrupted
    with open('./data/scraped_products_eaternity.json', mode='w', encoding='utf-8') as f:
        json.dump(scraped_products, f)  # use `json.load` to do the reverse
            

# Cleanse data

In [None]:
from fuzzyset import FuzzySet

def compute_match_score(product_name_list, store_name_list):
    """Average fuzzy-match score of the product-name tokens against the store-name tokens.

    A FuzzySet is built from ``store_name_list`` and each entry of
    ``product_name_list`` is looked up in it. A token with no match (lookup
    returns None) counts as 0; otherwise the first element of the first
    returned entry is taken as its score.

    Args:
        product_name_list: list of tokens of the reference product name.
        store_name_list: list of tokens of the scraped store item name.

    Returns:
        The mean of the per-token scores, or 0 if either list is empty.
    """
    if not product_name_list or not store_name_list:
        return 0

    name_fs = FuzzySet(store_name_list)
    name_score = []
    for token in product_name_list:
        # Look each token up once and reuse the result
        matches = name_fs.get(token)
        name_score.append(0 if matches is None else matches[0][0])
    return sum(name_score) / len(name_score)

## Migros data

In [None]:
with open('./data/scraped_products_migros_carbon.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)

migros_products = pd.DataFrame.from_dict(data)

# If it has no quantity, it is not a food item, so drop it.
# .copy() makes the filtered frame independent, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
migros_products = migros_products[migros_products['store_quantity'] != ""].copy()

print(migros_products.dtypes)

In [None]:
# Make prices numeric (2 price formats): keep only the text in front of a
# '.–' marker, then convert; any value that is still not a number raises.
migros_products.store_price = pd.to_numeric(
    migros_products['store_price'].apply(lambda price_text: price_text.split('.–')[0]),
    errors="raise",
)

Here comes the messy part of the Migros scraped data. To scale the price we need to know the quantity of the product; however, it is embedded in free text. The code comments explain each step, but it helps to take a look at the data (above) first.

In [None]:
# Make the quantity of items sold numeric.
# The text in front of the first 'x' in `store_quantity` is parsed as a number;
# anything that is not numeric becomes NaN (errors="coerce") and is then
# replaced by 1.0.
migros_products['amount'] = pd.to_numeric(migros_products['store_quantity']\
                                            .apply(lambda x: x.split('x')[0]),
                                            errors="coerce").fillna(1.0)

In [None]:
# Display the Migros frame, which at this point includes the `amount` column
migros_products

In [None]:
# Extract the quantity per item

# Remove space between number and g or ml unit
migros_products['store_quantity'] = migros_products['store_quantity'].apply(lambda x: "".join(x.split()))

# Extract quantity per item: the complete number directly in front of the
# first unit letter (g, ml, l, kg). The number may have any count of digits
# and an optional decimal part ("1000g" -> 1000, "1.5l" / "1,5l" -> 1.5);
# a fixed 1-3 digit pattern would truncate such values. Rows without a
# number + unit end up as NaN.
migros_products['item_quantity'] = pd.to_numeric(
    migros_products['store_quantity']
    .str.extract(r'(\d+(?:[.,]\d+)?)[gmlk]', expand=False)
    .str.replace(',', '.', regex=False),
    errors='coerce',
)

In [None]:
def compute_migros_price_per_100(row):
    """Price of a Migros item per 100 gram / 100 milliliter.

    Args:
        row: record with the fields ``amount`` (number of items sold
            together), ``item_quantity`` (quantity of one item),
            ``store_unit`` (first letter of the unit: 'g', 'm', 'k' or 'l')
            and ``store_price``.

    Returns:
        ``store_price`` scaled to 100 g / 100 ml, or NaN when the total
        quantity is missing or not positive (no meaningful price per 100
        exists and a plain division could raise ZeroDivisionError).
    """
    quantity = row.amount * row.item_quantity

    # 'g' (gramm) and 'm' (milliliter) are already in the target unit;
    # 'k' (kilogramm) and 'l' (liter) have to be converted.
    if row.store_unit in ('k', 'l'):
        quantity *= 1000

    # Also catches NaN, because every comparison with NaN is False
    if not quantity > 0:
        return np.nan

    # Price as multiples of 100 gramms / milliliters
    return row.store_price / quantity * 100

In [None]:
# First unit letter that directly follows a digit: 'g', 'm', 'k' or 'l'.
# expand=False returns a Series (not a one-column DataFrame), matching how
# `item_quantity` is extracted.
migros_products['store_unit'] = migros_products['store_quantity'].str.extract(r'[\d]([gmlk])', expand=False)

In [None]:

# Apply compute_migros_price_per_100 to every row (axis=1) and store the result
migros_products['price_per_100'] = migros_products.apply(compute_migros_price_per_100, axis=1)

The web crawl was automated, so our search queries will often have selected unrelated items. In the following, we quantify our confidence that an item refers to the product in the Open Food Facts database. Currently, we do this only by comparing the name strings. It would be safer if we could also compare the categories. However, they are often in German/French, while the database is in English, and the Google Translate Python API has a bug whose fix is still a pending pull request (https://github.com/ssut/py-googletrans/pull/78). We are going to improve our matching method once this issue is fixed.

In [None]:
# Score how well each scraped store name matches the product name, using the
# lower-cased, whitespace-separated words of both names.
migros_products['match_scores'] = migros_products.apply(
    lambda item: compute_match_score(item.product_name.lower().split(),
                                     item.store_name.lower().split()),
    axis=1,
)

In [None]:
# Display the Migros frame including `price_per_100` and `match_scores`
migros_products

## Monoprix data

In [None]:
with open('./data/scraped_products_monoprix.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)

monoprix_products = pd.DataFrame.from_dict(data)

# Keep only the items priced in EURO.
# .copy() makes the filtered frame independent, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
monoprix_products = monoprix_products[monoprix_products['store_currency']=='EURO'].copy()

In [None]:
# Make price numeric: convert the first element of every `store_price` entry
# to float.
# NOTE(review): presumably each entry is a list of price strings; if entries
# were plain strings, x[0] would be only the first character - confirm.
monoprix_products.store_price = monoprix_products.store_price.apply(lambda x: float(x[0]))

In [None]:
def monoprix_price_per_100(store_quantity_str):
    """Derive a price per 100 g / 100 ml from a Monoprix quantity text.

    The text is split on whitespace. Only texts containing the token
    "litre" or "kg" are evaluated; for those the second-to-last token is
    read as a number (a decimal comma is accepted) and divided by 100.

    Args:
        store_quantity_str: scraped quantity text; any value is accepted
            and converted with ``str()``.

    Returns:
        The scaled number, or NaN when no "litre"/"kg" token is present or
        the second-to-last token is missing or not numeric.
    """
    tokens = str(store_quantity_str).split()
    if "litre" not in tokens and "kg" not in tokens:
        return np.nan

    try:
        # NOTE(review): a price per kg / litre would need a divisor of 10 to
        # give a price per 100 g / ml; the divisor of 100 is kept unchanged
        # here - confirm against the scraped data.
        return float(tokens[-2].replace(',', '.')) / 100
    except (IndexError, ValueError):
        # Fewer than two tokens, or the token is not a number
        return np.nan

In [None]:
# Compute price per 100gr/ml by applying monoprix_price_per_100 to every
# `store_quantity` text; texts it cannot evaluate yield NaN
monoprix_products['price_per_100'] = monoprix_products.store_quantity.apply(monoprix_price_per_100)

In [None]:
# Confidence that the scraped item is the same as in the open food facts
# database. The product name is split on whitespace, the Monoprix store name
# on '-'.
monoprix_products['match_scores'] = monoprix_products.apply(
    lambda item: compute_match_score(item.product_name.lower().split(),
                                     item.store_name.lower().split('-')),
    axis=1,
)

In [None]:
# Display the Monoprix frame including `price_per_100` and `match_scores`
monoprix_products

## Kaufland products

In [None]:
# Load the Kaufland scrape results (the file written by the scraping cell)
# as a single JSON document and turn it into a DataFrame
with open('./data/scraped_products_eaternity.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)
    
kaufland_products = pd.DataFrame.from_dict(data)

In [None]:
# Drop all products that do not have a quantity, since these are not foods.
# .copy() makes the filtered frame independent, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
kaufland_products = kaufland_products[kaufland_products['store_quantity'] != ""].copy()


In [None]:
# Make price numeric: convert the first element of every `store_price` entry
# to float.
# NOTE(review): presumably each entry is a list of price strings; if entries
# were plain strings, x[0] would be only the first character - confirm.
kaufland_products.store_price = kaufland_products.store_price.apply(lambda x: float(x[0]))

In [None]:
def compute_kaufland_price_per_100(row):
    """Price per 100 g / 100 ml parsed from a Kaufland base-price text.

    ``row['store_quantity']`` is split on whitespace and must have more
    than four tokens of the form ``<quantity> <unit> = <price> ...``.
    Numbers may use a decimal comma. Units are matched case-insensitively:
    g / ml are taken as they are, kg / l are converted.

    Args:
        row: mapping or record providing the key ``'store_quantity'``.

    Returns:
        ``price / (quantity expressed in multiples of 100 g or 100 ml)``,
        or NaN when the text has a different layout, an unknown unit, a
        non-numeric or non-positive quantity, or a non-numeric price.
    """
    tokens = row['store_quantity'].split()
    if len(tokens) <= 4 or tokens[2] != '=':
        return np.nan

    try:
        price = float(tokens[3].replace(',', '.'))
        amount = float(tokens[0].replace(',', '.'))
    except ValueError:
        return np.nan

    unit = tokens[1].lower()
    if unit in ('g', 'ml'):
        hundreds = amount / 100
    elif unit in ('kg', 'l'):
        hundreds = amount * 10  # *(1000 / 100)
    else:
        return np.nan

    # A zero (or negative) reference quantity has no meaningful unit price
    if not hundreds > 0:
        return np.nan

    return price / hundreds

In [None]:
# Compute price per 100g by applying compute_kaufland_price_per_100 to every
# row (axis=1)
kaufland_products['price_per_100'] = kaufland_products.apply(compute_kaufland_price_per_100, axis=1)

In [None]:
# Remove quantity specifier from store name: drop the last space-separated
# token of every name (a name without any space becomes "").
# NOTE: this cell is not idempotent - each re-run removes one more token.
kaufland_products['store_name'] = kaufland_products.store_name.apply(lambda x: " ".join(x.split(" ")[0:-1]))

In [None]:
# Confidence that the scraped item is the same as in the open food facts
# database, based on the lower-cased, whitespace-separated words of both names.
kaufland_products['match_scores'] = kaufland_products.apply(
    lambda item: compute_match_score(item.product_name.lower().split(),
                                     item.store_name.lower().split()),
    axis=1,
)

In [None]:
# Prefilter products by score: keep rows whose match score is above 0.1,
# then drop every row that still has a missing value in any column
kaufland_products = kaufland_products[kaufland_products['match_scores']>0.1].dropna()

In [None]:
# Display the prefiltered Kaufland frame
kaufland_products

# Export prices

In [None]:
# Columns shared by all three store frames that are exported
export_columns = ['code', 'product_name', 'store_currency', 'price_per_100', 'match_scores']

# Stack the rows of the three stores below each other
prices = pd.concat(
    [store_frame[export_columns]
     for store_frame in (migros_products, monoprix_products, kaufland_products)]
)

In [None]:
# Only export items with a credible match score (threshold found from trial and error).
# With the threshold at 0.0, every row with a strictly positive score is kept.
prices_filtered = prices[prices['match_scores']>0.0]
prices_filtered.dtypes

In [None]:
# Write the filtered prices to CSV, with `code` as the index column
file_name = './data/prices.csv'
prices_filtered.set_index('code').to_csv(file_name)

# Testbed for new website

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup

In [None]:
# Search URL; the query string is appended directly
BASE_URL = "https://www.kaufland.de/suche.assortmentSearch.html?q="

# Location of the geckodriver binary. Set the GECKODRIVER_PATH environment
# variable to run this on another machine; otherwise the previous
# machine-specific default is used.
geckodriver_path = os.environ.get('GECKODRIVER_PATH', '/home/kingkolibri/Programs/geckodriver')

driver = webdriver.Firefox(executable_path=geckodriver_path)
driver.implicitly_wait(2)

In [None]:
# Open the search results page for a sample query.
# NOTE(review): the query is appended without URL encoding - fine for
# "Sahne", but queries with spaces or special characters may need quoting.
query = "Sahne"

driver.get(BASE_URL + query )




In [None]:
# Selectors used on the search results page
first_result_selector = 'div.t-search-result__list-item:nth-child(1) > div:nth-child(1) > a:nth-child(1)'  # First article in grid view
cookie_button_selector = '.a-button--outlined > button:nth-child(1)'  # Cookie consent button

# Find object on search results page
python_button = driver.find_element_by_css_selector(first_result_selector)
try:
    python_button.click()  # click link
except Exception as err:
    # Show the error that was actually raised
    print(repr(err))

    # Accept the cookies
    python_button = driver.find_element_by_css_selector(cookie_button_selector)
    python_button.click()  # click button

    # Retry element
    python_button = driver.find_element_by_css_selector(first_result_selector)
    python_button.click()  # click link


# Hand the page source to Beautiful Soup
soup = BeautifulSoup(driver.page_source, 'lxml')

In [None]:
# Product name: text of the detail-page title element, surrounding whitespace removed
soup.select_one('.t-assortment-detail__title').text.strip()

In [None]:
# Store price: raw text of the first element matching the price-tag selector
soup.select_one('.a-pricetag__price').text

In [None]:
# Quantity: text of the basic-price element, surrounding whitespace removed
soup.select_one('.t-assortment-detail__basic-price').text.strip()

In [None]:
# Category in store: raw text of the first element matching the category-title selector
soup.select_one('.m-offer-categories__title').text