# Scraping Eroski

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

In [3]:
HEADER = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/90.0.4430.85 Safari/537.36"}

In [4]:
# List of products included in the analysis by url

eroski_url_dict = {101: f'https://supermercado.eroski.es/es/productdetail/23628514-tomate-ecologico-eroski-natur-bio-al-peso-compra-minima-500-g/',
                102: f'https://supermercado.eroski.es/es/productdetail/23628209-platano-de-canarias-eco-eroski-natur-bio-al-peso-compra-minima-1-kg/',
                103: f'https://supermercado.eroski.es/es/productdetail/23628084-manzana-roja-ecologica-eroski-natur-bio-al-peso-compra-minima-1-kg/',
                104: f'https://supermercado.eroski.es/es/productdetail/23628340-calabacin-ecologico-eroski-natur-bio-al-peso-compra-minima-500-g/', 
                105: f'https://supermercado.eroski.es/es/productdetail/23628548-zanahoria-ecologica-eroski-natur-bio-bandeja-600-g/',
                106: f'https://supermercado.eroski.es/es/productdetail/23628431-patata-ecologica-eroski-natur-bio-al-peso-compra-minima-1-kg/'}
                #501: f'https://supermercado.eroski.es/es/productdetail/24031197-chocolate-negro-74-costa-de-marfil-ethiquable-tableta-100-g/',
                #703: f'https://supermercado.eroski.es/es/productdetail/11735339-bebida-de-avena-yosoy-brik-1-litro/',
                #705: f'https://supermercado.eroski.es/es/productdetail/14854624-leche-semidesnatada-ecologica-puleva-brik-1-litro/',
                #707: f'https://supermercado.eroski.es/es/productdetail/12450268-kefir-ecologico-de-cabra-cantero-de-letur-frasco-420-g/',
                #901: f'https://supermercado.eroski.es/es/productdetail/16425886-tortellini-integral-queso-espinacas-natursoy-bandeja-250-g/',        
                #1201: f'https://supermercado.eroski.es/es/productdetail/311712-cerveza-mahou-clasica-lata-33-cl/'}

In [5]:
url_eroski = f'https://supermercado.eroski.es/es/productdetail/23628514-tomate-ecologico-eroski-natur-bio-al-peso-compra-minima-500-g/'
html = requests.get(url_eroski, headers=HEADER).text
soup_eroski = BeautifulSoup(html, 'lxml')


In [10]:
price_eroski_tomate = soup_eroski.find('span', {'class': 'price-product'}).text.split()[0]

In [19]:
unidad = soup_eroski.find('span', {'class': 'quantity-product'}).text.split()[0:2]
unidad = (' ').join(unidad)

In [36]:
def updating_eroski():
    
    #This function creates a tiny dataframe with the product's prices updated and dated every time it runs. Raws will be added to the main database.
    
    update_eroski_dataframe = pd.DataFrame(columns=['id', 'producto', 'tienda', 'precio', 'unidad', 'fecha'])
    tienda = 'Eroski'
    fecha = time.strftime('%A %H:%M %d-%m-%Y')
    
    # Temporary list of product to match id. TO DO: products table
    products_dict = {101: 'tomate ensalada', 102: 'plátano canario', 103: 'manzana royal', 104: 'calabacín', 105: 'zanahoria', 106: 'patata blanca'}
    
    print('Starting to scrape Eroski...')
    
    for product_id, url in eroski_url_dict.items():
        print(url)
        time.sleep(5)
        html = requests.get(url, headers=HEADER).text
        soup_eroski = BeautifulSoup(html, 'lxml')
        price_eroski = soup_eroski.find('span', {'class': 'price-product'})
        
        if price_eroski is None:
        
            try:
                precio = soup_eroski.find('span', {'class': 'offer-now'}).text.strip().replace(',','.')
                unidad = '1 KILO'
                
            except:

                precio = 'No disponible'
                unidad = 'No disponible'
        
        else:
            precio = price_eroski.text.split()[0]
            unidad = soup_eroski.find('span', {'class': 'quantity-product'}).text.split()[0:2]
            unidad = (' ').join(unidad)

        print(product_id, '--', precio, '--', unidad)

        producto = [v for k, v in products_dict.items() if k == product_id]
        
        # Adding new line to the dataframe
        update_eroski_dataframe.loc[len(update_eroski_dataframe)] = [product_id, producto[0], tienda, precio, unidad, fecha]
    
    return update_eroski_dataframe

In [37]:
updating_eroski()

Starting to scrape Eroski...
https://supermercado.eroski.es/es/productdetail/23628514-tomate-ecologico-eroski-natur-bio-al-peso-compra-minima-500-g/
101 -- 3,56 -- 1 KILO
https://supermercado.eroski.es/es/productdetail/23628209-platano-de-canarias-eco-eroski-natur-bio-al-peso-compra-minima-1-kg/
102 -- 2.99 -- 1 KILO
https://supermercado.eroski.es/es/productdetail/23628084-manzana-roja-ecologica-eroski-natur-bio-al-peso-compra-minima-1-kg/
103 -- No disponible -- No disponible
https://supermercado.eroski.es/es/productdetail/23628340-calabacin-ecologico-eroski-natur-bio-al-peso-compra-minima-500-g/
104 -- 1,90 -- 1 KILO
https://supermercado.eroski.es/es/productdetail/23628548-zanahoria-ecologica-eroski-natur-bio-bandeja-600-g/
105 -- 2,48 -- 1 KILO
https://supermercado.eroski.es/es/productdetail/23628431-patata-ecologica-eroski-natur-bio-al-peso-compra-minima-1-kg/
106 -- 2.39 -- 1 KILO


Unnamed: 0,id,producto,tienda,precio,unidad,fecha
0,101,tomate ensalada,Eroski,356,1 KILO,Tuesday 11:49 08-06-2021
1,102,plátano canario,Eroski,2.99,1 KILO,Tuesday 11:49 08-06-2021
2,103,manzana royal,Eroski,No disponible,No disponible,Tuesday 11:49 08-06-2021
3,104,calabacín,Eroski,190,1 KILO,Tuesday 11:49 08-06-2021
4,105,zanahoria,Eroski,248,1 KILO,Tuesday 11:49 08-06-2021
5,106,patata blanca,Eroski,2.39,1 KILO,Tuesday 11:49 08-06-2021
