In [22]:
# Data Libraries
# -----------------------------------------------------------------------
import pandas as pd

# Pandas display configuration
pd.set_option('display.max_columns', None)

# Path configuration for custom module imports
# -----------------------------------------------------------------------
import sys
sys.path.append('../')  # Adds the parent directory to the path for custom module imports

# Importing custom functions
from src.support import bulk_scraping, extract_info_ai, change_id

# Environment variables and API configuration
# -----------------------------------------------------------------------
import os
import dotenv
from openai import OpenAI

# Load environment variables from the .env file *before* reading any of them.
# If the token is defined only in .env, calling os.getenv() first would
# return None on a fresh kernel.
dotenv.load_dotenv()

# OpenAI API setup: the key is read from the 'token' environment variable
OPENAI_API_KEY = os.getenv('token')

True

In [23]:
# Supermarkets and product categories passed to the scraper and used to
# build the lookup tables below
supermarket_list = [
    'mercadona',
    'carrefour',
    'eroski',
    'dia',
    'hipercor',
    'alcampo',
]
category_list = [
    'aceite-de-girasol',
    'aceite-de-oliva',
    'leche',
]

### Load each DataFrame from disk, or create it if the file doesn't exist yet

`Categories`

In [24]:
# Load the category lookup table, or build it from category_list on first run.
try:
    df_category = pd.read_csv('../data/categorias.csv')
except FileNotFoundError:
    # Only a missing file triggers the rebuild. Any other failure (corrupt CSV,
    # permissions, interrupt) now surfaces instead of silently overwriting it.
    # The positional index becomes the category_id column.
    df_category = pd.DataFrame(category_list).reset_index().rename(columns={'index': 'category_id', 0: 'category'})
    df_category.to_csv('../data/categorias.csv', index = False)

`Supermarkets`

In [25]:
# Load the supermarket lookup table, or build it from supermarket_list on first run.
try:
    df_supermarket = pd.read_csv('../data/supermercados.csv')
except FileNotFoundError:
    # Only a missing file triggers the rebuild. Any other failure (corrupt CSV,
    # permissions, interrupt) now surfaces instead of silently overwriting it.
    # The positional index becomes the supermarket_id column.
    df_supermarket = pd.DataFrame(supermarket_list).reset_index().rename(columns={'index': 'supermarket_id', 0: 'supermarket'})
    df_supermarket.to_csv('../data/supermercados.csv', index = False)

`Historic Data`

In [26]:
# Load the price history, or scrape it from scratch when no cached file exists.
try:
    df_historic = pd.read_csv('../data/historial.csv')

except FileNotFoundError:
    # Only a missing file triggers the scrape. With a bare `except`, any read
    # error (or an interrupt) would silently start a full re-scrape and
    # overwrite the existing history.
    df_historic = bulk_scraping(supermarket_list, category_list)
    df_historic.to_csv('../data/historial.csv', index = False)

`Products`

In [27]:
# Load the product catalogue, or derive it from the scraped history on first run.
try:
    df_products = pd.read_csv('../data/products.csv')

except FileNotFoundError:
    # Only a missing file triggers the rebuild. Other read errors surface
    # instead of silently re-running the AI extraction for every product.
    # NOTE: this path needs df_historic to still carry the 'url' and 'product'
    # columns, i.e. before 'url' is dropped further down in the notebook.

    # Get only unique products and urls
    df_products = df_historic[['url', 'product']].drop_duplicates().reset_index(drop=True)

    # Extract additional info from product names using AI
    # (presumably one dict-like record per product; pd.Series expands it into columns)
    result_df = df_products['product'].apply(extract_info_ai)
    result_df = result_df.apply(pd.Series)

    # Concatenate results
    df_products = pd.concat([df_products, result_df], axis = 1)

    # Reset index to product_id column
    df_products = df_products.reset_index().rename(columns={'index': 'product_id'})

    # Save the dataframe
    df_products.to_csv('../data/products.csv', index = False)

In [28]:
# Swap the name columns for their ids and persist the result, but only while
# df_historic is still in its raw form. The 'url' column is dropped at the end
# of this step, so its presence marks a history that has not been processed.
# This explicit check replaces a blanket `try/except: pass`, which hid real
# failures and could leave df_historic half-converted and unsaved.
if 'url' in df_historic.columns:
    # Arrange columns and save the dataframe again
    # (change_id's return value is unused; presumably it edits df_historic in place)
    change_id(df_historic, df_supermarket, 'supermarket')
    change_id(df_historic, df_category, 'category')
    change_id(df_historic, df_products, 'product')
    df_historic.drop(columns='url', inplace=True)

    df_historic.to_csv('../data/historial.csv', index = False)

Quick look at the data

In [29]:
# Preview the first rows of the category lookup table
df_category.head()

Unnamed: 0,category_id,category
0,0,aceite-de-girasol
1,1,aceite-de-oliva
2,2,leche


In [30]:
# Preview the first rows of the supermarket lookup table
df_supermarket.head()

Unnamed: 0,supermarket_id,supermarket
0,0,mercadona
1,1,carrefour
2,2,eroski
3,3,dia
4,4,hipercor


In [34]:
# Preview the first rows of the product catalogue
df_products.head()

Unnamed: 0,product_id,url,product,category,subcategory,brand,volume,weight,details
0,0,https://super.facua.org/mercadona/aceite-de-gi...,"Aceite De Girasol Refinado 0,2º Hacendado 1 L.",aceite_girasol,0.2º,hacendado,1.0,,
1,1,https://super.facua.org/mercadona/aceite-de-gi...,"Aceite De Girasol Refinado 0,2º Hacendado 5 L.",aceite_girasol,0.2º,hacendado,5.0,,refinado
2,2,https://super.facua.org/mercadona/aceite-de-ol...,"Aceite De Oliva 0,4º Hacendado 1 L.",aceite_oliva,0.4º,hacendado,1.0,,
3,3,https://super.facua.org/mercadona/aceite-de-ol...,Aceite De Oliva 1º Hacendado 1 L.,aceite_oliva,1º,hacendado,1.0,,
4,4,https://super.facua.org/mercadona/aceite-de-ol...,Aceite De Oliva Intenso Hacendado 3 L.,aceite_oliva,intenso,hacendado,3.0,,


In [33]:
# Show 5 random rows of the price history (unseeded, so rows differ per run)
df_historic.sample(5)

Unnamed: 0,Date,Price (€),Delta Price,product_id,supermarket_id,category_id
126649,2024-09-01,4.62,0.0,1489,5,2
79995,2024-07-14,54.24,0.0,949,4,1
101762,2024-08-13,1.16,0.0,1229,4,2
93301,2024-10-19,1.66,0.0,1115,4,2
129332,2024-09-27,1.68,0.0,1520,5,2
