In [6]:
# Data Libraries
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Pandas display configuration
pd.set_option('display.max_columns', None)

# Path configuration for custom module imports
# -----------------------------------------------------------------------
import sys
sys.path.append('../')  # Adds the parent directory to the path for custom module imports

# Importing custom functions
from src.support import bulk_scraping, extract_info_ai

# Environment variables and API configuration
# -----------------------------------------------------------------------
import os
import dotenv
from openai import OpenAI

# Load environment variables from .env file
# This has to run before any os.getenv() lookup below; otherwise values that
# are only defined in .env are not yet visible on a fresh kernel.
dotenv.load_dotenv()

# OpenAI API setup
# The key is read from the environment variable named 'token'
# (os.getenv returns None when that variable is not set).
OPENAI_API_KEY = os.getenv('token')


True

In [7]:
# Supermarket names; later passed to bulk_scraping and stored in supermercados.csv.
supermarket_list = [
    'mercadona',
    'carrefour',
    'eroski',
    'dia',
    'hipercor',
    'alcampo',
]

# Product category slugs (sunflower oil, olive oil, milk); later passed to
# bulk_scraping and stored in categorias.csv.
category_list = [
    'aceite-de-girasol',
    'aceite-de-oliva',
    'leche',
]

In [8]:
def _load_or_create_lookup(csv_path, values, id_column, name_column):
    """Return the lookup table stored at ``csv_path``, creating it if missing.

    Parameters
    ----------
    csv_path : str
        Location of the cached CSV file.
    values : list
        Names used to build the table when no cached file exists.
    id_column : str
        Name given to the positional-id column (0, 1, 2, ...).
    name_column : str
        Name given to the column holding ``values``.

    Returns
    -------
    pandas.DataFrame
        The cached table if the file exists, otherwise a freshly built table
        with columns ``id_column`` and ``name_column`` (also written to
        ``csv_path``).

    Raises
    ------
    Any error other than a missing file (e.g. a corrupt or empty CSV) is
    propagated instead of silently overwriting the existing file.
    """
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        # First run: no cache yet. The row position of each value becomes its id.
        lookup = (
            pd.DataFrame(values)
            .reset_index()
            .rename(columns={'index': id_column, 0: name_column})
        )
        lookup.to_csv(csv_path, index=False)
        return lookup


# Category lookup table (category_id -> category).
df_category = _load_or_create_lookup(
    '../data/categorias.csv', category_list, 'category_id', 'category'
)

# Supermarket lookup table (supermarket_id -> supermarket).
df_supermarket = _load_or_create_lookup(
    '../data/supermercados.csv', supermarket_list, 'supermarket_id', 'supermarket'
)

In [9]:
# Load the cached scraping history; scrape only when no cache file exists yet.
try:
    df_historic = pd.read_csv('../data/historial.csv')
except FileNotFoundError:
    # Only a missing cache triggers a scrape. Other failures (corrupt CSV,
    # a manual kernel interrupt, ...) now surface instead of silently
    # re-scraping and overwriting the history file.
    df_historic = bulk_scraping(supermarket_list, category_list)
    df_historic.to_csv('../data/historial.csv', index=False)

In [10]:
# Load the cached product table; build it from the history only when the
# cache file does not exist yet.
try:
    df_products = pd.read_csv('../data/products.csv')
except FileNotFoundError:
    # Only a missing cache triggers the rebuild. Other read errors are
    # propagated rather than silently re-running extract_info_ai on every
    # product and overwriting the file.

    # One row per distinct (url, product) pair. The index is reset so it lines
    # up row-for-row with the extraction results in the concat below.
    df_products = (
        df_historic[['url', 'product']]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # extract_info_ai is applied to each product string; its result is expanded
    # into columns via pd.Series.
    # NOTE(review): presumably returns a dict-like per product - confirm in src.support.
    result_df = df_products['product'].apply(extract_info_ai)
    result_df = result_df.apply(pd.Series)

    # Concatenate results
    df_products = pd.concat([df_products, result_df], axis=1)

    # Save the dataframe
    df_products.to_csv('../data/products.csv', index=False)