In [1]:
!pip install pandas requests SQLAlchemy psycopg2-binary pyspark

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m919.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m843.5 kB/s[0m eta [36m0:00:00[0m:01[0m00:01[0m0m
[?25hInstalling collected packages: py4j, psycopg2-binary
Successfully installed psycopg2-binary-2.9.9 py4j-0.10.9.7


In [2]:
from os import environ as env
from pyspark.sql import SparkSession

In [3]:
# Postgres / JDBC connection settings taken from the environment.
# Each env[...] lookup raises KeyError when the variable is not set.
DRIVER_PATH = env["DRIVER_PATH"]
POSTGRES_HOST = env["POSTGRES_HOST"]
POSTGRES_PORT = env["POSTGRES_PORT"]
POSTGRES_DB = env["POSTGRES_DB"]
POSTGRES_USER = env["POSTGRES_USER"]
POSTGRES_PASSWORD = env["POSTGRES_PASSWORD"]

# JDBC driver class name and the connection URL assembled from the values above.
POSTGRES_DRIVER = "org.postgresql.Driver"
POSTGRES_URL = f"jdbc:postgresql://{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"

In [4]:
# Expose the JDBC driver jar through the environment before the session is built.
env['PYSPARK_SUBMIT_ARGS'] = f'--driver-class-path {DRIVER_PATH} --jars {DRIVER_PATH} pyspark-shell'
env['SPARK_CLASSPATH'] = DRIVER_PATH

# Build the Spark session (or reuse an existing one), registering the same
# driver jar through the session configuration as well.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Spark y Postgres")
    .config("spark.jars", DRIVER_PATH)
    .config("spark.executor.extraClassPath", DRIVER_PATH)
    .getOrCreate()
)

In [5]:
import requests
import json
from pyspark.sql import Row
import datetime

In [6]:
def get_most_relevant_items_for_category(category, timeout=10):
    """
    Fetch the search results of a category from the MercadoLibre API (site MLA).

    Parameters:
    category (str): Category id, sent as the ``category`` query parameter.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    The value stored under the "results" key of the JSON response.

    Raises:
    requests.HTTPError: If the API answers with a 4xx/5xx status code.
    KeyError: If the JSON body has no "results" key.
    """
    # The category goes through `params` so it is URL-encoded by requests.
    # The former "#json" suffix was a URL fragment, which is never sent to the
    # server, so dropping it does not change the request.
    response = requests.get(
        "https://api.mercadolibre.com/sites/MLA/search",
        params={"category": category},
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    # Fail loudly on an HTTP error instead of trying to parse an error payload.
    response.raise_for_status()
    return response.json()["results"]

def clean_string(string) -> str:
    """
    Convert ``string`` to ``str``, drop every space character and strip the ends.

    All ' ' characters are removed, not only leading/trailing ones; the final
    strip then trims any other surrounding whitespace (tabs, newlines, ...).
    """
    # NOTE(review): interior spaces are removed as well, so multi-word values
    # such as titles are squashed together - confirm this is intended.
    text = str(string)
    without_spaces = "".join(text.split(" "))
    return without_spaces.strip()

def load_to_postgres(spark, df, table):
    """
    Write a PySpark DataFrame to a Postgres table over JDBC.

    The write uses mode "overwrite". Any exception raised while writing is
    caught and printed rather than re-raised, so the function returns None
    whether or not the write succeeded.

    Parameters:
    spark: Accepted for the caller's convenience; not used by this function.
    df: The PySpark DataFrame to store (only its ``write`` interface is used).
    table (str): Name of the destination table in Postgres.
    """

    print("Cargar el PySpark DataFrame en Postgres")
    try:
        # JDBC options, applied to the writer in this order.
        jdbc_options = {
            "url": POSTGRES_URL,
            "dbtable": table,
            "user": POSTGRES_USER,
            "password": POSTGRES_PASSWORD,
            "driver": POSTGRES_DRIVER,
        }
        writer = df.write.format("jdbc")
        for key, value in jdbc_options.items():
            writer = writer.option(key, value)
        writer.mode("overwrite").save()

        print("Dataframe subido")
    except Exception as e:
        # Best-effort load: report the failure and carry on.
        print("Se produjo excepción:", e)

def main(category="MLA1577", table="tecnica_ml"):
    """
    Fetch the listing of a category, show it and load it into Postgres.

    Each result item is turned into a row with its id, title, price (as float),
    thumbnail and today's date; the rows become a Spark DataFrame that is
    displayed and then handed to ``load_to_postgres``.

    Parameters:
    category (str): Category id to query. Defaults to "MLA1577".
    table (str): Destination table name. Defaults to "tecnica_ml".

    Returns:
    None. When the API returns no items nothing is created or loaded.
    """
    data = get_most_relevant_items_for_category(category)
    if not data:
        # spark.createDataFrame cannot infer a schema from an empty list, so
        # stop here instead of failing further down.
        print("No se obtuvieron resultados; no se carga nada en Postgres")
        return

    # One date string, computed once, stamps every row of this run.
    create_date = str(datetime.date.today())
    rows = [
        Row(
            id=clean_string(item['id']),
            title=clean_string(item['title']),
            price=float(item['price']),
            thumbnail=clean_string(item['thumbnail']),
            create_date=create_date,
        )
        for item in data
    ]
    df = spark.createDataFrame(rows)
    df.show()
    load_to_postgres(spark, df, table)

In [7]:
# Run the pipeline: fetch the category listing, show it and load it into Postgres.
main()

+-------------+--------------------+---------+--------------------+-----------+
|           id|               title|    price|           thumbnail|create_date|
+-------------+--------------------+---------+--------------------+-----------+
|MLA1288020660|MicroondasGrillAt...| 104999.0|http://http2.mlst...| 2023-12-10|
|MLA1367525406|MicroondasBghQuic...| 139999.0|http://http2.mlst...| 2023-12-10|
|MLA1367709254|MicroondasBghEcoN...| 162349.0|http://http2.mlst...| 2023-12-10|
|MLA1134559453|MicroondasGrillAt...| 104999.0|http://http2.mlst...| 2023-12-10|
|MLA1551331924|MicroondasGrillAt...| 119999.0|http://http2.mlst...| 2023-12-10|
| MLA932432342|MicroondasBghB120...| 135999.0|http://http2.mlst...| 2023-12-10|
|MLA1142420159|MicroondasAtmaEas...|105449.05|http://http2.mlst...| 2023-12-10|
|MLA1381721041|MicroondasRcaRw20...| 134062.0|http://http2.mlst...| 2023-12-10|
|MLA1498154454|MicroondasDigital...| 136099.0|http://http2.mlst...| 2023-12-10|
|MLA1137532087|MicroondasBghEcoN...| 159