# Enriquecimiento con Taxi Zones y unificación de Green con Yellow trips

In [1]:
# --- Celda 0: inicialización (ejecutar primero) ---
import os
import requests
import tempfile
from dotenv import load_dotenv

# Maven coordinates Spark has to fetch at startup (bump the versions here if needed).
# The value must be set before pyspark launches the JVM, hence it precedes the pyspark imports.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--packages',
    ','.join([
        'net.snowflake:spark-snowflake_2.12:3.1.2',
        'net.snowflake:snowflake-jdbc:3.24.2',
    ]),
    'pyspark-shell',
])

# Ahora sí importamos pyspark y creamos la sesión
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Avoid multiple sessions: if one already exists in this kernel, stop it and build a clean one.
if 'spark' in globals():
    try:
        spark.stop()
    except Exception as stop_error:
        # Best effort: a stale or broken session must not block re-initialisation,
        # but the failure is reported instead of being silently discarded.
        print("Aviso: no se pudo detener la sesión Spark previa:", stop_error)

conf = SparkConf().setAppName("NYC_TLC_ingest").setMaster("local[*]")
# optional: conf.set("spark.jars.packages", "net.snowflake:spark-snowflake_2.12:3.1.2,net.snowflake:snowflake-jdbc:3.24.2")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Sanity check: print the version of the session that was just created
print("Spark inicializado:", spark.version)



Spark inicializado: 3.5.0


In [2]:
from dotenv import load_dotenv
import os

# Load (and override) environment variables from the project's .env file
load_dotenv("/home/jovyan/work/.env", override=True)

account = os.getenv("SNOWFLAKE_ACCOUNT")

# Prefer an explicit URL; otherwise fall back to the account identifier.
# Either way, make sure the value ends with the Snowflake domain.
sf_url = os.getenv("SNOWFLAKE_URL") or account
if sf_url:
    sf_url = (
        sf_url
        if sf_url.endswith("snowflakecomputing.com")
        else f"{sf_url}.snowflakecomputing.com"
    )

# Fail early when any of the mandatory connection variables is missing or empty
if not (account and os.getenv("SNOWFLAKE_USER") and os.getenv("SNOWFLAKE_PASSWORD")):
    raise RuntimeError("Faltan variables de Snowflake en /home/jovyan/work/.env")

print("sfURL:", sf_url)




sfURL: LKVTWCT-PPC14557.snowflakecomputing.com


In [25]:
import os
from dotenv import load_dotenv
import requests
import tempfile
import logging
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

# --- logging ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- credentials ---
load_dotenv()

account = os.getenv("SNOWFLAKE_ACCOUNT")
user = os.getenv("SNOWFLAKE_USER")
password = os.getenv("SNOWFLAKE_PASSWORD")
warehouse = os.getenv("SNOWFLAKE_WH")
database = os.getenv("SNOWFLAKE_DATABASE")
schema = os.getenv("SNOWFLAKE_SCHEMA", "BRONZE")
role = os.getenv("SNOWFLAKE_ROLE")

# --- validation ---
# The account identifier is checked as well: it is passed to snowflake.connector.connect()
# further down, and a missing value would otherwise only surface as a connection error.
if not all([account, user, password, warehouse, database]):
    raise RuntimeError("Faltan credenciales Snowflake: revisa SNOWFLAKE_ACCOUNT/USER/PASSWORD/WH/DATABASE en .env")

if 'spark' not in globals():
    logger.warning("No se detectó 'spark' en globals(). El flujo usa pandas/write_pandas, no Spark.")

# --- CSV URL ---
url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

# --- temporary file ---
# Created with delete=False and closed immediately so it can be reopened by path;
# the cell is responsible for removing it afterwards.
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
tmp_path = tmp.name
tmp.close()

def download_file(url, temp_path, timeout=30):
    """Stream ``url`` into the file at ``temp_path``.

    Args:
        url: Address of the resource to download.
        temp_path: Destination path; opened in binary write mode (truncated if it exists).
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        True when the download completed, False when ``requests`` raised a
        ``RequestException`` (the error is logged). Other exceptions, such as
        an ``OSError`` while writing, propagate to the caller.
    """
    try:
        # With stream=True the connection stays open until the response is closed,
        # so use it as a context manager to release it on every path.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            resp.raise_for_status()
            with open(temp_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        f.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        logger.error("Error descargando %s: %s", url, e)
        return False

# Bind `conn` before the try block so the `finally` clause can test it even when
# the download, the CSV read or the connection attempt fails.
conn = None
try:
    if not download_file(url, tmp_path):
        raise RuntimeError(f"No se pudo descargar {url}")

    # --- read CSV ---
    df = pd.read_csv(tmp_path, encoding='utf-8')
    # Upper-case the headers so they match the column names of the table created below
    df.columns = [c.strip().upper() for c in df.columns]

    # --- connect to Snowflake ---
    # The connector expects the bare account identifier, without the domain suffix
    account_for_connector = account
    if account and account.endswith(".snowflakecomputing.com"):
        account_for_connector = account.replace(".snowflakecomputing.com", "")

    conn = snowflake.connector.connect(
        user=user,
        password=password,
        account=account_for_connector,
        warehouse=warehouse,
        database=database,
        schema=schema,
        role=role
    )

    # --- create the table if it does not exist ---
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {database}.{schema}.TAXI_ZONES (
        LOCATIONID INTEGER,
        BOROUGH STRING,
        ZONE STRING,
        SERVICE_ZONE STRING
    );
    """
    cs = conn.cursor()
    try:
        logger.info("Verificando/creando tabla TAXI_ZONES...")
        cs.execute(create_table_sql)
    finally:
        # Release the cursor even when the DDL statement fails
        cs.close()

    # --- load data ---
    # overwrite=True replaces the table contents instead of appending, so re-running
    # this cell does not duplicate the lookup rows (duplicates would multiply rows in
    # any join against TAXI_ZONES).
    success, nchunks, nrows, _ = write_pandas(
        conn, df, 'TAXI_ZONES', database=database, schema=schema, overwrite=True
    )
    logger.info("write_pandas success=%s nchunks=%s rows=%s", success, nchunks, nrows)

except Exception as e:
    logger.exception("Error en la ejecución: %s", e)
    raise

finally:
    if conn is not None:
        conn.close()
    # Always try to remove the temporary CSV; a failure here is only logged
    try:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    except Exception as e:
        logger.warning("No se pudo borrar temp file %s: %s", tmp_path, e)


INFO:snowflake.connector.connection:Snowflake Connector for Python Version: 3.18.0, Python Version: 3.11.6, Platform: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35
INFO:snowflake.connector.connection:Connecting to GLOBAL Snowflake domain
INFO:__main__:Verificando/creando tabla TAXI_ZONES...
INFO:__main__:write_pandas success=True nchunks=1 rows=265


In [11]:
# Chunk PySpark: unir, normalizar, enriquecer y escribir a Snowflake - CON SNOWFLAKE CONNECTOR
import os
from dotenv import load_dotenv

load_dotenv()

# --- SF options ---
# Normalise the host the same way the earlier connection-check cell does: the value may
# come from SNOWFLAKE_URL or be a bare account identifier, and the Spark connector needs
# the full host name. The connector-based code below strips the domain again.
_sf_host = os.getenv("SNOWFLAKE_URL") or os.getenv("SNOWFLAKE_ACCOUNT")
if _sf_host and not _sf_host.endswith("snowflakecomputing.com"):
    _sf_host = f"{_sf_host}.snowflakecomputing.com"

sfOptions = {
    "sfURL": _sf_host,
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA", "BRONZE"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WH"),
    "sfRole": os.getenv("SNOWFLAKE_ROLE"),
}

# Fail early instead of interpolating None into the SQL text built from these values
if not all(sfOptions[key] for key in ("sfURL", "sfUser", "sfPassword", "sfDatabase", "sfWarehouse")):
    raise RuntimeError(
        "Faltan variables de Snowflake: revisa SNOWFLAKE_ACCOUNT/URL, USER, PASSWORD, WH y DATABASE en .env"
    )

database = sfOptions["sfDatabase"]
schema = sfOptions["sfSchema"]      # schema the source tables are read from
target_schema = "BRONZE"            # schema the enriched table is written to

# SQL statement that rebuilds ENRICHED_TRIPS in one CTAS:
#   1. green / yellow: project both sources onto the same column list (same order, because
#      the UNION ALL below matches columns by position); columns a source lacks are NULL.
#   2. standardized_trips: add converted timestamps and text descriptions for the coded columns.
#   3. enriched_with_zones: look up pickup and drop-off zones in TAXI_ZONES.
# NOTE(review): CONVERT_TIMEZONE('UTC', 'America/New_York', ...) assumes the stored pickup and
# drop-off timestamps are UTC. If the ingested values are already New York local time this
# shifts every trip by 4-5 hours — confirm against the ingestion step.
# PAYMENT_TYPE code 5 is mapped to 'Unknown' as in the TLC trip-record data dictionary;
# any other unlisted code stays 'Not specified'.
create_table_query = f"""
CREATE OR REPLACE TABLE "{database}"."{target_schema}"."ENRICHED_TRIPS" AS
WITH green AS (
    SELECT
        NULL AS AIRPORT_FEE,
        INGEST_RUN_ID,
        INGEST_TIMESTAMP,
        YEAR,
        MONTH,
        VENDORID,
        PASSENGER_COUNT,
        TRIP_DISTANCE,
        PULOCATIONID,
        DOLOCATIONID,
        RATECODEID,
        STORE_AND_FWD_FLAG,
        PAYMENT_TYPE,
        FARE_AMOUNT,
        TIP_AMOUNT,
        TOTAL_AMOUNT,
        EXTRA,
        MTA_TAX,
        TOLLS_AMOUNT,
        IMPROVEMENT_SURCHARGE,
        CONGESTION_SURCHARGE,
        CBD_CONGESTION_FEE,
        EHAIL_FEE,
        TRIP_TYPE,
        SERVICE AS SERVICE_TYPE,
        LPEP_PICKUP_DATETIME AS PICKUP_DATETIME,
        LPEP_DROPOFF_DATETIME AS DROPOFF_DATETIME
    FROM "{database}"."{schema}"."GREEN_TRIPS"
),

yellow AS (
    SELECT
        AIRPORT_FEE,
        INGEST_RUN_ID,
        INGEST_TIMESTAMP,
        YEAR,
        MONTH,
        VENDORID,
        PASSENGER_COUNT,
        TRIP_DISTANCE,
        PULOCATIONID,
        DOLOCATIONID,
        RATECODEID,
        STORE_AND_FWD_FLAG,
        PAYMENT_TYPE,
        FARE_AMOUNT,
        TIP_AMOUNT,
        TOTAL_AMOUNT,
        EXTRA,
        MTA_TAX,
        TOLLS_AMOUNT,
        IMPROVEMENT_SURCHARGE,
        CONGESTION_SURCHARGE,
        CBD_CONGESTION_FEE,
        NULL AS EHAIL_FEE,
        NULL AS TRIP_TYPE,
        SERVICE AS SERVICE_TYPE,
        TPEP_PICKUP_DATETIME AS PICKUP_DATETIME,
        TPEP_DROPOFF_DATETIME AS DROPOFF_DATETIME
    FROM "{database}"."{schema}"."YELLOW_TRIPS"
),

unioned_trips AS (
    SELECT * FROM green
    UNION ALL
    SELECT * FROM yellow
),

standardized_trips AS (
    SELECT
        *,
        CONVERT_TIMEZONE('UTC', 'America/New_York', PICKUP_DATETIME) AS PICKUP_DATETIME_EST,
        CONVERT_TIMEZONE('UTC', 'America/New_York', DROPOFF_DATETIME) AS DROPOFF_DATETIME_EST,

        CASE
            WHEN PAYMENT_TYPE = 1 THEN 'Credit card'
            WHEN PAYMENT_TYPE = 2 THEN 'Cash'
            WHEN PAYMENT_TYPE = 3 THEN 'No charge'
            WHEN PAYMENT_TYPE = 4 THEN 'Dispute'
            WHEN PAYMENT_TYPE = 5 THEN 'Unknown'
            WHEN PAYMENT_TYPE = 6 THEN 'Voided trip'
            ELSE 'Not specified'
        END AS PAYMENT_TYPE_DESC,

        CASE
            WHEN RATECODEID = 1 THEN 'Standard rate'
            WHEN RATECODEID = 2 THEN 'JFK'
            WHEN RATECODEID = 3 THEN 'Newark'
            WHEN RATECODEID = 4 THEN 'Nassau or Westchester'
            WHEN RATECODEID = 5 THEN 'Negotiated fare'
            WHEN RATECODEID = 6 THEN 'Group ride'
            ELSE 'Unknown'
        END AS RATECODE_DESC,

        CASE
            WHEN UPPER(STORE_AND_FWD_FLAG) = 'Y' THEN 'Yes'
            WHEN UPPER(STORE_AND_FWD_FLAG) = 'N' THEN 'No'
            ELSE 'Unknown'
        END AS STORE_AND_FWD_FLAG_DESC
    FROM unioned_trips
),

enriched_with_zones AS (
    SELECT
        st.*,
        pz.zone AS PICKUP_ZONE,
        pz.borough AS PICKUP_BOROUGH,
        pz.service_zone AS PICKUP_SERVICE_ZONE,
        dz.zone AS DROPOFF_ZONE,
        dz.borough AS DROPOFF_BOROUGH,
        dz.service_zone AS DROPOFF_SERVICE_ZONE
    FROM standardized_trips st
    LEFT JOIN "{database}"."{schema}"."TAXI_ZONES" pz
        ON st.PULOCATIONID = pz.locationid
    LEFT JOIN "{database}"."{schema}"."TAXI_ZONES" dz
        ON st.DOLOCATIONID = dz.locationid
)

SELECT
    INGEST_RUN_ID,
    INGEST_TIMESTAMP,
    SERVICE_TYPE,
    YEAR,
    MONTH,
    PICKUP_DATETIME_EST AS PICKUP_DATETIME,
    DROPOFF_DATETIME_EST AS DROPOFF_DATETIME,
    VENDORID,
    PASSENGER_COUNT,
    TRIP_DISTANCE,
    RATECODEID,
    RATECODE_DESC,
    STORE_AND_FWD_FLAG_DESC,
    PULOCATIONID,
    PICKUP_ZONE,
    PICKUP_BOROUGH,
    PICKUP_SERVICE_ZONE,
    DOLOCATIONID,
    DROPOFF_ZONE,
    DROPOFF_BOROUGH,
    DROPOFF_SERVICE_ZONE,
    PAYMENT_TYPE,
    PAYMENT_TYPE_DESC,
    FARE_AMOUNT,
    TIP_AMOUNT,
    EXTRA,
    MTA_TAX,
    TOLLS_AMOUNT,
    IMPROVEMENT_SURCHARGE,
    CONGESTION_SURCHARGE,
    CBD_CONGESTION_FEE,
    EHAIL_FEE,
    TOTAL_AMOUNT,
    AIRPORT_FEE,
    TRIP_TYPE
FROM enriched_with_zones
"""

def _connect_with_sf_options(options):
    """Open a snowflake-connector-python connection from Spark-style ``sf*`` options.

    The connector wants the bare account identifier, so a trailing
    ``.snowflakecomputing.com`` (and anything after it) is stripped from ``sfURL``.
    Raises ImportError when snowflake-connector-python is not installed.
    """
    import snowflake.connector

    account_id = options['sfURL'] or ""
    if '.snowflakecomputing.com' in account_id:
        account_id = account_id.split('.snowflakecomputing.com')[0]

    return snowflake.connector.connect(
        user=options['sfUser'],
        password=options['sfPassword'],
        account=account_id,
        warehouse=options['sfWarehouse'],
        database=options['sfDatabase'],
        schema=options['sfSchema'],
        role=options['sfRole'],
    )


def _execute_and_fetch_first(connection, sql):
    """Run ``sql`` on ``connection`` and return its first result row, always closing the cursor."""
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        return cursor.fetchone()
    finally:
        cursor.close()


print("Ejecutando creación de tabla usando Snowflake Connector...")

try:
    # Probe for snowflake-connector-python; ImportError triggers the Spark fallback below
    import snowflake.connector

    print("✅ snowflake-connector-python está instalado")

    # Connect directly to Snowflake
    print("Conectando a Snowflake...")
    conn = _connect_with_sf_options(sfOptions)
    try:
        print("✅ Conexión establecida")

        # Run the CTAS statement and show the status row Snowflake returns
        print("Ejecutando consulta CREATE TABLE...")
        result = _execute_and_fetch_first(conn, create_table_query)
        print(f"✅ Consulta ejecutada. Resultado: {result}")
    finally:
        # Close the connection even when the statement fails
        conn.close()

    print("✅ Tabla creada exitosamente usando snowflake-connector-python")

except ImportError:
    print("snowflake-connector-python no está instalado")
    print("Instálalo con: pip install snowflake-connector-python")

    # Fallback: push the statement to Snowflake through the Spark connector
    print("Intentando con Spark SQL como fallback...")
    try:
        # Drop unset options (None values) and make sure sfURL is a full host name,
        # which is what the Spark connector expects.
        spark_options = {key: value for key, value in sfOptions.items() if value is not None}
        sf_host = spark_options.get("sfURL")
        if sf_host and not sf_host.endswith("snowflakecomputing.com"):
            spark_options["sfURL"] = f"{sf_host}.snowflakecomputing.com"

        # spark.sql() only targets Spark's own catalog and cannot run this Snowflake
        # statement; Utils.runQuery sends the DDL to Snowflake itself.
        spark.sparkContext._jvm.net.snowflake.spark.snowflake.Utils.runQuery(
            spark_options, create_table_query
        )
        print("✅ Tabla creada exitosamente con Spark SQL")

        # Verify by reading the new table back
        check_df = (
            spark.read
            .format("snowflake")
            .options(**spark_options)
            .option("dbtable", f"{database}.{target_schema}.ENRICHED_TRIPS")
            .load()
        )

        print(f"✅ Tabla verificada. Registros: {check_df.count()}")
        check_df.show(5)

    except Exception as e:
        print(f"Error con Spark SQL: {str(e)}")

except Exception as e:
    # NOTE(review): the failure is only printed, so later cells keep running without
    # ENRICHED_TRIPS being rebuilt — consider re-raising after the diagnostic.
    print(f"Error con snowflake-connector: {str(e)}")

    # Retry with a much simpler statement to tell connection problems from query problems
    print("Intentando con consulta simplificada para diagnóstico...")

    try:
        conn = _connect_with_sf_options(sfOptions)
        try:
            # Simple test statement (note: it creates ENRICHED_TRIPS_TEST as a side effect)
            test_query = f"""
        CREATE OR REPLACE TABLE "{database}"."{target_schema}"."ENRICHED_TRIPS_TEST" AS
        SELECT 
            'test' as source,
            COUNT(*) as total_records
        FROM "{database}"."{schema}"."GREEN_TRIPS"
        LIMIT 1
        """

            result = _execute_and_fetch_first(conn, test_query)
            print(f"✅ Prueba exitosa. Resultado: {result}")
        finally:
            conn.close()

    except Exception as e2:
        print(f"Error en prueba: {str(e2)}")


Ejecutando creación de tabla usando Snowflake Connector...
✅ snowflake-connector-python está instalado
Conectando a Snowflake...
✅ Conexión establecida
Ejecutando consulta CREATE TABLE...
✅ Consulta ejecutada. Resultado: ('Table ENRICHED_TRIPS successfully created.',)
✅ Tabla creada exitosamente usando snowflake-connector-python
Verificando tabla con Spark...


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 