# Construcción de la tabla OBT

In [5]:
# --- Celda 0: inicialización (ejecutar primero) ---
import os
import requests
import tempfile
from dotenv import load_dotenv

# Maven coordinates Spark should load via --packages (adjust the versions if needed).
# Set before the Spark session is created further down in this cell.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--packages',
    ','.join((
        'net.snowflake:spark-snowflake_2.12:3.1.2',
        'net.snowflake:snowflake-jdbc:3.24.2',
    )),
    'pyspark-shell',
))

# Ahora sí importamos pyspark y creamos la sesión
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Avoid multiple sessions: if a previous run of this cell left `spark` in the
# kernel namespace, stop it before building a fresh one.
if 'spark' in globals():
    try:
        spark.stop()
    except Exception as stop_error:
        # Best-effort cleanup: keep going, but report the failure instead of
        # hiding it so a half-dead previous session is visible to the reader.
        print(f"Aviso: no se pudo detener la sesión Spark previa: {stop_error}")

# Local-mode configuration using every available core.
conf = SparkConf().setAppName("NYC_TLC_ingest").setMaster("local[*]")
# optional: conf.set("spark.jars.packages", "net.snowflake:spark-snowflake_2.12:3.1.2,net.snowflake:snowflake-jdbc:3.24.2")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Sanity check: show the version of the running session.
print("Spark inicializado:", spark.version)



Spark inicializado: 3.5.0


In [19]:
from dotenv import load_dotenv
import os

# Single source of truth for the credentials file, reused in the error message.
ENV_PATH = "/home/jovyan/work/.env"

# override=True: values from the file replace any already present in the process.
load_dotenv(ENV_PATH, override=True)

account = os.getenv("SNOWFLAKE_ACCOUNT")

# Fail fast, and say exactly which variables are absent or empty, before
# deriving anything from them.
required_vars = ("SNOWFLAKE_ACCOUNT", "SNOWFLAKE_USER", "SNOWFLAKE_PASSWORD")
missing_vars = [name for name in required_vars if not os.getenv(name)]
if missing_vars:
    raise RuntimeError(
        f"Faltan variables de Snowflake en {ENV_PATH}: {', '.join(missing_vars)}"
    )

# Prefer an explicit SNOWFLAKE_URL; otherwise fall back to the account
# identifier. Either way, append the Snowflake domain if it is not there yet.
sf_url = os.getenv("SNOWFLAKE_URL") or account
if not sf_url.endswith("snowflakecomputing.com"):
    sf_url = f"{sf_url}.snowflakecomputing.com"

print("sfURL:", sf_url)




sfURL: LKVTWCT-PPC14557.snowflakecomputing.com


In [29]:
# Chunk PySpark: crear tabla OBT_TRIPS en Snowflake - CON SNOWFLAKE CONNECTOR
import os
from dotenv import load_dotenv

# Called without a path, so python-dotenv decides which .env file to read.
load_dotenv()

# --- Snowflake connection options, all taken from environment variables ---
sfOptions = dict(
    # Explicit URL wins; the account identifier is the fallback.
    sfURL=os.getenv("SNOWFLAKE_URL") or os.getenv("SNOWFLAKE_ACCOUNT"),
    sfUser=os.getenv("SNOWFLAKE_USER"),
    sfPassword=os.getenv("SNOWFLAKE_PASSWORD"),
    sfDatabase=os.getenv("SNOWFLAKE_DATABASE"),
    # Schema defaults to BRONZE when SNOWFLAKE_SCHEMA is not set.
    sfSchema=os.getenv("SNOWFLAKE_SCHEMA", "BRONZE"),
    sfWarehouse=os.getenv("SNOWFLAKE_WH"),
    sfRole=os.getenv("SNOWFLAKE_ROLE"),
)

# source_schema is the schema that holds ENRICHED_TRIPS; the OBT table is
# written to target_schema.
database, source_schema = sfOptions["sfDatabase"], sfOptions["sfSchema"]
target_schema = "ANALYTICS"

# SQL statement that creates OBT_TRIPS (corrected version).
#
# CREATE OR REPLACE rebuilds "<database>"."<target_schema>"."OBT_TRIPS" from
# "<database>"."<source_schema>"."ENRICHED_TRIPS" on every run, so any existing
# OBT_TRIPS table is replaced. The database and schema names are interpolated
# inside double quotes; NOTE(review): Snowflake treats quoted identifiers as
# case-sensitive, so the environment values must match the stored names exactly.
#
# Besides the pass-through columns, three derived columns are added:
#   TRIP_DURATION_MIN - pickup-to-dropoff seconds divided by 60.
#   AVG_SPEED_MPH     - TRIP_DISTANCE * 3600 / duration in seconds
#                       (presumably TRIP_DISTANCE is in miles - confirm).
#   TIP_PCT           - TIP_AMOUNT / FARE_AMOUNT * 100; NULL when FARE_AMOUNT
#                       is NULL or 0, which avoids a division by zero.
#
# The WHERE clause keeps only rows with a strictly positive duration, so the
# "> 0" guard and the NULLIF inside AVG_SPEED_MPH are purely defensive.
create_obt_table_query = f"""
CREATE OR REPLACE TABLE "{database}"."{target_schema}"."OBT_TRIPS" AS
SELECT
  INGEST_RUN_ID,
  INGEST_TIMESTAMP,
  SERVICE_TYPE,
  YEAR,
  MONTH,
  PICKUP_DATETIME,
  DROPOFF_DATETIME,
  VENDORID,
  PASSENGER_COUNT,
  TRIP_DISTANCE,
  RATECODEID,
  RATECODE_DESC,
  STORE_AND_FWD_FLAG_DESC,
  PULOCATIONID,
  PICKUP_ZONE,
  PICKUP_BOROUGH,
  PICKUP_SERVICE_ZONE,
  DOLOCATIONID,
  DROPOFF_ZONE,
  DROPOFF_BOROUGH,
  DROPOFF_SERVICE_ZONE,
  PAYMENT_TYPE,
  PAYMENT_TYPE_DESC,
  FARE_AMOUNT,
  TIP_AMOUNT,
  EXTRA,
  MTA_TAX,
  TOLLS_AMOUNT,
  IMPROVEMENT_SURCHARGE,
  CONGESTION_SURCHARGE,
  CBD_CONGESTION_FEE,
  EHAIL_FEE,
  TOTAL_AMOUNT,
  AIRPORT_FEE,
  TRIP_TYPE,
  DATEDIFF('second', PICKUP_DATETIME, DROPOFF_DATETIME) / 60.0 AS TRIP_DURATION_MIN,
  CASE
    WHEN DATEDIFF('second', PICKUP_DATETIME, DROPOFF_DATETIME) > 0
         AND TRIP_DISTANCE IS NOT NULL
    THEN TRIP_DISTANCE * 3600.0 / NULLIF(DATEDIFF('second', PICKUP_DATETIME, DROPOFF_DATETIME), 0)
    ELSE NULL
  END AS AVG_SPEED_MPH,
  CASE
    WHEN FARE_AMOUNT IS NOT NULL AND FARE_AMOUNT <> 0
    THEN TIP_AMOUNT / FARE_AMOUNT * 100.0
    ELSE NULL
  END AS TIP_PCT
FROM "{database}"."{source_schema}"."ENRICHED_TRIPS"
WHERE DATEDIFF('second', PICKUP_DATETIME, DROPOFF_DATETIME) > 0
"""

# Run the CREATE OR REPLACE TABLE statement through snowflake-connector-python.
# The cursor and connection are always closed, and any failure is reported and
# then re-raised so that "Run All" stops instead of continuing without the table.
print("Ejecutando creación de tabla OBT_TRIPS usando Snowflake Connector...")

try:
    # Imported here so a missing driver produces the install hint below.
    import snowflake.connector

    print("✅ snowflake-connector-python está instalado")

    # The connector wants the bare account identifier, i.e. whatever precedes
    # ".snowflakecomputing.com" in the URL.
    account = sfOptions['sfURL']
    if not account:
        # Without this guard a missing value surfaces as an opaque TypeError.
        raise RuntimeError("Falta SNOWFLAKE_URL o SNOWFLAKE_ACCOUNT en el entorno")
    if '.snowflakecomputing.com' in account:
        account = account.split('.snowflakecomputing.com')[0]

    print("Conectando a Snowflake...")
    conn = snowflake.connector.connect(
        user=sfOptions['sfUser'],
        password=sfOptions['sfPassword'],
        account=account,
        warehouse=sfOptions['sfWarehouse'],
        database=sfOptions['sfDatabase'],
        schema=sfOptions['sfSchema'],
        role=sfOptions['sfRole']
    )
    try:
        print("✅ Conexión establecida")

        cursor = conn.cursor()
        try:
            print("Ejecutando consulta CREATE TABLE para OBT_TRIPS...")
            cursor.execute(create_obt_table_query)

            # The first result row carries Snowflake's status message.
            result = cursor.fetchone()
            print(f"✅ Consulta ejecutada. Resultado: {result}")
        finally:
            # Release the cursor even if execute/fetch failed.
            cursor.close()
    finally:
        # Release the session even if anything above failed.
        conn.close()

    print("✅ Tabla OBT_TRIPS creada exitosamente en el esquema ANALYTICS")

except ImportError:
    print("❌ snowflake-connector-python no está instalado")
    print("Instálalo con: pip install snowflake-connector-python")
    raise
except Exception as e:
    print(f"❌ Error durante la ejecución: {str(e)}")
    # Re-raise: a swallowed failure would let later cells run against a
    # missing or stale OBT_TRIPS table.
    raise

Ejecutando creación de tabla OBT_TRIPS usando Snowflake Connector...
✅ snowflake-connector-python está instalado
Conectando a Snowflake...
✅ Conexión establecida
Ejecutando consulta CREATE TABLE para OBT_TRIPS...
✅ Consulta ejecutada. Resultado: ('Table OBT_TRIPS successfully created.',)
✅ Tabla OBT_TRIPS creada exitosamente en el esquema ANALYTICS
