# Data Analysis

In [1]:
# --- Celda 0: inicialización (ejecutar primero) ---
import os
import requests
import tempfile
from dotenv import load_dotenv

# Paquetes que Spark debe cargar (ajusta versiones si es necesario)
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages net.snowflake:spark-snowflake_2.12:3.1.2,'
    'net.snowflake:snowflake-jdbc:3.24.2 pyspark-shell'
)

# Ahora sí importamos pyspark y creamos la sesión
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Evita sesiones múltiples: si existe, detenla y crea una nueva limpia
if 'spark' in globals():
    try:
        spark.stop()
    except Exception:
        pass

conf = SparkConf().setAppName("NYC_TLC_ingest").setMaster("local[*]")
# opcional: conf.set("spark.jars.packages", "net.snowflake:spark-snowflake_2.12:3.1.2,net.snowflake:snowflake-jdbc:3.24.2")

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Comprueba
print("Spark inicializado:", spark.version)



Spark inicializado: 3.5.0


In [2]:
from dotenv import load_dotenv
import os

load_dotenv("/home/jovyan/work/.env", override=True)

account = os.getenv("SNOWFLAKE_ACCOUNT")
sf_url = os.getenv("SNOWFLAKE_URL") or account
if sf_url and not sf_url.endswith("snowflakecomputing.com"):
    sf_url = f"{sf_url}.snowflakecomputing.com"

if not all([account, os.getenv("SNOWFLAKE_USER"), os.getenv("SNOWFLAKE_PASSWORD")]):
    raise RuntimeError("Faltan variables de Snowflake en /home/jovyan/work/.env")

print("sfURL:", sf_url)




sfURL: LKVTWCT-PPC14557.snowflakecomputing.com


In [23]:
# Chunk PySpark: Análisis completo de datos OBT_TRIPS - Exportación simple a Excel
import os
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime

load_dotenv()

# --- SF options ---
sfOptions = {
    "sfURL": os.getenv("SNOWFLAKE_URL") or os.getenv("SNOWFLAKE_ACCOUNT"),
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA", "BRONZE"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WH"),
    "sfRole": os.getenv("SNOWFLAKE_ROLE"),
}

database = sfOptions["sfDatabase"]
target_schema = "ANALYTICS"

# Consultas de análisis (las mismas que tenías)
analysis_queries = {
    "a_top_10_pickup_zonas": f"""
    SELECT 
        PICKUP_ZONE,
        PICKUP_BOROUGH,
        YEAR,
        MONTH,
        COUNT(*) as volumen_viajes,
        RANK() OVER (PARTITION BY YEAR, MONTH ORDER BY COUNT(*) DESC) as ranking_mensual
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_ZONE IS NOT NULL
    GROUP BY PICKUP_ZONE, PICKUP_BOROUGH, YEAR, MONTH
    QUALIFY ranking_mensual <= 10
    ORDER BY YEAR, MONTH, ranking_mensual
    """,
    
    "b_top_10_dropoff_zonas": f"""
    SELECT 
        DROPOFF_ZONE,
        DROPOFF_BOROUGH,
        YEAR,
        MONTH,
        COUNT(*) as volumen_viajes,
        RANK() OVER (PARTITION BY YEAR, MONTH ORDER BY COUNT(*) DESC) as ranking_mensual
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE DROPOFF_ZONE IS NOT NULL
    GROUP BY DROPOFF_ZONE, DROPOFF_BOROUGH, YEAR, MONTH
    QUALIFY ranking_mensual <= 10
    ORDER BY YEAR, MONTH, ranking_mensual
    """,
    
    "c_evolucion_mensual_borough": f"""
    SELECT 
        PICKUP_BOROUGH,
        YEAR,
        MONTH,
        SUM(TOTAL_AMOUNT) as total_amount,
        AVG(TIP_PCT) as avg_tip_pct,
        COUNT(*) as total_viajes,
        SUM(TOTAL_AMOUNT) / COUNT(*) as ticket_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_BOROUGH IS NOT NULL
    GROUP BY PICKUP_BOROUGH, YEAR, MONTH
    ORDER BY PICKUP_BOROUGH, YEAR, MONTH
    """,
    
    "d_ticket_promedio_service_mes": f"""
    SELECT 
        SERVICE_TYPE,
        YEAR,
        MONTH,
        AVG(TOTAL_AMOUNT) as avg_total_amount,
        COUNT(*) as total_viajes,
        SUM(TOTAL_AMOUNT) as ingreso_total
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    GROUP BY SERVICE_TYPE, YEAR, MONTH
    ORDER BY SERVICE_TYPE, YEAR, MONTH
    """,
    
    "e_viajes_hora_dia_semana": f"""
    SELECT 
        DAYNAME(PICKUP_DATETIME) as dia_semana,
        HOUR(PICKUP_DATETIME) as hora_dia,
        COUNT(*) as total_viajes,
        AVG(TOTAL_AMOUNT) as ticket_promedio,
        AVG(TRIP_DURATION_MIN) as duracion_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    GROUP BY dia_semana, hora_dia
    ORDER BY 
        CASE dia_semana
            WHEN 'Monday' THEN 1
            WHEN 'Tuesday' THEN 2
            WHEN 'Wednesday' THEN 3
            WHEN 'Thursday' THEN 4
            WHEN 'Friday' THEN 5
            WHEN 'Saturday' THEN 6
            WHEN 'Sunday' THEN 7
        END,
        hora_dia
    """,
    
    "f_p50_p90_duracion_borough": f"""
    SELECT 
        PICKUP_BOROUGH,
        COUNT(*) as total_viajes,
        MEDIAN(TRIP_DURATION_MIN) as p50_duracion_min,
        APPROX_PERCENTILE(0.9) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as p90_duracion_min,
        AVG(TRIP_DURATION_MIN) as duracion_promedio,
        COUNT(CASE WHEN TRIP_DURATION_MIN > 60 THEN 1 END) as viajes_mas_1_hora
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_BOROUGH IS NOT NULL 
    AND TRIP_DURATION_MIN > 0 
    AND TRIP_DURATION_MIN < 180  -- Excluir outliers extremos
    GROUP BY PICKUP_BOROUGH
    ORDER BY total_viajes DESC
    """,

    "g_velocidad_franja_horaria": f"""
    SELECT 
        PICKUP_BOROUGH,
        CASE 
            WHEN HOUR(PICKUP_DATETIME) BETWEEN 6 AND 9 THEN 'Mañana 6-9'
            WHEN HOUR(PICKUP_DATETIME) BETWEEN 17 AND 20 THEN 'Tarde 17-20'
            ELSE 'Otra franja'
        END as franja_horaria,
        COUNT(*) as total_viajes,
        AVG(AVG_SPEED_MPH) as velocidad_promedio_mph,
        AVG(TRIP_DURATION_MIN) as duracion_promedio_min
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_BOROUGH IS NOT NULL AND AVG_SPEED_MPH > 0 AND AVG_SPEED_MPH < 100
    GROUP BY PICKUP_BOROUGH, franja_horaria
    HAVING franja_horaria != 'Otra franja'
    ORDER BY PICKUP_BOROUGH, franja_horaria
    """,
    
    "h_participacion_payment_type": f"""
    SELECT 
        PAYMENT_TYPE_DESC,
        COUNT(*) as total_viajes,
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () as participacion_pct,
        AVG(TIP_PCT) as tip_pct_promedio,
        AVG(TIP_AMOUNT) as tip_amount_promedio,
        AVG(TOTAL_AMOUNT) as ticket_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PAYMENT_TYPE_DESC IS NOT NULL
    GROUP BY PAYMENT_TYPE_DESC
    ORDER BY total_viajes DESC
    """,
    
    "i_rate_code_metrics": f"""
    SELECT 
        RATECODE_DESC,
        COUNT(*) as total_viajes,
        AVG(TRIP_DISTANCE) as distancia_promedio_millas,
        AVG(TOTAL_AMOUNT) as ticket_promedio,
        SUM(TRIP_DISTANCE) as distancia_total_millas,
        SUM(TOTAL_AMOUNT) as ingreso_total
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE RATECODE_DESC IS NOT NULL
    GROUP BY RATECODE_DESC
    ORDER BY ingreso_total DESC
    """,
    
    "j_mix_yellow_green": f"""
    SELECT 
        PICKUP_BOROUGH,
        YEAR,
        MONTH,
        SERVICE_TYPE,
        COUNT(*) as total_viajes,
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY PICKUP_BOROUGH, YEAR, MONTH) as participacion_pct
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_BOROUGH IS NOT NULL AND SERVICE_TYPE IN ('yellow', 'green')
    GROUP BY PICKUP_BOROUGH, YEAR, MONTH, SERVICE_TYPE
    ORDER BY PICKUP_BOROUGH, YEAR, MONTH, SERVICE_TYPE
    """,
    
    "k_top_20_flujos_pu_do": f"""
    SELECT 
        PICKUP_ZONE,
        DROPOFF_ZONE,
        COUNT(*) as volumen_viajes,
        AVG(TOTAL_AMOUNT) as ticket_promedio,
        AVG(TRIP_DURATION_MIN) as duracion_promedio,
        AVG(TRIP_DISTANCE) as distancia_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_ZONE IS NOT NULL AND DROPOFF_ZONE IS NOT NULL
    GROUP BY PICKUP_ZONE, DROPOFF_ZONE
    ORDER BY volumen_viajes DESC
    LIMIT 20
    """,
    
    "l_distribucion_passenger_count": f"""
    SELECT 
        PASSENGER_COUNT,
        COUNT(*) as total_viajes,
        COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () as distribucion_pct,
        AVG(TOTAL_AMOUNT) as ticket_promedio,
        AVG(TRIP_DISTANCE) as distancia_promedio,
        AVG(TRIP_DURATION_MIN) as duracion_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PASSENGER_COUNT IS NOT NULL AND PASSENGER_COUNT > 0
    GROUP BY PASSENGER_COUNT
    ORDER BY PASSENGER_COUNT
    """,
    
    "m_impacto_tolls_congestion": f"""
    SELECT 
        PICKUP_ZONE,
        COUNT(*) as total_viajes,
        AVG(TOLLS_AMOUNT) as peaje_promedio,
        AVG(CONGESTION_SURCHARGE) as congestion_promedio,
        AVG(TOLLS_AMOUNT + CONGESTION_SURCHARGE) as recargos_totales_promedio,
        (AVG(TOLLS_AMOUNT) + AVG(CONGESTION_SURCHARGE)) * 100.0 / AVG(TOTAL_AMOUNT) as pct_del_total
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_ZONE IS NOT NULL
    GROUP BY PICKUP_ZONE
    HAVING total_viajes > 100
    ORDER BY recargos_totales_promedio DESC
    LIMIT 20
    """,
    
    "n_proporcion_viajes_cortos_largos": f"""
    WITH estadisticas AS (
        SELECT 
            PICKUP_BOROUGH,
            MONTH,
            PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as umbral_largo
        FROM "{database}"."{target_schema}"."OBT_TRIPS"
        WHERE PICKUP_BOROUGH IS NOT NULL
        GROUP BY PICKUP_BOROUGH, MONTH
    )
    SELECT 
        t.PICKUP_BOROUGH,
        t.MONTH,
        COUNT(*) as total_viajes,
        SUM(CASE WHEN t.TRIP_DURATION_MIN <= 15 THEN 1 ELSE 0 END) as viajes_cortos,
        SUM(CASE WHEN t.TRIP_DURATION_MIN > 15 AND t.TRIP_DURATION_MIN <= e.umbral_largo THEN 1 ELSE 0 END) as viajes_medios,
        SUM(CASE WHEN t.TRIP_DURATION_MIN > e.umbral_largo THEN 1 ELSE 0 END) as viajes_largos
    FROM "{database}"."{target_schema}"."OBT_TRIPS" t
    JOIN estadisticas e ON t.PICKUP_BOROUGH = e.PICKUP_BOROUGH AND t.MONTH = e.MONTH
    WHERE t.PICKUP_BOROUGH IS NOT NULL
    GROUP BY t.PICKUP_BOROUGH, t.MONTH
    ORDER BY t.PICKUP_BOROUGH, t.MONTH
    """,
    
    "o_diferencias_vendor": f"""
    SELECT 
        VENDORID,
        COUNT(*) as total_viajes,
        AVG(AVG_SPEED_MPH) as velocidad_promedio_mph,
        AVG(TRIP_DURATION_MIN) as duracion_promedio_min,
        AVG(TRIP_DISTANCE) as distancia_promedio_millas,
        AVG(TOTAL_AMOUNT) as ticket_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE VENDORID IS NOT NULL
    GROUP BY VENDORID
    HAVING total_viajes > 1000
    ORDER BY total_viajes DESC
    """,
    
    "p_relacion_pago_tip_hora": f"""
    SELECT 
        PAYMENT_TYPE_DESC,
        HOUR(PICKUP_DATETIME) as hora_dia,
        COUNT(*) as total_viajes,
        AVG(TIP_AMOUNT) as tip_promedio,
        AVG(TIP_PCT) as tip_pct_promedio,
        AVG(TOTAL_AMOUNT) as ticket_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PAYMENT_TYPE_DESC IS NOT NULL
    GROUP BY PAYMENT_TYPE_DESC, hora_dia
    ORDER BY PAYMENT_TYPE_DESC, hora_dia
    """,
    
    "q_zonas_outliers_duracion_distancia": f"""
    WITH percentiles_zonas AS (
        SELECT 
            PICKUP_ZONE,
            PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as p99_duracion,
            PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY TRIP_DISTANCE) as p99_distancia
        FROM "{database}"."{target_schema}"."OBT_TRIPS"
        WHERE PICKUP_ZONE IS NOT NULL
        GROUP BY PICKUP_ZONE
        HAVING COUNT(*) > 50
    )
    SELECT 
        t.PICKUP_ZONE,
        COUNT(*) as viajes_outliers,
        SUM(CASE WHEN t.TRIP_DURATION_MIN > p.p99_duracion THEN 1 ELSE 0 END) as outliers_duracion,
        SUM(CASE WHEN t.TRIP_DISTANCE > p.p99_distancia THEN 1 ELSE 0 END) as outliers_distancia,
        AVG(CASE WHEN t.TRIP_DURATION_MIN > p.p99_duracion THEN t.TRIP_DURATION_MIN END) as duracion_promedio_outliers,
        AVG(CASE WHEN t.TRIP_DISTANCE > p.p99_distancia THEN t.TRIP_DISTANCE END) as distancia_promedio_outliers
    FROM "{database}"."{target_schema}"."OBT_TRIPS" t
    JOIN percentiles_zonas p ON t.PICKUP_ZONE = p.PICKUP_ZONE
    WHERE t.TRIP_DURATION_MIN > p.p99_duracion OR t.TRIP_DISTANCE > p.p99_distancia
    GROUP BY t.PICKUP_ZONE
    ORDER BY viajes_outliers DESC
    LIMIT 20
    """,
    
    "r_yield_por_milla": f"""
    SELECT 
        PICKUP_BOROUGH,
        HOUR(PICKUP_DATETIME) as hora_dia,
        COUNT(*) as total_viajes,
        AVG(TOTAL_AMOUNT / NULLIF(TRIP_DISTANCE, 0)) as yield_por_milla,
        AVG(TOTAL_AMOUNT) as ticket_promedio,
        AVG(TRIP_DISTANCE) as distancia_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS"
    WHERE PICKUP_BOROUGH IS NOT NULL AND TRIP_DISTANCE > 0
    GROUP BY PICKUP_BOROUGH, hora_dia
    ORDER BY PICKUP_BOROUGH, hora_dia
    """,
    
    "s_cambios_yoy": f"""
    WITH datos_anuales AS (
        SELECT 
            SERVICE_TYPE,
            YEAR,
            COUNT(*) as volumen_viajes,
            AVG(TOTAL_AMOUNT) as ticket_promedio,
            SUM(TOTAL_AMOUNT) as ingreso_total
        FROM "{database}"."{target_schema}"."OBT_TRIPS"
        GROUP BY SERVICE_TYPE, YEAR
    )
    SELECT 
        SERVICE_TYPE,
        YEAR,
        volumen_viajes,
        ticket_promedio,
        ingreso_total,
        LAG(volumen_viajes) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR) as volumen_anio_anterior,
        LAG(ticket_promedio) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR) as ticket_anio_anterior,
        CASE 
            WHEN LAG(volumen_viajes) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR) IS NOT NULL 
            THEN (volumen_viajes - LAG(volumen_viajes) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR)) * 100.0 / LAG(volumen_viajes) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR)
            ELSE NULL
        END as crecimiento_volumen_pct,
        CASE 
            WHEN LAG(ticket_promedio) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR) IS NOT NULL 
            THEN (ticket_promedio - LAG(ticket_promedio) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR)) * 100.0 / LAG(ticket_promedio) OVER (PARTITION BY SERVICE_TYPE ORDER BY YEAR)
            ELSE NULL
        END as crecimiento_ticket_pct
    FROM datos_anuales
    ORDER BY SERVICE_TYPE, YEAR
    """,
    
    "t_impacto_alta_congestion": f"""
    WITH congestion_diaria AS (
        SELECT 
            DATE(PICKUP_DATETIME) as fecha,
            AVG(CONGESTION_SURCHARGE) as congestion_promedio,
            PERCENTILE_CONT(0.75) OVER (ORDER BY AVG(CONGESTION_SURCHARGE)) as p75_congestion
        FROM "{database}"."{target_schema}"."OBT_TRIPS"
        GROUP BY fecha
    )
    SELECT 
        CASE 
            WHEN c.congestion_promedio > c.p75_congestion THEN 'Alta Congestion'
            ELSE 'Normal'
        END as tipo_dia,
        COUNT(DISTINCT t.PICKUP_DATETIME::DATE) as dias_analizados,
        COUNT(*) as total_viajes,
        AVG(t.TOTAL_AMOUNT) as ticket_promedio,
        AVG(t.TRIP_DURATION_MIN) as duracion_promedio,
        AVG(t.CONGESTION_SURCHARGE) as congestion_promedio,
        AVG(t.TIP_PCT) as tip_pct_promedio
    FROM "{database}"."{target_schema}"."OBT_TRIPS" t
    JOIN congestion_diaria c ON DATE(t.PICKUP_DATETIME) = c.fecha
    GROUP BY tipo_dia
    ORDER BY tipo_dia
    """
}

print("Iniciando análisis completo de OBT_TRIPS...")

try:
    import snowflake.connector
    
    print("✅ snowflake-connector-python está instalado")
    
    # Extraer información de conexión
    account = sfOptions['sfURL']
    if '.snowflakecomputing.com' in account:
        account = account.split('.snowflakecomputing.com')[0]
    
    # Conectar a Snowflake
    print("Conectando a Snowflake...")
    conn = snowflake.connector.connect(
        user=sfOptions['sfUser'],
        password=sfOptions['sfPassword'],
        account=account,
        warehouse=sfOptions['sfWarehouse'],
        database=sfOptions['sfDatabase'],
        schema=target_schema,
        role=sfOptions['sfRole']
    )
    
    print("✅ Conexión establecida")
    
    # Crear cursor
    cursor = conn.cursor()
    
    # Ejecutar todos los análisis
    results = {}
    
    for query_name, query in analysis_queries.items():
        try:
            print(f"📊 Ejecutando: {query_name}")
            cursor.execute(query)
            
            # Obtener resultados
            df = cursor.fetch_pandas_all()
            results[query_name] = df
            print(f"✅ {len(df)} filas")
            
        except Exception as e:
            print(f"❌ Error en {query_name}: {str(e)}")
            results[query_name] = None
    
    # Cerrar conexión
    cursor.close()
    conn.close()
    
    from pathlib import Path
    from datetime import datetime
    import pandas as pd
    import re
    from IPython.display import FileLink, display

    # Guardar en el directorio de trabajo del notebook (servido por Jupyter)
    out_dir = Path.cwd() / "analisis_obt_trips"   # <-- aquí estará visible en JupyterLab Files
    out_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    excel_filepath = out_dir / f"analisis_obt_trips_{timestamp}.xlsx"

    # nombre de hoja seguro (31 chars y sin caracteres inválidos)
    def sanitize(name):
        name = re.sub(r'[:\\/?*\[\]]', '_', str(name))
        return name.strip()[:31] or "sheet"

    with pd.ExcelWriter(excel_filepath, engine='openpyxl') as writer:
        for query_name, df in results.items():
            if df is None or df.empty:
                continue
            sheet = sanitize(query_name)
            df.to_excel(writer, sheet_name=sheet, index=False)
            print("✅ Hoja creada:", sheet)

    # comprobaciones y link de descarga
    print("\nArchivo escrito en (ruta del contenedor/kernel):", excel_filepath.resolve())
    print("Listado del directorio:", out_dir)
    for p in out_dir.iterdir():
        print(" -", p.name, "(size:", p.stat().st_size, "bytes)")

    # Mostrar link para bajar por navegador (clic derecho -> Guardar enlace como)
    if excel_filepath.exists():
        display(FileLink(str(excel_filepath), result_html_prefix="Descargar archivo: "))
    else:
        print("❌ No se encontró el archivo después de escribirlo.")

    
except ImportError as e:
    print(f"❌ Error de importación: {e}")
    print("Instala las dependencias con: pip install snowflake-connector-python pandas openpyxl")
except Exception as e:
    print(f"❌ Error durante el análisis: {str(e)}")

Iniciando análisis completo de OBT_TRIPS...
✅ snowflake-connector-python está instalado
Conectando a Snowflake...
✅ Conexión establecida
📊 Ejecutando: a_top_10_pickup_zonas
✅ 1270 filas
📊 Ejecutando: b_top_10_dropoff_zonas
✅ 1270 filas
📊 Ejecutando: c_evolucion_mensual_borough
✅ 889 filas
📊 Ejecutando: d_ticket_promedio_service_mes
✅ 254 filas
📊 Ejecutando: e_viajes_hora_dia_semana
✅ 168 filas
📊 Ejecutando: f_p50_p90_duracion_borough
❌ Error en f_p50_p90_duracion_borough: 002070 (42601): SQL compilation error:
Function APPROX_PERCENTILE does not support WITHIN GROUP clause.
📊 Ejecutando: g_velocidad_franja_horaria
✅ 14 filas
📊 Ejecutando: h_participacion_payment_type
✅ 5 filas
📊 Ejecutando: i_rate_code_metrics
✅ 7 filas
📊 Ejecutando: j_mix_yellow_green
✅ 1765 filas
📊 Ejecutando: k_top_20_flujos_pu_do
✅ 20 filas
📊 Ejecutando: l_distribucion_passenger_count
✅ 14 filas
📊 Ejecutando: m_impacto_tolls_congestion
✅ 20 filas
📊 Ejecutando: n_proporcion_viajes_cortos_largos
✅ 84 filas
📊 Ejecutan

In [24]:
import shutil
from pathlib import Path
from IPython.display import FileLink, display

folder = Path("/home/jovyan/analisis_obt_trips")
zip_path = Path("/home/jovyan/analisis_obt_trips.zip")
shutil.make_archive(str(zip_path.with_suffix('')), 'zip', folder)
display(FileLink(str(zip_path), result_html_prefix="Descargar ZIP: "))

