In [65]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd
import json
from google.transit import gtfs_realtime_pb2


FUENTES = {
    "ACES": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-ace",
        "lineas": ["A", "C", "E", "Sr"]
    },
    "BDFMS": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-bdfm",
        "lineas": ["B", "D", "F", "M", "Sf"]
    },
    "G": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-g",
        "lineas": ["G"]
    },
    "JZ": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-jz",
        "lineas": ["J", "Z"]
    },
    "NQRW": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-nqrw",
        "lineas": ["N", "Q", "R", "W"]
    },
    "L": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-l",
        "lineas": ["L"]
    },
    "1234567S": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs",
        "lineas": ["1", "2", "3", "4", "5", "6", "7", "S"]
    },
    "SIR": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-si",
        "lineas": ["SIR"]
    }
}

In [66]:
def extraccion_linea(url, linea):
    """
    Extrae los datos de una línea
    """
    response = requests.get(url)
    fuentes = gtfs_realtime_pb2.FeedMessage()
    fuentes.ParseFromString(response.content)

    datos_linea = []
    for entity in fuentes.entity:
        if entity.HasField('trip_update'):
            trayecto = entity.trip_update

            if trayecto.trip.route_id == linea:
                for stop in trayecto.stop_time_update:
                    campos = {
                        'viaje_id': trayecto.trip.trip_id,
                        'linea_id': trayecto.trip.route_id,
                        'hora_inicio': trayecto.trip.start_time if trayecto.trip.HasField('start_date') else None,
                        'fecha_inicio': trayecto.trip.start_date if trayecto.trip.HasField('start_time') else None,
                        'direccion': trayecto.trip.direction_id if trayecto.trip.HasField('direction_id') else None,
                        'parada_id': stop.stop_id,
                        'orden_parada': stop.stop_sequence if stop.HasField('stop_sequence') else None,
                        'hora_llegada': datetime.fromtimestamp(stop.arrival.time) if stop.HasField('arrival') else None,
                        'retraso_llegada': stop.arrival.delay if stop.HasField('arrival') and stop.arrival.HasField('delay') else None,
                        'hora_partida': datetime.fromtimestamp(stop.departure.time) if stop.HasField('departure') else None,
                        'retraso_partida': stop.departure.delay if stop.HasField('departure') and stop.departure.HasField('delay') else None,                       
                    }

                    datos_linea.append(campos)
    return datos_linea


In [67]:
def extraccion_datos():
    """
    Extrae todas las líneas y las convierte a DF
    """

    todas_las_lineas = []
    for info in FUENTES.values():
        todas_las_lineas.extend(info['lineas'])
    
    todos_los_datos = []
    for linea in todas_las_lineas:
        for grupo, info in FUENTES.items():
            if linea in info['lineas']:
                fuentes_url = info['url']
            todos_los_datos.extend(extraccion_linea(fuentes_url, linea))  

    return pd.DataFrame(todos_los_datos)
        

In [68]:
df = extraccion_datos()

In [69]:
df

Unnamed: 0,viaje_id,linea_id,hora_inicio,fecha_inicio,direccion,parada_id,orden_parada,hora_llegada,retraso_llegada,hora_partida,retraso_partida
0,050600_A..S58X054,A,08:26:00,20260216,,H10S,,2026-02-16 16:04:57,,2026-02-16 16:04:57,
1,050600_A..S58X054,A,08:26:00,20260216,,H11S,,2026-02-16 16:06:10,,2026-02-16 16:06:10,
2,051100_A..N55R,A,08:31:00,20260216,,A05N,,2026-02-16 16:06:10,,2026-02-16 16:06:10,
3,051100_A..N55R,A,08:31:00,20260216,,A03N,,2026-02-16 16:10:40,,2026-02-16 16:10:40,
4,051100_A..N55R,A,08:31:00,20260216,,A02N,,2026-02-16 16:14:40,,2026-02-16 16:14:40,
...,...,...,...,...,...,...,...,...,...,...,...
71316,064250_7..N,7,,,,706N,,2026-02-16 17:12:40,,2026-02-16 17:13:00,
71317,064250_7..N,7,,,,705N,,2026-02-16 17:13:40,,2026-02-16 17:14:00,
71318,064250_7..N,7,,,,702N,,2026-02-16 17:16:10,,2026-02-16 17:16:30,
71319,064250_7..N,7,,,,701N,,2026-02-16 17:18:40,,NaT,


In [70]:
tot_filas = len(df)

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    proporcion = nulos/tot_filas
    print(f"Proporcion nulos en {columna}: {proporcion}")


Proporcion nulos en viaje_id: 0.0
Proporcion nulos en linea_id: 0.0
Proporcion nulos en hora_inicio: 0.0
Proporcion nulos en fecha_inicio: 0.3759341568402013
Proporcion nulos en direccion: 0.9803984801110472
Proporcion nulos en parada_id: 0.0
Proporcion nulos en orden_parada: 0.9803984801110472
Proporcion nulos en hora_llegada: 0.006926431205395325
Proporcion nulos en retraso_llegada: 0.9808611769324603
Proporcion nulos en hora_partida: 0.01775073260330057
Proporcion nulos en retraso_partida: 0.9814921271434781


In [71]:
df = df.drop(['direccion', 'orden_parada', 'retraso_llegada', 'retraso_partida', 'hora_inicio', 'fecha_inicio'], axis = 1)

In [72]:
df['hora_llegada'] = df['hora_llegada'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')  
df['hora_partida'] = df['hora_partida'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')

In [73]:
df['hora_llegada'] = df['hora_llegada'].dt.strftime('%H:%M')
df['hora_partida'] = df['hora_partida'].dt.strftime('%H:%M')

In [74]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida
0,050600_A..S58X054,A,H10S,11:04,11:04
1,050600_A..S58X054,A,H11S,11:06,11:06
2,051100_A..N55R,A,A05N,11:06,11:06
3,051100_A..N55R,A,A03N,11:10,11:10
4,051100_A..N55R,A,A02N,11:14,11:14
...,...,...,...,...,...
71316,064250_7..N,7,706N,12:12,12:13
71317,064250_7..N,7,705N,12:13,12:14
71318,064250_7..N,7,702N,12:16,12:16
71319,064250_7..N,7,701N,12:18,


In [75]:
norte = (df['parada_id'].str[-1] == 'N')
sur = (df['parada_id'].str[-1] == 'S')

df.loc[norte, 'direccion'] = 1
df.loc[sur, 'direccion'] = 0

df['direccion'] = df['direccion'].astype('Int64')


In [76]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,050600_A..S58X054,A,H10S,11:04,11:04,0
1,050600_A..S58X054,A,H11S,11:06,11:06,0
2,051100_A..N55R,A,A05N,11:06,11:06,1
3,051100_A..N55R,A,A03N,11:10,11:10,1
4,051100_A..N55R,A,A02N,11:14,11:14,1
...,...,...,...,...,...,...
71316,064250_7..N,7,706N,12:12,12:13,1
71317,064250_7..N,7,705N,12:13,12:14,1
71318,064250_7..N,7,702N,12:16,12:16,1
71319,064250_7..N,7,701N,12:18,,1


In [77]:
tot_filas = len(df)

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    proporcion = nulos/tot_filas
    print(f"Proporcion nulos en {columna}: {proporcion}")

Proporcion nulos en viaje_id: 0.0
Proporcion nulos en linea_id: 0.0
Proporcion nulos en parada_id: 0.0
Proporcion nulos en hora_llegada: 0.006926431205395325
Proporcion nulos en hora_partida: 0.01775073260330057
Proporcion nulos en direccion: 0.0


In [79]:
df = df.dropna()
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,050600_A..S58X054,A,H10S,11:04,11:04,0
1,050600_A..S58X054,A,H11S,11:06,11:06,0
2,051100_A..N55R,A,A05N,11:06,11:06,1
3,051100_A..N55R,A,A03N,11:10,11:10,1
4,051100_A..N55R,A,A02N,11:14,11:14,1
...,...,...,...,...,...,...
71314,064250_7..N,7,708N,12:10,12:10,1
71315,064250_7..N,7,707N,12:11,12:12,1
71316,064250_7..N,7,706N,12:12,12:13,1
71317,064250_7..N,7,705N,12:13,12:14,1
