In [1]:
import tarfile
import pandas as pd

In [2]:
tar_path = "202412010000_202412072359.tar"

# Extraer el CSV sin descomprimir todo
with tarfile.open(tar_path, "r") as tar:
    csv_filename = [m.name for m in tar.getmembers() if m.name.endswith(".csv")][0]  # Buscar el CSV dentro del .tar
    csv_file = tar.extractfile('202412010000_202412072359.csv')  # Extraer solo este archivo

    # Leer el CSV en partes grandes (para eficiencia)
    chunksize = 500_000  # Número de filas por chunk (ajústalo según la memoria disponible)
    reader = pd.read_csv(csv_file, chunksize=chunksize, delimiter=";")

    # Inspeccionar las primeras filas del primer chunk
    first_chunk = next(reader)
    print(first_chunk.head())

        ts_kafka               message  Unnamed: 2
0  1733011203260  jUoZIupCiFgBPAjykJ4=         NaN
1  1733011203260  oAAUllilYmBIYPDDtec=         NaN
2  1733011203260  oAAZEMMJ5TEgP//kyUg=         NaN
3  1733011203260  jQIBK/gjAAIASbh2gt8=         NaN
4  1733011203260          XTRCE1QV9w==         NaN


In [3]:
first_chunk = first_chunk.drop(columns=['Unnamed: 2'])
first_chunk

Unnamed: 0,ts_kafka,message
0,1733011203260,jUoZIupCiFgBPAjykJ4=
1,1733011203260,oAAUllilYmBIYPDDtec=
2,1733011203260,oAAZEMMJ5TEgP//kyUg=
3,1733011203260,jQIBK/gjAAIASbh2gt8=
4,1733011203260,XTRCE1QV9w==
...,...,...
499995,1733016190658,kDQjSygABh8KjF3Tsjo=
499996,1733016190658,oQABmSAAAAAAAAD6das=
499997,1733016190658,kTQ0CUAQCAFwbrhWG3M=
499998,1733016190658,jDRBFeEfvwAAAADWOes=


In [4]:
from preprocess.decoder import Decoder

df_resultado = first_chunk.apply(lambda x: Decoder.processMessage(x['message'], x['ts_kafka']), axis=1).apply(pd.Series)


In [7]:
df_resultado.loc[df_resultado['Flight status'] != 'NaN'].head()

Unnamed: 0,Timestamp (kafka),Timestamp (date),Message (base64),Message (hex),ICAO,Downlink Format,Flight status,Typecode,TurbulenceCategory,BDS,...,Turbulence level (0-3),Wind shear level (0-3),Microburst level (0-3),Icing level (0-3),Wake vortex level (0-3),Static air temperature (C),Average static pressure (hPa),Radio height (ft),Wind speed (kt) and direction (true) (deg),Humidity (%)
0,1733011203260,2024-12-01 01:00:03.260,jUoZIupCiFgBPAjykJ4=,8D4A1922EA428858013C08F2909E,4A1922,17,,29.0,,,...,,,,,,,,,,
1,1733011203260,2024-12-01 01:00:03.260,oAAUllilYmBIYPDDtec=,A000149658A562604860F0C3B5E7,34640E,20,airborne,,,,...,,,,,,,,,,
2,1733011203260,2024-12-01 01:00:03.260,oAAZEMMJ5TEgP//kyUg=,A0001910C309E531203FFFE4C948,4CADA4,20,,,,BDS60,...,,,,,,,,,,
3,1733011203260,2024-12-01 01:00:03.260,jQIBK/gjAAIASbh2gt8=,8D02012BF82300020049B87682DF,02012B,17,,31.0,,,...,,,,,,,,,,
4,1733011203260,2024-12-01 01:00:03.260,XTRCE1QV9w==,5D3442135415F7,344213,11,,,,,...,,,,,,,,,,


In [5]:
import pandas as pd
from airstrip.airplane import Airplane

df_resultado['Timestamp (date)'] = pd.to_datetime(df_resultado['Timestamp (date)'])

airplanes = {}

# Iteramos sobre los mensajes del DataFrame
for _, row in df_resultado.iterrows():
    icao = row.get("ICAO")
    status = row.get("Flight status")
    timestamp = row.get("Timestamp (date)")

    if pd.isna(icao) or pd.isna(status):
        continue

    if icao not in airplanes:
        airplanes[icao] = Airplane(icao)

    airplanes[icao].update_flight_status(status, timestamp)

# Obtenemos el timestamp final
final_timestamp = df_resultado['Timestamp (date)'].max()

# Finalizamos el cálculo del tiempo total
total_times = []
for icao, airplane in airplanes.items():
    airplane.finalize(final_timestamp)
    total_times.append({
        "ICAO": icao,
        "Tiempo total en tierra (s)": airplane.total_ground_time,
        "Tiempo total en vuelo (s)": airplane.total_airborne_time
    })

# Convertimos los resultados en un DataFrame
result_df = pd.DataFrame(total_times)
print(result_df)


      ICAO  Tiempo total en tierra (s)  Tiempo total en vuelo (s)
0   34640E                    4720.597                    266.801
1   4CADA4                      12.481                   4974.917
2   344213                     175.026                   4812.372
3   34510A                      47.383                   4940.015
4   02012B                       2.028                   4985.369
5   342344                    4987.397                      0.000
6   342352                    4987.397                      0.000
7   343650                    4759.109                    228.288
8   347519                     159.544                   4827.853
9   342345                    4987.397                      0.000
10  342346                    4987.397                      0.000
11  34108F                    4987.397                      0.000
12  344115                    4411.542                    575.855
13  342355                    1398.878                   3588.519
14  502D5E

In [10]:
df_resultado.dtypes

Timestamp (kafka)                                      int64
Timestamp (date)                              datetime64[ns]
Message (base64)                                      object
Message (hex)                                         object
ICAO                                                  object
Downlink Format                                        int64
Flight status                                         object
Typecode                                             float64
TurbulenceCategory                                    object
BDS                                                   object
Altitude (ft)                                        float64
Magnetic heading (deg)                               float64
Indicated airspeed (kt)                              float64
Mach number (-)                                      float64
Barometric altitude rate (ft/min)                    float64
Inertial vertical speed (ft/min)                     float64
Velocity                

In [None]:
df_resultado.to_csv('first_chunk.csv')

Estoy trabajando sobre un dataframe en pandas, que contiene mensajes de distinto tipo de informacion sobre aviones. Quiero saber cunato tiempo han estado los aviones en aire y cuanto en suelo. Para ello me interesan los mensajes que contienen informacion sobre el estado del avion (Flight status). Y con el siguiente codigo calculo los tiempos:

In [8]:
# Separar los valores de la columna Velocity en múltiples columnas
velocity_cols = ['Velocity_Speed', 'Velocity_Heading', 'Velocity_Altitude', 'Velocity_Type']
df_velocity = df_resultado['Velocity'].apply(pd.Series, index=velocity_cols)


# Unir las columnas separadas con el DataFrame original
df_resultado = pd.concat([df_resultado, df_velocity], axis=1)

# Eliminar la columna original de Velocity
df_resultado = df_resultado.drop(columns=['Velocity'])
df_resultado.to_parquet('first_chunk.parquet')

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.