In [89]:
import pandas as pd
import numpy as np
# Column types for the raw CSV, grouped by target dtype.
# Coordinates are deliberately read as text, and squawk is read as a float.
_TEXT_COLUMNS = [
    "ICAO 24-bit code",
    "callsign",
    "origin country",
    "time at position",
    "time of last contact",
    "longitude",
    "latitude",
    "latitude, longitude",
    "location",
    "country",
    "oblast",
]
_FLOAT_COLUMNS = [
    "barometric altitude",
    "velocity",
    "heading",
    "vertical rate",
    "geo_altitude",
    "squawk",
]
_BOOL_COLUMNS = ["aircraft is grounded", "spi"]

column_dtypes = {
    **dict.fromkeys(_TEXT_COLUMNS, 'str'),
    **dict.fromkeys(_FLOAT_COLUMNS, 'float'),
    **dict.fromkeys(_BOOL_COLUMNS, 'bool'),
}

# The two timestamp columns are declared as text above and additionally
# converted to datetimes through `parse_dates`.
# NOTE(review): `date_parser` is deprecated in pandas >= 2.0 (use `date_format`);
# confirm the installed pandas version before changing it.
df = pd.read_csv(
    'processed_data_davi.csv',
    dtype=column_dtypes,
    date_parser=pd.to_datetime,
    parse_dates=['time at position', 'time of last contact'],
)

In [90]:
# Normalise latitude to 4 decimal places, stored back as text.
df['latitude'] = df['latitude'].astype(float).round(4).astype(str)

In [91]:
# Normalise longitude to 4 decimal places, stored back as text.
df['longitude'] = df['longitude'].astype(float).round(4).astype(str)

In [92]:
# Remove the single leading space some country values carry.
# The update goes through `.loc` with the mask: assigning the filtered Series
# straight back to the column would align on the index and turn every row
# WITHOUT a leading space into NaN.
_country_has_leading_space = df['country'].str.startswith(" ", na=False)
df.loc[_country_has_leading_space, 'country'] = (
    df.loc[_country_has_leading_space, 'country'].str[1:]
)

In [93]:
# Calendar day of each position report, as the string form of `.date()`.
df['date'] = df['time at position'].apply(lambda stamp: str(stamp.date()))

# Flight identifier: transponder code + callsign + day, concatenated as text.
icao_code = df['ICAO 24-bit code']
call_sign = df['callsign']
df['flight-id'] = icao_code + call_sign + df['date']

In [99]:
def gen_squawk(X):
    """Classify a collection of squawk codes.

    Returns '7500' if 7500.0 is present, otherwise '7700' if 7700.0 is
    present, otherwise 'other'. 7500 takes precedence when both occur.
    """
    # Ordered by precedence: the first code found in X decides the label.
    for code, label in ((7500.0, '7500'), (7700.0, '7700')):
        if code in X:
            return label
    return 'other'

In [100]:
def is_in_ukraine(x):
    """Return True when 'Ukraine' is contained in ``x``, else False."""
    return 'Ukraine' in x

In [101]:
# Row-level copy of the country; it is reduced to a per-flight boolean
# during the groupby aggregation via is_in_ukraine.
df['was_in_ukraine'] = df.country

In [94]:
# The former `df.drop(columns=['oblast'])` call discarded its result, so it
# never changed `df`; it has been removed. The `oblast` column is simply
# overwritten when get_state is applied to `df.location`.
def get_state(x):
    """Extract the state/oblast part from a comma-separated location string.

    Parameters
    ----------
    x : str or any
        Location text whose last comma-separated part is the country.
        Non-string values (e.g. NaN for a missing location) are accepted.

    Returns
    -------
    str or float
        The region part exactly as it appears between the commas (a leading
        space is NOT stripped), or ``np.nan`` when no region can be derived:
        non-string input, fewer than two parts, or a Russian location with
        no part containing 'Oblast' or 'Krai'.
    """
    # Explicit guard instead of a bare `except`: only non-string input is
    # expected to fail, and real errors/interrupts must not be swallowed.
    if not isinstance(x, str):
        return np.nan

    parts = x.split(',')
    if len(parts) < 2:
        return np.nan

    country = parts[-1]
    if country in (" Kosovo", "Kosovo"):
        return "Kosovo"
    if country in (" Russia", "Russia"):
        # Take the last part that names an Oblast or Krai.
        regions = [part for part in parts if 'Oblast' in part or 'Krai' in part]
        return regions[-1] if regions else np.nan
    if len(parts) == 2:
        return parts[0]
    # If the part before the country contains a digit, skip it and take
    # the part before that.
    if not any(ch.isdigit() for ch in parts[-2]):
        return parts[-2]
    return parts[-3]
# Derive the oblast from the location text.
# Bracket assignment always writes a real column; attribute assignment
# (`df.oblast = ...`) only does so when the column already exists.
df['oblast'] = df['location'].apply(get_state)

In [95]:
# Drop the single leading space from oblast names that start with one.
starts_with_space = df['oblast'].str[0] == " "
df.loc[starts_with_space, 'oblast'] = df['oblast'][starts_with_space].str[1:]

In [96]:
def fixState(x):
    """Map "Republic of Crimea" to "Autonomous Republic of Crimea".

    Any other value is returned unchanged.
    """
    return "Autonomous Republic of Crimea" if x == "Republic of Crimea" else x
# Apply the Crimea naming fix to every oblast value.
df['oblast'] = df['oblast'].apply(fixState)

In [97]:
# Export rows that have a location but no derived oblast, for manual review.
oblast_unresolved = df['oblast'].isna() & df['location'].notna()
df.loc[oblast_unresolved, ['country', 'oblast', 'location']].to_csv("nulls.csv")

In [98]:
# Pair up each row's coordinates as a two-element [latitude, longitude] list.
df['latitude, longitude'] = [
    [lat, lon] for lat, lon in zip(df['latitude'], df['longitude'])
]

In [103]:
# One row per (flight-id, origin country, date).
# Rows are sorted by position time first, and groupby keeps row order within
# each group, so the collected coordinate list is in chronological order.
# The maxima use the string "max" rather than `np.max`: recent pandas
# deprecates passing NumPy callables to `agg`, and "max" keeps the current
# NaN-skipping behaviour.
gbdf = (
    df.sort_values(by='time at position')
    .groupby(['flight-id', 'origin country', 'date'], dropna=False)
    .agg({
        "barometric altitude": "max",
        "velocity": "max",
        "vertical rate": "max",
        # Distinct regions / countries seen for the flight.
        "oblast": lambda x: set(x),
        "country": lambda x: set(x),
        # Full track as a list of [latitude, longitude] pairs.
        "latitude, longitude": lambda x: list(x),
        # True if the SPI flag was set on any row.
        "spi": lambda x: any(x),
        # '7500' / '7700' / 'other', decided over all squawk codes seen.
        "squawk": lambda x: gen_squawk(set(x)),
        # True if any row of the flight has country 'Ukraine'.
        "was_in_ukraine": lambda x: is_in_ukraine(set(x)),
    })
)

In [104]:
# Persist the full row-level table, without the index column.
df.to_csv(path_or_buf='all_data_davi.csv', index=False)

In [105]:
# Persist only the rows whose country is exactly 'Ukraine', without the index.
df.loc[df['country'].eq('Ukraine')].to_csv(
    path_or_buf='ukraine_data_davi.csv', index=False
)

In [106]:
# Persist the per-flight aggregate; the index is written, so the group keys
# (flight-id, origin country, date) appear as leading columns.
gbdf.to_csv(path_or_buf="flights_separate.csv")