# Assembling the json files into a better format

In [2]:
# Walk the data/ folder with os.walk and consolidate the JSON files found there
import os
import pandas as pd
import numpy as np
import json
import pickle
import shutil
from tqdm import tqdm

def get_longest_paths(root):
    """Return the sub-directories of *root* whose path string is longest.

    Every directory below ``root`` is collected with ``os.walk``. Paths that
    contain the substring ``'consolidated'`` are discarded, and from the rest
    only those whose string length equals the maximum are returned.

    Args:
        root: Directory to walk.

    Returns:
        list[str]: The longest directory paths, in ``os.walk`` order. Empty
        when ``root`` has no eligible sub-directory.
    """
    # NOTE: "longest" is measured in characters of the joined path, not in
    # nesting depth.
    candidates = []
    # Use a distinct loop name so the ``root`` argument is not shadowed.
    for parent, dirs, _files in os.walk(root):
        for name in dirs:
            path = os.path.join(parent, name)
            if 'consolidated' not in path:
                candidates.append(path)

    # max() raises ValueError on an empty sequence; an empty tree simply has
    # no longest paths.
    if not candidates:
        return []

    max_length = max(len(path) for path in candidates)
    return [path for path in candidates if len(path) == max_length]

def get_file_names(path):
    """List the names of the regular files located directly in *path*.

    Sub-directories are skipped; only bare file names (not full paths) are
    returned, in the order ``os.listdir`` yields them.
    """
    file_names = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isfile(candidate):
            file_names.append(entry)
    return file_names

def extract_timestamp(filepath):
    """Build a timestamp from a ``.../YYYY/MM/DD/HHMMSSZ.ext`` file path.

    The three path components before the file name are taken as year, month
    and day; the file name stem (up to the first ``.``) is taken as the time
    of day, with an optional trailing ``Z``.

    Args:
        filepath: Path string such as ``'data/2020/03/25/013200Z.json'``.
            Both ``/`` and ``\\`` are accepted as separators.

    Returns:
        pandas.Timestamp: Naive timestamp parsed with
        ``'%Y-%m-%d %H%M%S'``.

    Raises:
        ValueError: If the assembled string does not match that format.
    """
    # Paths built with os.path.join use backslashes on Windows; normalise so
    # the split works regardless of platform.
    parts = filepath.replace('\\', '/').split('/')

    date_parts = parts[-4:-1]  # ['2020', '03', '25']
    time_part = parts[-1].split('.')[0]  # '013200Z'

    # Strip the trailing 'Z' only when it is really there, so a stem without
    # the marker does not lose its last digit.
    if time_part.endswith('Z'):
        time_part = time_part[:-1]

    timestamp_str = '-'.join(date_parts) + ' ' + time_part
    return pd.to_datetime(timestamp_str, format='%Y-%m-%d %H%M%S')

def save_file(filepath, data: pd.DataFrame) -> None:
    """Pickle *data* to *filepath*, overwriting any existing file."""
    with open(filepath, 'wb') as sink:
        pickle.dump(data, sink)

def load_file(filepath) -> pd.DataFrame:
    """Read back and return the object pickled at *filepath*."""
    with open(filepath, 'rb') as source:
        return pickle.load(source)

def clean_root(root_dir):
    """Delete every sub-directory of *root_dir* except ``consolidated``.

    Only real directories are removed. Plain files and symbolic links found
    directly in ``root_dir`` are left untouched, because ``shutil.rmtree``
    raises on both.

    Args:
        root_dir: Directory whose sub-directories are removed.
    """
    for entry in os.listdir(root_dir):
        if entry == 'consolidated':
            continue
        target = os.path.join(root_dir, entry)
        # rmtree refuses symlinks and non-directories; skip them instead of
        # aborting the clean-up half-way through.
        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target)

# Collect the longest-path sub-directories under data/ (paths containing
# 'consolidated' are excluded by get_longest_paths).
longest_paths = get_longest_paths('data/')

In [3]:
def create_df(data):
    """Turn one decoded JSON snapshot into a timestamp-indexed DataFrame.

    Args:
        data: Mapping with an ``'aircraft'`` list of records (flattened with
            ``pd.json_normalize``) and a ``'now'`` value interpreted as epoch
            seconds.

    Returns:
        pandas.DataFrame: One row per aircraft record, indexed by the
        snapshot timestamp. ``None`` values are replaced by NaN. When an
        ``alt_baro`` column exists it is converted to float, with values
        that cannot be converted mapped to ``-1.0``.

    Raises:
        KeyError: If ``data`` lacks the ``'aircraft'`` or ``'now'`` key.
    """
    def convert_alt_baro(value):
        # Non-numeric altitude values cannot be cast; they become -1.0.
        try:
            return float(value)
        except (TypeError, ValueError):
            return float(-1)

    df = pd.json_normalize(data['aircraft'])
    df['timestamp'] = pd.to_datetime(data['now'], unit='s')
    df.set_index('timestamp', inplace=True)
    df = df.replace({None: np.nan})

    # NOTE: explicit dtypes were once sketched here (flight/type/hex/r/t as
    # object; gs/track/baro_rate/alt_geom as float64) but never enabled.

    # An empty aircraft list, or records without 'alt_baro', yield a frame
    # that has no such column; skip the conversion instead of raising.
    if 'alt_baro' in df.columns:
        df['alt_baro'] = df['alt_baro'].apply(convert_alt_baro)

    return df.sort_index()

In [4]:
def logic(root, out_dir="data/consolidated"):
    """Consolidate the JSON snapshots under *root* into one pickle per month.

    Each directory returned by ``get_longest_paths(root)`` is read in sorted
    order, its files are parsed with ``create_df`` and concatenated. The
    result is filed under the year/month of its first row. All frames that
    fall in the same month are concatenated and written once to
    ``{out_dir}/{year}/{month}.pkl``.

    Previously the pickle was written once per directory, so several
    directories belonging to the same month overwrote each other and only
    the last one survived.

    Args:
        root: Directory tree holding the raw JSON files.
        out_dir: Destination root for the monthly pickles. Defaults to the
            previously hard-coded ``"data/consolidated"``.

    Note:
        All frames are kept in memory until the end so that each month file
        is written exactly once.
    """
    monthly_frames = {}
    for path in tqdm(sorted(get_longest_paths(root)), desc='Processing ...'):
        dir_frames = []
        for file in sorted(get_file_names(path)):
            with open(os.path.join(path, file), 'r') as f:
                data = json.load(f)
            dir_frames.append(create_df(data))

        # pd.concat raises on an empty list; a directory without files
        # contributes nothing.
        if not dir_frames:
            continue
        df = pd.concat(dir_frames)
        if df.empty:
            continue

        # The first row's index value decides which month the directory
        # belongs to.
        timestamp = df.index[0]
        key = (str(timestamp.year), str(timestamp.month).zfill(2))
        monthly_frames.setdefault(key, []).append(df)

    # Save each month as a pickle file, once, with every contributing frame.
    for (year, month), frames in monthly_frames.items():
        year_dir = f"{out_dir}/{year}"
        os.makedirs(year_dir, exist_ok=True)
        f_name = f"{year_dir}/{month}.pkl"
        with open(f_name, 'wb') as f:
            pickle.dump(pd.concat(frames), f)


In [57]:
# Run the consolidation over the data/ tree; the pickles are written below
# data/consolidated/.
logic('data')

Processing ...:   0%|          | 0/42 [00:03<?, ?it/s]


alt_baro -> float64
squawk -> int
lat -> float64
lon -> float64
seen_pos -> timestamp


In [5]:
import os
import json
import pandas as pd
# Load every JSON snapshot found under one directory into a single DataFrame.
path = 'data/2021/01/01'
dfs = []
for root, dirs, file_names in os.walk(path):
    # Visit sub-directories in a deterministic order.
    dirs.sort()
    for file in sorted(file_names):
        # Join with the directory currently being walked, not the top-level
        # path, so files in nested directories can be opened.
        with open(os.path.join(root, file), 'r') as f:
            data = json.load(f)
        dfs.append(create_df(data))

# Concatenate once after the walk instead of after every file; fall back to
# an empty frame because pd.concat raises on an empty list.
df = pd.concat(dfs) if dfs else pd.DataFrame()
