In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import pickle

In [2]:
# df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset.pkl'))
b1_p1_df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_nnet_1st_batch_p1.pkl'))
b1_p2_df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_nnet_1st_batch_p2.pkl'))
b2_df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_2nd_batch.pkl'))

In [3]:
print(f'Columns in b1_p1_df: {b1_p1_df.columns}')
print(f'Columns in b1_p2_df: {b1_p2_df.columns}')
print(f'Difference between columns in b1_p1_df and b1_p2_df: {set(b1_p1_df.columns) - set(b1_p2_df.columns)}')


Columns in b1_p1_df: Index(['image_filename', 'absolute_time_GMT_seconds', 'relative_time_seconds',
       'estimated_distance_to_centerline_meters',
       'actual_distance_to_centerline_meters',
       'actual_distance_to_centerline_NORMALIZED',
       'estimated_heading_error_degrees', 'actual_heading_error_degrees',
       'actual_heading_error_NORMALIZED', 'downtrack_position_meters',
       'downtrack_position_NORMALIZED', 'period_of_day', 'cloud_type',
       'job_id', 'step_num', 'section', 'model_type', 'episode',
       'ep_start_cte_act', 'ep_start_he_act', 'sec_start_cte_act',
       'sec_start_he_act'],
      dtype='object')
Columns in b1_p2_df: Index(['image_filename', 'absolute_time_GMT_seconds', 'relative_time_seconds',
       'estimated_distance_to_centerline_meters',
       'actual_distance_to_centerline_meters',
       'actual_distance_to_centerline_NORMALIZED',
       'estimated_heading_error_degrees', 'actual_heading_error_degrees',
       'actual_heading_error_NOR

In [4]:
print(f'Number of unique episode_num in b1_p1_df: {b1_p1_df["episode"].nunique()}')
print(f'Number of unique episode_num in b1_p2_df: {b1_p2_df["episode_num"].nunique()}')
print(f'Number of unique episode_num in b2_df: {b2_df["episode_num"].nunique()}')


Number of unique episode_num in b1_p1_df: 797
Number of unique episode_num in b1_p2_df: 499
Number of unique episode_num in b2_df: 997


In [5]:
if b1_p1_df['episode'].between(1, 797).all():
    print('All episodes are between 1 and 797')
else:
    print('Some episodes are outside the range of 1 and 797')


All episodes are between 1 and 797


In [6]:
b1_p2_df.shape

(109291, 17)

In [7]:
# rename episode_num column in b1_p2_df to job_id
b1_p2_df.rename(columns={'episode_num': 'job_id'}, inplace=True)

# create an episode column where it starts from 1 and increments by 1 for each new job_id
b1_p2_df['episode'] = b1_p2_df['job_id'].rank(method='dense').astype(int)

In [42]:
797+499

1296

In [8]:
# offset the episode column by 797
b1_p2_df['episode'] = b1_p2_df['episode'] + 797

if b1_p2_df['episode'].between(798, 1296).all():
    print('All episodes are between 798 and 1296')
else:
    print('Some episodes are outside the range of 798 and 1296')

All episodes are between 798 and 1296


In [10]:
# rename episode_num column in b1_p2_df to job_id
b2_df.rename(columns={'episode_num': 'job_id'}, inplace=True)

# create an episode column where it starts from 1 and increments by 1 for each new job_id
b2_df['episode'] = b2_df['job_id'].rank(method='dense').astype(int)

In [45]:
1296 + 997

2293

In [11]:
# offset the episode column by 1296
b2_df['episode'] = b2_df['episode'] + 1296

if b2_df['episode'].between(1296, 2293).all():
    print('All episodes are between 1296 and 2293')
else:
    print('Some episodes are outside the range of 1296 and 2293')

All episodes are between 1296 and 2293


In [12]:
# add ep_start_cte_act and ep_start_he_act columns
# these are the actual_distance_to_centerline_meters and actual_heading_error_degrees values at the start of each episode
b1_p2_df['ep_start_cte_act'] = b1_p2_df.groupby('episode')['actual_distance_to_centerline_meters'].transform('first')
b1_p2_df['ep_start_he_act'] = b1_p2_df.groupby('episode')['actual_heading_error_degrees'].transform('first')

# do the same for each section in each episode
# rename section_num to section
b1_p2_df = b1_p2_df.rename(columns={'section_num': 'section'})
b1_p2_df['section_start_cte_act'] = b1_p2_df.groupby(['episode', 'section'])['actual_distance_to_centerline_meters'].transform('first')
b1_p2_df['section_start_he_act'] = b1_p2_df.groupby(['episode', 'section'])['actual_heading_error_degrees'].transform('first')

# rename section_start_cte_act and section_start_he_act to sec_start_cte_act and sec_start_he_act
b1_p2_df = b1_p2_df.rename(columns={'section_start_cte_act': 'sec_start_cte_act', 'section_start_he_act': 'sec_start_he_act'})

b1_p2_df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_night_cirrus_500_0,40944.414062,0.070312,-4.975625,1.312110,0.131211,4.294211,4.691844,0.156395,149.288206,...,1,500,0,0,nnet,798,1.312110,4.691844,1.312110,4.691844
1,MWH_Runway04_night_cirrus_500_1,40945.441406,1.097656,-4.251509,1.863198,0.186320,10.024249,5.930713,0.197690,154.650466,...,1,500,1,0,nnet,798,1.312110,4.691844,1.312110,4.691844
2,MWH_Runway04_night_cirrus_500_2,40946.496094,2.152344,-2.417549,2.506637,0.250664,11.961238,6.316714,0.210557,160.038134,...,1,500,2,0,nnet,798,1.312110,4.691844,1.312110,4.691844
3,MWH_Runway04_night_cirrus_500_3,40947.519531,3.175781,-1.057059,3.119214,0.311921,6.168364,4.711604,0.157053,165.723392,...,1,500,3,0,nnet,798,1.312110,4.691844,1.312110,4.691844
4,MWH_Runway04_night_cirrus_500_4,40948.535156,4.191406,-1.990558,3.551921,0.355192,7.549158,2.804450,0.093482,171.531987,...,1,500,4,0,nnet,798,1.312110,4.691844,1.312110,4.691844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167661,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,0,999,209,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
167662,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,0,999,210,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
167663,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,0,999,211,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
167664,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,0,999,212,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308


In [13]:
# add ep_start_cte_act and ep_start_he_act columns
# these are the actual_distance_to_centerline_meters and actual_heading_error_degrees values at the start of each episode
b2_df['ep_start_cte_act'] = b2_df.groupby('episode')['actual_distance_to_centerline_meters'].transform('first')
b2_df['ep_start_he_act'] = b2_df.groupby('episode')['actual_heading_error_degrees'].transform('first')

# do the same for each section in each episode
# rename section_num to section
b2_df = b2_df.rename(columns={'section_num': 'section'})
b2_df['section_start_cte_act'] = b2_df.groupby(['episode', 'section'])['actual_distance_to_centerline_meters'].transform('first')
b2_df['section_start_he_act'] = b2_df.groupby(['episode', 'section'])['actual_heading_error_degrees'].transform('first')

# rename section_start_cte_act and section_start_he_act to sec_start_cte_act and sec_start_he_act
b2_df = b2_df.rename(columns={'section_start_cte_act': 'sec_start_cte_act', 'section_start_he_act': 'sec_start_he_act'})

b2_df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,1,0,0,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1,0,1,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,1,0,2,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,1,0,3,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,1,0,4,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,0,999,214,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,0,999,215,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,0,999,216,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,0,999,217,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670


In [67]:
b1_p1_df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,0,0,0,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,0,0,1,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,0,0,2,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,0,0,3,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,0,0,4,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174433,MWH_Runway04_afternoon_broken_800_212,2544.641846,220.941650,-0.273577,-1.813623,-0.181362,2.566364,-1.562173,-0.052072,1314.666709,...,3,800,212,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174434,MWH_Runway04_afternoon_broken_800_213,2545.691895,221.991699,-0.298727,-1.961441,-0.196144,0.624838,-2.581139,-0.086038,1320.460157,...,3,800,213,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174435,MWH_Runway04_afternoon_broken_800_214,2546.757324,223.057129,-1.084329,-2.186303,-0.218630,-1.580742,-3.104577,-0.103486,1326.287263,...,3,800,214,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174436,MWH_Runway04_afternoon_broken_800_215,2547.810791,224.110596,-1.193321,-2.422252,-0.242225,-8.462737,-2.563973,-0.085466,1331.995826,...,3,800,215,3,nnet,797,2.192799,5.911479,-2.287287,0.222896


In [14]:
# Concatenate b1_p1_df and b1_p2_df
b1_df = pd.concat([b1_p1_df, b1_p2_df], axis=0)
b1_df = b1_df.reset_index(drop=True)
b1_df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,0,0,0,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,0,0,1,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,0,0,2,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,0,0,3,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,0,0,4,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283724,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,0,999,209,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
283725,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,0,999,210,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
283726,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,0,999,211,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308
283727,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,0,999,212,3,nnet,1296,-0.981114,-3.840276,-1.598571,-1.857308


In [18]:
pd.to_pickle(b1_p1_df, Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p1_processed_v1.pkl'))
pd.to_pickle(b1_p2_df, Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p2_processed_v1.pkl'))
pd.to_pickle(b2_df, Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_2nd_batch_processed_v1.pkl'))

In [17]:
pd.to_pickle(b1_df, Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_processed_v1.pkl'))

In [89]:
#    'estimated_distance_to_centerline_meters',
#    'actual_distance_to_centerline_meters',
#    'actual_distance_to_centerline_NORMALIZED',
#    'estimated_heading_error_degrees', 'actual_heading_error_degrees',
#    'actual_heading_error_NORMALIZED', 'downtrack_position_meters',
#    'downtrack_position_NORMALIZED', 'period_of_day', 'cloud_type',
#    'job_id', 'step_num', 'section', 'model_type', 'episode',
#    'ep_start_cte_act', 'ep_start_he_act', 'sec_start_cte_act',
#    'sec_start_he_act'
groups = b1_df.groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])
num_groups = len(groups)
num_episodes = b1_df['episode'].nunique()

if num_groups == num_episodes:
    print('The number of groups is the same as the number of episodes')
else:
    print(f'The number of groups ({num_groups}) is different from the number of episodes{num_episodes}')

# # print the episode numbers for the groups with more than 1 episode
# groups.filter(lambda x: len(x) > 1)['episode'].unique()
# list the duplicate episodes for each group separately
groups.filter(lambda x: len(x) > 1).groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])['episode'].unique()
# groups.filter(lambda x: len(x) > 1)['episode'].unique().tolist()




The number of groups (1003) is different from the number of episodes1296


ep_start_he_act  ep_start_cte_act  period_of_day  cloud_type
-9.989648         1.279774         2              2                  [1246]
-9.969735        -1.191633         1              3             [766, 1066]
-9.949853         1.505195         1              2                   [164]
-9.929757        -3.000805         0              2                   [185]
-9.909875         2.595605         2              0              [591, 891]
                                                                   ...     
 9.911906         2.700592         1              0             [725, 1025]
 9.931655         1.566070         2              1                   [419]
 9.951785         2.931332         1              1              [698, 998]
 9.971538        -2.389701         2              1                   [416]
 9.991641        -1.307170         0              1                  [1275]
Name: episode, Length: 1003, dtype: object

In [93]:
groups = b2_df.groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])
num_groups = len(groups)
num_episodes = b2_df['episode'].nunique()

if num_groups == num_episodes:
    print('The number of groups is the same as the number of episodes')
else:
    print(f'The number of groups ({num_groups}) is different from the number of episodes ({num_episodes})')

# groups.filter(lambda x: len(x) > 1)['episode'].unique().tolist()
groups.filter(lambda x: len(x) > 1).groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])['episode'].unique()


The number of groups (1004) is different from the number of episodes (997)


ep_start_he_act  ep_start_cte_act  period_of_day  cloud_type
-9.989545         2.322673         2              3             [1957]
-9.969777        -0.062972         0              3             [2113]
-9.949742         0.561647         2              1             [2075]
-9.929814         0.965683         2              3             [1869]
-9.909753        -1.140538         2              2             [1515]
                                                                 ...  
 9.911716         2.108512         2              1             [2267]
 9.932887        -2.583236         2              3             [2152]
 9.951980         2.812060         1              1             [1653]
 9.971828         1.505455         2              1             [1716]
 9.991740         2.412076         2              2             [2007]
Name: episode, Length: 1004, dtype: object

**Problem**: the number of groups, grouped by `groupby` is not the same as the number of groups (number of `episode`s) in the original data. They are grouped by `ep_start_he_act`, `ep_start_cte_act`, `period_of_day`, `cloud_type`.

In [3]:
df_b1_p1 = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p1_processed_v2.pkl'))
df_b1_p2 = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p2_processed_v2.pkl'))


(109291, 27)

In [15]:
df_b1_p2 = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p2_processed_v2.pkl'))

# remove the first 300 episodes from b1_p2_df

df_b1_p2 = df_b1_p2[df_b1_p2['episode'] > 300]
df_b1_p2['episode'] = df_b1_p2['episode'] - 300
df_b1_p2


Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,timestamp,cte_threshold,safety_metric_cte,period_of_day_first,cloud_type_first
65849,MWH_Runway04_afternoon_clear_801_0,82675.710938,0.085938,-3.319858,-1.904533,-0.190453,-7.052132,-8.169812,-0.272327,149.422238,...,1,-1.904533,-8.169812,-1.904533,-8.169812,1970-01-01 00:00:00,3.0,-1.095467,1,0
65850,MWH_Runway04_afternoon_clear_801_1,82676.734375,1.109375,-3.708376,-2.469782,-0.246978,-3.885455,-5.630379,-0.187679,154.557907,...,1,-1.904533,-8.169812,-1.904533,-8.169812,1970-01-01 00:00:01,3.0,-0.530218,1,0
65851,MWH_Runway04_afternoon_clear_801_2,82677.750000,2.125000,-5.582032,-2.753196,-0.275320,2.365741,-1.940961,-0.064699,159.662646,...,1,-1.904533,-8.169812,-1.904533,-8.169812,1970-01-01 00:00:02,3.0,-0.246804,1,0
65852,MWH_Runway04_afternoon_clear_801_3,82678.789062,3.164062,-5.775794,-2.718115,-0.271812,7.128173,1.256988,0.041900,164.993555,...,1,-1.904533,-8.169812,-1.904533,-8.169812,1970-01-01 00:00:03,3.0,-0.281885,1,0
65853,MWH_Runway04_afternoon_clear_801_4,82679.812500,4.187500,-4.554712,-2.447422,-0.244742,5.385534,3.186897,0.106230,170.250746,...,1,-1.904533,-8.169812,-1.904533,-8.169812,1970-01-01 00:00:04,3.0,-0.552578,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109286,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,199,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:29,3.0,-2.482921,1,0
109287,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,199,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:30,3.0,-2.449016,1,0
109288,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,199,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:31,3.0,-2.372977,1,0
109289,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,199,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:32,3.0,-2.277358,1,0


In [16]:
# concat b1_p1 and b1_p2 dataframes and reset their index.
# Ensure that the episode numbers are unique

num_episodes_b1_p1 = df_b1_p1['episode'].nunique()

df_b1_p2['episode'] = df_b1_p2['episode'] + num_episodes_b1_p1

df_b1 = pd.concat([df_b1_p1, df_b1_p2], axis=0)
df_b1 = df_b1.reset_index(drop=True)

df_b1


Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217875,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:29,3.0,-2.482921
217876,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:30,3.0,-2.449016
217877,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:31,3.0,-2.372977
217878,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:32,3.0,-2.277358


In [20]:
df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p2_processed_v1.pkl'))

df['episode'] = df['episode']-797

df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_night_cirrus_500_0,40944.414062,0.070312,-4.975625,1.312110,0.131211,4.294211,4.691844,0.156395,149.288206,...,1,500,0,0,nnet,1,1.312110,4.691844,1.312110,4.691844
1,MWH_Runway04_night_cirrus_500_1,40945.441406,1.097656,-4.251509,1.863198,0.186320,10.024249,5.930713,0.197690,154.650466,...,1,500,1,0,nnet,1,1.312110,4.691844,1.312110,4.691844
2,MWH_Runway04_night_cirrus_500_2,40946.496094,2.152344,-2.417549,2.506637,0.250664,11.961238,6.316714,0.210557,160.038134,...,1,500,2,0,nnet,1,1.312110,4.691844,1.312110,4.691844
3,MWH_Runway04_night_cirrus_500_3,40947.519531,3.175781,-1.057059,3.119214,0.311921,6.168364,4.711604,0.157053,165.723392,...,1,500,3,0,nnet,1,1.312110,4.691844,1.312110,4.691844
4,MWH_Runway04_night_cirrus_500_4,40948.535156,4.191406,-1.990558,3.551921,0.355192,7.549158,2.804450,0.093482,171.531987,...,1,500,4,0,nnet,1,1.312110,4.691844,1.312110,4.691844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167661,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,0,999,209,3,nnet,499,-0.981114,-3.840276,-1.598571,-1.857308
167662,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,0,999,210,3,nnet,499,-0.981114,-3.840276,-1.598571,-1.857308
167663,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,0,999,211,3,nnet,499,-0.981114,-3.840276,-1.598571,-1.857308
167664,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,0,999,212,3,nnet,499,-0.981114,-3.840276,-1.598571,-1.857308


In [2]:
df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p1_processed_v1.pkl'))

df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,0,0,0,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,0,0,1,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,0,0,2,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,0,0,3,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,0,0,4,0,nnet,1,-0.865126,-3.389264,-0.865126,-3.389264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174433,MWH_Runway04_afternoon_broken_800_212,2544.641846,220.941650,-0.273577,-1.813623,-0.181362,2.566364,-1.562173,-0.052072,1314.666709,...,3,800,212,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174434,MWH_Runway04_afternoon_broken_800_213,2545.691895,221.991699,-0.298727,-1.961441,-0.196144,0.624838,-2.581139,-0.086038,1320.460157,...,3,800,213,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174435,MWH_Runway04_afternoon_broken_800_214,2546.757324,223.057129,-1.084329,-2.186303,-0.218630,-1.580742,-3.104577,-0.103486,1326.287263,...,3,800,214,3,nnet,797,2.192799,5.911479,-2.287287,0.222896
174436,MWH_Runway04_afternoon_broken_800_215,2547.810791,224.110596,-1.193321,-2.422252,-0.242225,-8.462737,-2.563973,-0.085466,1331.995826,...,3,800,215,3,nnet,797,2.192799,5.911479,-2.287287,0.222896


In [28]:
groups = df_test.groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day_first', 'cloud_type_first'])
# groups = df.groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])
num_groups = len(groups)
num_episodes = df_test['episode'].nunique()

if num_groups == num_episodes:
    print('The number of groups is the same as the number of episodes')
else:
    print(f'The number of groups ({num_groups}) is different from the number of episodes ({num_episodes})')

# groups.filter(lambda x: len(x) > 1)['episode'].unique().tolist()
# groups.filter(lambda x: len(x) > 1).groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])['episode'].unique()

The number of groups is the same as the number of episodes


In [26]:
# ensure that all the values in the 'period_of_day' and 'cloud_type' columns are the same for each episode
# if not, set them to the most first value for that episode

# create a new dataframe with the first value of each episode for the 'period_of_day' and 'cloud_type' columns
df_first = df.groupby('episode')[['period_of_day', 'cloud_type']].first().reset_index()

df_test = df.copy()

# merge the df_first dataframe with the df dataframe
df_test = df_test.merge(df_first, on='episode', how='left', suffixes=('', '_first'))

df_test

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,0,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405,0,1
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405,0,1
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,2,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405,0,1
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,3,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405,0,1
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,4,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,214,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670,2,0
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,215,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670,2,0
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,216,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670,2,0
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,217,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670,2,0


In [30]:
df = df_test

In [24]:
df = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_2nd_batch_processed_v1.pkl'))

df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,cloud_type,job_id,step_num,section,model_type,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,1,0,0,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1,0,1,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,1,0,2,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,1,0,3,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,1,0,4,0,nnet,1297,0.308820,-2.437405,0.308820,-2.437405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,0,999,214,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,0,999,215,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,0,999,216,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,0,999,217,3,nnet,2293,2.677857,-2.329739,11.112828,0.189670


In [15]:
df.to_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_p1_processed_v2.pkl'))

In [107]:
df['timestamp'] = pd.to_datetime(df['step_num'], unit='s')

df


DatetimeIndex(['1970-01-01 00:00:00', '1970-01-01 00:00:01',
               '1970-01-01 00:00:02', '1970-01-01 00:00:03',
               '1970-01-01 00:00:04', '1970-01-01 00:00:05',
               '1970-01-01 00:00:06', '1970-01-01 00:00:07',
               '1970-01-01 00:00:08', '1970-01-01 00:00:09',
               ...
               '1970-01-01 00:03:29', '1970-01-01 00:03:30',
               '1970-01-01 00:03:31', '1970-01-01 00:03:32',
               '1970-01-01 00:03:33', '1970-01-01 00:03:34',
               '1970-01-01 00:03:35', '1970-01-01 00:03:36',
               '1970-01-01 00:03:37', '1970-01-01 00:03:38'],
              dtype='datetime64[ns]', name='timestamp', length=435470, freq=None)

In [110]:
df.groupby('episode').extract_group(1).asfreq('s')


AttributeError: 'DataFrameGroupBy' object has no attribute 'extract_group'

In [32]:
CTE_THRESHOLD = 3.0

df['cte_threshold'] = CTE_THRESHOLD

df['safety_metric_cte'] = df['actual_distance_to_centerline_meters'].abs() - df['cte_threshold']

# df['safety_metric_cte'] = df['safety_metric_cte'].round(2)
df['safety_metric_cte'] = df['safety_metric_cte']

df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:00,3.0,-2.691180
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:01,3.0,-2.726403
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:02,3.0,-2.337357
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:03,3.0,-1.455808
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:04,3.0,-0.227335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [4]:
#  read each df in the act_dataset_dict_v1.pkl dictionary
import pickle
from pathlib import Path
import pandas as pd

CTE_THRESHOLD = 5.0


# read the act_dataset_dict_v1.pkl file
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v1.pkl'), 'rb') as f:
    act_dataset_dict = pickle.load(f)

# concat all df into data_df
data_df = pd.concat(act_dataset_dict.values(), axis=0)
data_df


Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
1970-01-01 00:00:00,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1970-01-01 00:00:01,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
1970-01-01 00:00:02,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
1970-01-01 00:00:03,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
1970-01-01 00:00:04,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:03:34,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
1970-01-01 00:03:35,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
1970-01-01 00:03:36,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
1970-01-01 00:03:37,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [5]:
data_df['cte_threshold'] = CTE_THRESHOLD
# recalculated the safety_metric_cte column
data_df['safety_metric_cte'] = data_df['actual_distance_to_centerline_meters'].abs() - data_df['cte_threshold']

# normalize the safety_metric_cte column using z-score normalization
data_df['safety_metric_cte_normalized_z_score'] = (data_df['safety_metric_cte'] - data_df['safety_metric_cte'].mean()) / data_df['safety_metric_cte'].std()
# normalize estimated_distance_to_centerline_meters column using z-score normalization
data_df['estimated_distance_to_centerline_meters_normalized_z_score'] = (data_df['estimated_distance_to_centerline_meters'] - data_df['estimated_distance_to_centerline_meters'].mean()) / data_df['estimated_distance_to_centerline_meters'].std()
# normalize actual_distance_to_centerline_meters column using z-score normalization
data_df['actual_distance_to_centerline_meters_normalized_z_score'] = (data_df['actual_distance_to_centerline_meters'] - data_df['actual_distance_to_centerline_meters'].mean()) / data_df['actual_distance_to_centerline_meters'].std()
# normalize the actual_heading_error_degrees column using z-score normalization
data_df['actual_heading_error_degrees_normalized_z_score'] = (data_df['actual_heading_error_degrees'] - data_df['actual_heading_error_degrees'].mean()) / data_df['actual_heading_error_degrees'].std()
# normalize the estimated_heading_error_degrees column using z-score normalization
data_df['estimated_heading_error_degrees_normalized_z_score'] = (data_df['estimated_heading_error_degrees'] - data_df['estimated_heading_error_degrees'].mean()) / data_df['estimated_heading_error_degrees'].std()

data_df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte,safety_metric_cte_normalized_z_score,estimated_distance_to_centerline_meters_normalized_z_score,actual_distance_to_centerline_meters_normalized_z_score,actual_heading_error_degrees_normalized_z_score,estimated_heading_error_degrees_normalized_z_score
1970-01-01 00:00:00,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,2,0,1970-01-01 00:00:00,5.0,-4.134874,-1.219922,-1.145133,-0.905220,-1.267205,-1.260096
1970-01-01 00:00:01,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,2,0,1970-01-01 00:00:01,5.0,-4.024196,-1.195263,-1.371716,-0.923158,0.069816,-0.999225
1970-01-01 00:00:02,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,2,0,1970-01-01 00:00:02,5.0,-4.320128,-1.261196,-1.223942,-0.875196,2.235631,-1.245379
1970-01-01 00:00:03,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,2,0,1970-01-01 00:00:03,5.0,-4.913312,-1.393356,-0.487332,-0.750956,4.601003,-0.604169
1970-01-01 00:00:04,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,2,0,1970-01-01 00:00:04,5.0,-3.620443,-1.105308,0.380892,-0.541416,6.349568,0.163675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:03:34,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,2,0,1970-01-01 00:03:34,5.0,6.165172,1.074904,-0.551751,1.044575,-0.689870,-0.535890
1970-01-01 00:03:35,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,2,0,1970-01-01 00:03:35,5.0,6.109047,1.062400,-0.998938,1.035479,0.101973,0.557720
1970-01-01 00:03:36,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,2,0,1970-01-01 00:03:36,5.0,6.224698,1.088167,-1.356806,1.054223,0.647968,1.877171
1970-01-01 00:03:37,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,2,0,1970-01-01 00:03:37,5.0,6.362616,1.118895,-1.305943,1.076576,0.200231,1.498462


In [13]:
ts_dict = {i: data_df.groupby(data_df.episode).get_group(i) for i in data_df.episode.unique()}

In [14]:
# save the ts_dict dictionary to a pickle file
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v2.pkl'), 'wb') as f:
    pickle.dump(ts_dict, f)

In [16]:
# read static_features_df.pkl file

static_features_df = pd.read_pickle(Path.cwd().joinpath('data', 'static_features_v1.pkl'))

static_features_df

Unnamed: 0_level_0,ep_start_he_act,ep_start_cte_act,period_of_day_first,cloud_type_first
episode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-3.389264,-0.865126,2,0
2,-6.469483,1.644421,2,2
11,-4.569855,-1.449905,2,0
12,1.691482,1.348586,1,2
13,-0.928937,0.553173,1,0
...,...,...,...,...
1989,-1.309716,0.154888,2,0
1990,3.871852,2.969900,1,2
1991,-5.669751,2.183094,2,1
1992,-6.849759,0.088278,0,3


In [17]:
static_features_df_normalized_z_score = static_features_df.copy()

# normalize the static features using z-score normalization
static_features_df_normalized_z_score['ep_start_cte_act'] = (static_features_df_normalized_z_score['ep_start_cte_act'] - static_features_df_normalized_z_score['ep_start_cte_act'].mean()) / static_features_df_normalized_z_score['ep_start_cte_act'].std()
static_features_df_normalized_z_score['ep_start_he_act'] = (static_features_df_normalized_z_score['ep_start_he_act'] - static_features_df_normalized_z_score['ep_start_he_act'].mean()) / static_features_df_normalized_z_score['ep_start_he_act'].std()

pd.to_pickle(static_features_df_normalized_z_score, Path.cwd().joinpath('data', 'static_features_v1_normalized_z_score.pkl'))

static_features_df_normalized_z_score

Unnamed: 0_level_0,ep_start_he_act,ep_start_cte_act,period_of_day_first,cloud_type_first
episode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.586342,-0.499461,2,0
2,-1.119345,0.947787,2,2
11,-0.790632,-0.836702,2,0
12,0.292834,0.777180,1,2
13,-0.160605,0.318467,1,0
...,...,...,...,...
1989,-0.226495,0.088778,2,0
1990,0.670127,1.712187,1,2
1991,-0.980959,1.258438,2,1
1992,-1.185148,0.050364,0,3


In [18]:
static_features_df_normalized = static_features_df.copy()

static_features_df_normalized['ep_start_cte_act'] = static_features_df_normalized['ep_start_cte_act']/10.0
static_features_df_normalized['ep_start_he_act'] = static_features_df_normalized['ep_start_he_act']/90.0

pd.to_pickle(static_features_df_normalized, Path.cwd().joinpath('data', 'static_features_v1_normalized.pkl'))
static_features_df_normalized

Unnamed: 0_level_0,ep_start_he_act,ep_start_cte_act,period_of_day_first,cloud_type_first
episode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.037658,-0.086513,2,0
2,-0.071883,0.164442,2,2
11,-0.050776,-0.144991,2,0
12,0.018794,0.134859,1,2
13,-0.010322,0.055317,1,0
...,...,...,...,...
1989,-0.014552,0.015489,2,0
1990,0.043021,0.296990,1,2
1991,-0.062997,0.218309,2,1
1992,-0.076108,0.008828,0,3


In [1]:
#  read each df in the act_dataset_dict_v1.pkl dictionary
import pickle
from pathlib import Path
import pandas as pd

CTE_THRESHOLD = 5.0


# read the act_dataset_dict_v1.pkl file
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v1.pkl'), 'rb') as f:
    act_dataset_dict = pickle.load(f)

# change the cte_threshold to 5.0
for key in act_dataset_dict.keys():
    act_dataset_dict[key]['cte_threshold'] = CTE_THRESHOLD
    # recalculated the safety_metric_cte column
    act_dataset_dict[key]['safety_metric_cte'] = act_dataset_dict[key]['actual_distance_to_centerline_meters'].abs() - act_dataset_dict[key]['cte_threshold']

    # normalize the safety_metric_cte column using z-score normalization
    act_dataset_dict[key]['safety_metric_cte_normalized_z_score'] = (act_dataset_dict[key]['safety_metric_cte'] - act_dataset_dict[key]['safety_metric_cte'].mean()) / act_dataset_dict[key]['safety_metric_cte'].std()
    # normalize estimated_distance_to_centerline_meters column using z-score normalization
    act_dataset_dict[key]['estimated_distance_to_centerline_meters_normalized_z_score'] = (act_dataset_dict[key]['estimated_distance_to_centerline_meters'] - act_dataset_dict[key]['estimated_distance_to_centerline_meters'].mean()) / act_dataset_dict[key]['estimated_distance_to_centerline_meters'].std()
    # normalize actual_distance_to_centerline_meters column using z-score normalization
    act_dataset_dict[key]['actual_distance_to_centerline_meters_normalized_z_score'] = (act_dataset_dict[key]['actual_distance_to_centerline_meters'] - act_dataset_dict[key]['actual_distance_to_centerline_meters'].mean()) / act_dataset_dict[key]['actual_distance_to_centerline_meters'].std()
    # normalize the actual_heading_error_degrees column using z-score normalization
    act_dataset_dict[key]['actual_heading_error_degrees_normalized_z_score'] = (act_dataset_dict[key]['actual_heading_error_degrees'] - act_dataset_dict[key]['actual_heading_error_degrees'].mean()) / act_dataset_dict[key]['actual_heading_error_degrees'].std()
    # normalize the estimated_heading_error_degrees column using z-score normalization
    act_dataset_dict[key]['estimated_heading_error_degrees_normalized_z_score'] = (act_dataset_dict[key]['estimated_heading_error_degrees'] - act_dataset_dict[key]['estimated_heading_error_degrees'].mean()) / act_dataset_dict[key]['estimated_heading_error_degrees'].std()
    
# save the act_dataset_dict_v2.pkl file
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v2.pkl'), 'wb') as f:
    pickle.dump(act_dataset_dict, f)



In [3]:
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v1.pkl'), 'rb') as f:
    act_dataset_dict = pickle.load(f)

list(act_dataset_dict.values())[0].columns

Index(['image_filename', 'absolute_time_GMT_seconds', 'relative_time_seconds',
       'estimated_distance_to_centerline_meters',
       'actual_distance_to_centerline_meters',
       'actual_distance_to_centerline_NORMALIZED',
       'estimated_heading_error_degrees', 'actual_heading_error_degrees',
       'actual_heading_error_NORMALIZED', 'downtrack_position_meters',
       'downtrack_position_NORMALIZED', 'period_of_day', 'cloud_type',
       'job_id', 'step_num', 'section', 'model_type', 'episode',
       'ep_start_cte_act', 'ep_start_he_act', 'sec_start_cte_act',
       'sec_start_he_act', 'period_of_day_first', 'cloud_type_first',
       'timestamp', 'cte_threshold', 'safety_metric_cte'],
      dtype='object')

In [64]:
groups = b1_p2_df.groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])
num_groups = len(groups)
num_episodes = b1_p2_df['episode'].nunique()

if num_groups == num_episodes:
    print('The number of groups is the same as the number of episodes')
else:
    print(f'The number of groups ({num_groups}) is different from the number of episodes ({num_episodes})')

# groups.filter(lambda x: len(x) > 1)['episode'].unique().tolist()
# groups.filter(lambda x: len(x) > 1).groupby(['ep_start_he_act', 'ep_start_cte_act', 'period_of_day', 'cloud_type'])['episode'].unique()

The number of groups (502) is different from the number of episodes (499)


In [34]:
# get the number of positive values in safety_metric_cte
df[df['safety_metric_cte'] < 0]['safety_metric_cte'].count()

92436

In [35]:
# Check if there is any missing values in df

df.isnull().values.any()

False

In [48]:
# ensure that all the values in the 'period_of_day' and 'cloud_type' columns are the same for each episode
# if not, set them to the most first value for that episode

# create a new dataframe with the first value of each episode for the 'period_of_day' and 'cloud_type' columns
df_first = df.groupby('episode')[['period_of_day', 'cloud_type']].first().reset_index()

df_test = df.copy()

# merge the df_first dataframe with the df dataframe
df_test = df_test.merge(df_first, on='episode', how='left', suffixes=('', '_first'))

df_test


Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,timestamp,cte_threshold,safety_metric_cte,period_of_day_first,cloud_type_first
0,MWH_Runway04_night_cirrus_500_0,40944.414062,0.070312,-4.975625,1.312110,0.131211,4.294211,4.691844,0.156395,149.288206,...,1,1.312110,4.691844,1.312110,4.691844,1970-01-01 00:00:00,3.0,-1.687890,2,1
1,MWH_Runway04_night_cirrus_500_1,40945.441406,1.097656,-4.251509,1.863198,0.186320,10.024249,5.930713,0.197690,154.650466,...,1,1.312110,4.691844,1.312110,4.691844,1970-01-01 00:00:01,3.0,-1.136802,2,1
2,MWH_Runway04_night_cirrus_500_2,40946.496094,2.152344,-2.417549,2.506637,0.250664,11.961238,6.316714,0.210557,160.038134,...,1,1.312110,4.691844,1.312110,4.691844,1970-01-01 00:00:02,3.0,-0.493363,2,1
3,MWH_Runway04_night_cirrus_500_3,40947.519531,3.175781,-1.057059,3.119214,0.311921,6.168364,4.711604,0.157053,165.723392,...,1,1.312110,4.691844,1.312110,4.691844,1970-01-01 00:00:03,3.0,0.119214,2,1
4,MWH_Runway04_night_cirrus_500_4,40948.535156,4.191406,-1.990558,3.551921,0.355192,7.549158,2.804450,0.093482,171.531987,...,1,1.312110,4.691844,1.312110,4.691844,1970-01-01 00:00:04,3.0,0.551921,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109286,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,499,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:29,3.0,-2.482921,1,0
109287,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,499,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:30,3.0,-2.449016,1,0
109288,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,499,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:31,3.0,-2.372977,1,0
109289,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,499,-0.981114,-3.840276,-1.598571,-1.857308,1970-01-01 00:03:32,3.0,-2.277358,1,0


In [2]:
# df = df_test
df_1 = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_processed_v2.pkl'))
df_2 = pd.read_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_2nd_batch_processed_v2.pkl'))

In [4]:
df_2

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:00,3.0,-2.691180
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:01,3.0,-2.726403
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:02,3.0,-2.337357
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:03,3.0,-1.455808
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,1297,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:04,3.0,-0.227335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,2293,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [5]:
df_2['episode'] = df_2['episode'] - 1296
df_2

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,1,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:00,3.0,-2.691180
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,1,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:01,3.0,-2.726403
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,1,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:02,3.0,-2.337357
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,1,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:03,3.0,-1.455808
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,1,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:04,3.0,-0.227335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,997,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,997,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,997,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,997,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [6]:
# concat df_1 and df_2 dataframes and reset their index.
# Ensure that the episode numbers are unique

num_episodes_b1 = df_1['episode'].nunique()

df_2['episode'] = df_2['episode'] + num_episodes_b1

df_2

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_morning_cirrus_0_0,47078.960938,0.093750,-7.626698,0.308820,0.030882,0.096835,-2.437405,-0.081247,149.681252,...,997,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:00,3.0,-2.691180
1,MWH_Runway04_morning_cirrus_0_1,47080.000000,1.132812,-7.543778,0.273597,0.027360,-2.180710,0.671607,0.022387,154.976130,...,997,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:01,3.0,-2.726403
2,MWH_Runway04_morning_cirrus_0_2,47081.046875,2.179688,-5.622612,0.662643,0.066264,2.350708,6.090438,0.203015,160.510583,...,997,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:02,3.0,-2.337357
3,MWH_Runway04_morning_cirrus_0_3,47082.093750,3.226562,-2.442633,1.544192,0.154419,8.348513,10.387065,0.346236,166.129236,...,997,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:03,3.0,-1.455808
4,MWH_Runway04_morning_cirrus_0_4,47083.156250,4.289062,-3.475306,2.772665,0.277266,14.092685,11.247550,0.374918,172.155532,...,997,0.308820,-2.437405,0.308820,-2.437405,0,1,1970-01-01 00:00:04,3.0,-0.227335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217585,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
217586,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
217587,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
217588,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [8]:
df = pd.concat([df_1, df_2], axis=0)
df = df.reset_index(drop=True)

df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435465,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
435466,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
435467,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
435468,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [11]:
df.to_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_v2.pkl'))

Time axis: `'timestamp'`

Target: `'safety_metric_cte'`

Static Categorical Features: `'period_of_day_first'`, `'cloud_type_first'`

Static Real Features: `'ep_start_cte_act'`, `'ep_start_he_act'`

Dynamic Real Features: `'cte'`, `'he'`

Item id: `'episode'`

In [52]:
static_feature_columns = [
        "ep_start_he_act",
        "ep_start_cte_act",
        "period_of_day_first",
        "cloud_type_first",
    ]
item_id = "episode"
other_static_features = (
                df[[item_id] + static_feature_columns]
                .drop_duplicates()
                .set_index(item_id)
            )
other_static_features
# find the duplicate indexes

other_static_features[other_static_features.index.duplicated(keep=False)]

# other_static_features[other_static_features['episode'].duplicated()]['episode'].unique().tolist()

# df['episode'].nunique()

Unnamed: 0_level_0,ep_start_he_act,ep_start_cte_act,period_of_day_first,cloud_type_first
episode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [12]:
df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435465,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
435466,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
435467,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
435468,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [44]:
df.to_pickle(Path.cwd().joinpath('data', 'act_dataset_parts', 'data_df_1st_batch_processed_v2.pkl'))
df

Unnamed: 0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
0,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
2,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
3,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
4,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217875,MWH_Runway04_afternoon_clear_999_209,292.521912,215.326210,2.379629,-0.517079,-0.051708,-4.045362,-0.816994,-0.027233,1318.782559,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:29,3.0,-2.482921
217876,MWH_Runway04_afternoon_clear_999_210,293.559906,216.364204,2.937276,-0.550984,-0.055098,-4.501310,-1.146245,-0.038208,1324.604870,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:30,3.0,-2.449016
217877,MWH_Runway04_afternoon_clear_999_211,294.627441,217.431740,2.653418,-0.627023,-0.062702,-7.868939,-1.561669,-0.052056,1330.485892,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:31,3.0,-2.372977
217878,MWH_Runway04_afternoon_clear_999_212,295.632751,218.437050,2.740055,-0.722642,-0.072264,-4.798201,-1.246892,-0.041563,1336.086013,...,996,-0.981114,-3.840276,-1.598571,-1.857308,1,0,1970-01-01 00:03:32,3.0,-2.277358


In [9]:
df['cloud_type_first'] = df['cloud_type_first'].astype('category')
df['period_of_day_first'] = df['period_of_day_first'].astype('category')

In [21]:
df['period_of_day_first'].unique()

[2, 1, 0]
Categories (3, int64): [0, 1, 2]

In [74]:
from gluonts.dataset.pandas import PandasDataset

ds_test = PandasDataset.from_long_dataframe(
    dataframe=df,
    target="safety_metric_cte",
    timestamp="timestamp",
    # timestamp="step_num",
    item_id="episode",
    static_feature_columns=[
        "ep_start_he_act",
        "ep_start_cte_act",
        "period_of_day_first",
        "cloud_type_first",
    ],
    feat_dynamic_real=[
        "estimated_distance_to_centerline_meters",
        "estimated_heading_error_degrees",
    ],
    # unchecked=True
    )

In [75]:
ds_test

PandasDataset<size=1993, freq=S, num_feat_dynamic_real=2, num_past_feat_dynamic_real=0, num_feat_static_real=2, num_feat_static_cat=2, static_cardinalities=[3. 4.]>

In [15]:
with open(Path.cwd().joinpath('data', 'act_dataset_gts_v1.pkl'), 'wb') as f:
    pickle.dump(ds, f)

In [17]:
with open(Path.cwd().joinpath('data', 'act_dataset_gts_v1.pkl'), 'rb') as f:
    ds = pickle.load(f)

ds

PandasDataset<size=1993, freq=S, num_feat_dynamic_real=2, num_past_feat_dynamic_real=0, num_feat_static_real=2, num_feat_static_cat=2, static_cardinalities=[3. 4.]>

In [20]:
len(ds)

1993

In [104]:
df.asfreq('S').index

ValueError: cannot reindex on an axis with duplicate labels

In [2]:
df = pd.read_pickle(r'data\act_dataframe_v2.pkl')

In [102]:
import gluonts.dataset.pandas as gpd

# gpd.is_uniform(pd.DatetimeIndex(df.groupby(df.episode).get_group(2).index))
# gpd.is_uniform(pd.DatetimeIndex(df.index))
# df.groupby(df.episode).get_group(2).index
freq = pd.infer_freq(df.timestamp)
type(freq)

NoneType

In [6]:
df.groupby(df.episode).ngroups

1993

In [7]:
ts_dict = {i: df.groupby(df.episode).get_group(i) for i in df.episode.unique()}

In [11]:
ts_dict[1993].columns

Index(['image_filename', 'absolute_time_GMT_seconds', 'relative_time_seconds',
       'estimated_distance_to_centerline_meters',
       'actual_distance_to_centerline_meters',
       'actual_distance_to_centerline_NORMALIZED',
       'estimated_heading_error_degrees', 'actual_heading_error_degrees',
       'actual_heading_error_NORMALIZED', 'downtrack_position_meters',
       'downtrack_position_NORMALIZED', 'period_of_day', 'cloud_type',
       'job_id', 'step_num', 'section', 'model_type', 'episode',
       'ep_start_cte_act', 'ep_start_he_act', 'sec_start_cte_act',
       'sec_start_he_act', 'period_of_day_first', 'cloud_type_first',
       'timestamp', 'cte_threshold', 'safety_metric_cte'],
      dtype='object')

In [12]:
for ts in ts_dict.values():
    ts.reset_index(drop=True, inplace=True)

In [23]:
freq = '1S'
for ts in ts_dict.values():
    time = pd.period_range(start="1970-01-01 00:00:00", periods=len(ts), freq=freq)
    ts.set_index(time, inplace=True)
    # # add name to the index
    # ts.index.name = 'timestamp'



In [57]:
with open(Path.cwd().joinpath('data', 'act_dataset_dict_v1.pkl'), 'wb') as f:
    pickle.dump(ts_dict, f)

In [26]:
ts_dict[1993].index.to_timestamp()
# ts_dict[1993].absolute_time_GMT_seconds.min()
# transform absolute time to datetime
# pd.to_datetime(ts_dict[1993]['absolute_time_GMT_seconds'], unit='s')


DatetimeIndex(['1970-01-01 00:00:00', '1970-01-01 00:00:01',
               '1970-01-01 00:00:02', '1970-01-01 00:00:03',
               '1970-01-01 00:00:04', '1970-01-01 00:00:05',
               '1970-01-01 00:00:06', '1970-01-01 00:00:07',
               '1970-01-01 00:00:08', '1970-01-01 00:00:09',
               ...
               '1970-01-01 00:03:29', '1970-01-01 00:03:30',
               '1970-01-01 00:03:31', '1970-01-01 00:03:32',
               '1970-01-01 00:03:33', '1970-01-01 00:03:34',
               '1970-01-01 00:03:35', '1970-01-01 00:03:36',
               '1970-01-01 00:03:37', '1970-01-01 00:03:38'],
              dtype='datetime64[ns]', length=219, freq='S')

In [29]:
# create a static_features dataframe that contains the static features for each episode
# the index of static_features is the episode number
# the columns of static_features are the static features
# the columns are 'ep_start_he_act', 'ep_start_cte_act', 'period_of_day_first', 'cloud_type_first'

static_features = df[['episode', 'ep_start_he_act', 'ep_start_cte_act', 'period_of_day_first', 'cloud_type_first']].drop_duplicates().set_index('episode')

static_features

Unnamed: 0_level_0,ep_start_he_act,ep_start_cte_act,period_of_day_first,cloud_type_first
episode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-3.389264,-0.865126,2,0
2,-6.469483,1.644421,2,2
11,-4.569855,-1.449905,2,0
12,1.691482,1.348586,1,2
13,-0.928937,0.553173,1,0
...,...,...,...,...
1989,-1.309716,0.154888,2,0
1990,3.871852,2.969900,1,2
1991,-5.669751,2.183094,2,1
1992,-6.849759,0.088278,0,3


In [56]:
static_features.to_pickle(Path.cwd().joinpath('data', 'static_features_v1.pkl'))

In [73]:
from gluonts.dataset.pandas import PandasDataset

ds_test = PandasDataset(
    dataframes=ts_dict,
    target="safety_metric_cte",
    static_features=static_features,
    # feat_dynamic_real=[
    #     "estimated_distance_to_centerline_meters",
    #     "estimated_heading_error_degrees",
    # ],
    past_feat_dynamic_real=[
        "estimated_distance_to_centerline_meters",
        "estimated_heading_error_degrees",
    ],
    # unchecked=True
    )

In [74]:
ds_test

PandasDataset<size=1993, freq=S, num_feat_dynamic_real=0, num_past_feat_dynamic_real=2, num_feat_static_real=2, num_feat_static_cat=2, static_cardinalities=[3. 4.]>

In [33]:
ds_test

PandasDataset<size=1993, freq=S, num_feat_dynamic_real=2, num_past_feat_dynamic_real=0, num_feat_static_real=2, num_feat_static_cat=2, static_cardinalities=[3. 4.]>

In [55]:
with open(Path.cwd().joinpath('data', 'act_dataset_gts_v2.pkl'), 'wb') as f:
    pickle.dump(ds_test, f)

TypeError: cannot pickle 'dict_items' object

# Dataset split

In [75]:
from gluonts.dataset.split import split

train_and_val_ds, test_gen = split(ds_test, offset=-40)
# train_and_val_ds, test_gen = split(ds_test, date=pd.Period("1970-01-01 00:02:40", freq="1S"))

In [63]:
next(iter(test_data.input))

{'start': Period('1970-01-01 00:02:52', 'S'),
 'target': array([8.29921748, 8.31906627, 8.19722611, 8.02990983, 7.95817604,
        8.06494781, 8.23496546, 8.31496717, 8.25719377]),
 'item_id': 1,
 'feat_static_cat': array([2., 0.], dtype=float32),
 'feat_static_real': array([-3.3892639, -0.8651258], dtype=float32),
 'feat_dynamic_real': array([[-7.30774809, -7.63096498, -6.75447066, -5.01340053, -5.59287717,
         -7.36951753, -7.37038094, -7.54480849, -6.99332995, -5.05708935,
         -5.83817577, -7.15848906],
        [17.95104415, 17.77939309,  7.80503983,  1.28227178,  6.57823778,
         18.24048968, 18.69387777, 18.23273829, 11.11642467,  2.66267722,
          7.68512171, 17.85952697]])}

In [54]:
# next(iter(train_and_val_ds)).keys()
next(iter(train_and_val_ds))['feat_static_real']

array([-3.3892639, -0.8651258], dtype=float32)

In [76]:
test_data = test_gen.generate_instances(
    prediction_length=3,
    windows=38,
    distance=1,
    max_history=9
)

In [72]:
import json

with open(Path.cwd().joinpath('data', 'test_data_v1.json'), 'w') as f:
    json_obj = json.dumps(test_data)
    f.write(json_obj)

test_data

TypeError: Object of type TestData is not JSON serializable

In [77]:
from gluonts.evaluation.backtest import _to_dataframe

test_data_it = map(_to_dataframe, test_data)
test_list = list(test_data_it)

test_list

[                            0
 1970-01-01 00:02:52  8.299217
 1970-01-01 00:02:53  8.319066
 1970-01-01 00:02:54  8.197226
 1970-01-01 00:02:55  8.029910
 1970-01-01 00:02:56  7.958176
 1970-01-01 00:02:57  8.064948
 1970-01-01 00:02:58  8.234965
 1970-01-01 00:02:59  8.314967
 1970-01-01 00:03:00  8.257194
 1970-01-01 00:03:01  8.127925
 1970-01-01 00:03:02  8.045867
 1970-01-01 00:03:03  8.097417,
                             0
 1970-01-01 00:02:53  8.319066
 1970-01-01 00:02:54  8.197226
 1970-01-01 00:02:55  8.029910
 1970-01-01 00:02:56  7.958176
 1970-01-01 00:02:57  8.064948
 1970-01-01 00:02:58  8.234965
 1970-01-01 00:02:59  8.314967
 1970-01-01 00:03:00  8.257194
 1970-01-01 00:03:01  8.127925
 1970-01-01 00:03:02  8.045867
 1970-01-01 00:03:03  8.097417
 1970-01-01 00:03:04  8.202750,
                             0
 1970-01-01 00:02:54  8.197226
 1970-01-01 00:02:55  8.029910
 1970-01-01 00:02:56  7.958176
 1970-01-01 00:02:57  8.064948
 1970-01-01 00:02:58  8.234965
 1970-

In [68]:
len(test_list)

75734

In [85]:
df

Unnamed: 0_level_0,image_filename,absolute_time_GMT_seconds,relative_time_seconds,estimated_distance_to_centerline_meters,actual_distance_to_centerline_meters,actual_distance_to_centerline_NORMALIZED,estimated_heading_error_degrees,actual_heading_error_degrees,actual_heading_error_NORMALIZED,downtrack_position_meters,...,episode,ep_start_cte_act,ep_start_he_act,sec_start_cte_act,sec_start_he_act,period_of_day_first,cloud_type_first,timestamp,cte_threshold,safety_metric_cte
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-01 00:00:00,MWH_Runway04_night_clear_0_0,17702.781250,0.078125,-7.236064,-0.865126,-0.086513,-3.206775,-3.389264,-0.112975,149.427741,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:00,3.0,-2.134874
1970-01-01 00:00:01,MWH_Runway04_night_clear_0_1,17703.806641,1.103516,-7.916872,-0.975804,-0.097580,-1.128353,-0.173505,-0.005783,154.553073,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:01,3.0,-2.024196
1970-01-01 00:00:02,MWH_Runway04_night_clear_0_2,17704.847656,2.144531,-7.472859,-0.679872,-0.067987,-3.089520,5.035644,0.167855,159.895690,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:02,3.0,-2.320128
1970-01-01 00:00:03,MWH_Runway04_night_clear_0_3,17705.855469,3.152344,-5.259592,0.086688,0.008669,2.019156,10.724761,0.357492,164.951010,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:03,3.0,-2.913312
1970-01-01 00:00:04,MWH_Runway04_night_clear_0_4,17706.945312,4.242188,-2.650868,1.379557,0.137956,8.136747,14.930354,0.497678,170.304711,...,1,-0.865126,-3.389264,-0.865126,-3.389264,2,0,1970-01-01 00:00:04,3.0,-1.620443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:03:34,MWH_Runway04_night_clear_999_214,37190.046875,220.082031,-5.453149,11.165172,1.116517,2.563150,-2.000676,-0.066689,1315.119848,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:34,3.0,8.165172
1970-01-01 00:03:35,MWH_Runway04_night_clear_999_215,37191.148438,221.183594,-6.796797,11.109047,1.110905,11.276200,-0.096162,-0.003205,1321.334900,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:35,3.0,8.109047
1970-01-01 00:03:36,MWH_Runway04_night_clear_999_216,37192.148438,222.183594,-7.872072,11.224698,1.122470,21.788580,1.217048,0.040568,1326.781328,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:36,3.0,8.224698
1970-01-01 00:03:37,MWH_Runway04_night_clear_999_217,37193.164062,223.199219,-7.719244,11.362616,1.136262,18.771313,0.140166,0.004672,1332.525011,...,1993,2.677857,-2.329739,11.112828,0.189670,2,0,1970-01-01 00:03:37,3.0,8.362616


In [93]:
df.groupby(df['episode']).ngroups

1993