In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import BinaryEncoder

In [5]:
# Raw training table; rows are later grouped into events through `event_id`.
df = pd.read_csv("train_data.csv")

# Settings for the per-event windows built further down.
timestep = 5  # number of time steps kept for every event
fill_X = 0    # value used to pre-fill the window array before events are written

# Unfitted scalers: one for the feature table, one for the target.
X_scaler, y_scaler = StandardScaler(), StandardScaler()

# List every column name of the raw table, one per line.
print(*df.columns, sep="\n")

event_id
time_to_tca
mission_id
risk
max_risk_estimate
max_risk_scaling
miss_distance
relative_speed
relative_position_r
relative_position_t
relative_position_n
relative_velocity_r
relative_velocity_t
relative_velocity_n
t_time_lastob_start
t_time_lastob_end
t_recommended_od_span
t_actual_od_span
t_obs_available
t_obs_used
t_residuals_accepted
t_weighted_rms
t_rcs_estimate
t_cd_area_over_mass
t_cr_area_over_mass
t_sedr
t_j2k_sma
t_j2k_ecc
t_j2k_inc
t_ct_r
t_cn_r
t_cn_t
t_crdot_r
t_crdot_t
t_crdot_n
t_ctdot_r
t_ctdot_t
t_ctdot_n
t_ctdot_rdot
t_cndot_r
t_cndot_t
t_cndot_n
t_cndot_rdot
t_cndot_tdot
c_object_type
c_time_lastob_start
c_time_lastob_end
c_recommended_od_span
c_actual_od_span
c_obs_available
c_obs_used
c_residuals_accepted
c_weighted_rms
c_rcs_estimate
c_cd_area_over_mass
c_cr_area_over_mass
c_sedr
c_j2k_sma
c_j2k_ecc
c_j2k_inc
c_ct_r
c_cn_r
c_cn_t
c_crdot_r
c_crdot_t
c_crdot_n
c_ctdot_r
c_ctdot_t
c_ctdot_n
c_ctdot_rdot
c_cndot_r
c_cndot_t
c_cndot_n
c_cndot_rdot
c_cndot_tdot
t

In [6]:
#'c_rcs_estimate' is removed first (noted as the empty column); only afterwards
#are rows holding at least one NaN discarded, so that column cannot wipe out rows.
df = df.drop(columns=['c_rcs_estimate']).dropna(how='any')

#Event filter: keep an event only if it has several rows and its time_to_tca values straddle 2.0
def conditions(event):
    """Tell whether one event (all rows sharing an event_id) should be kept.

    Parameters
    ----------
    event : pandas.DataFrame
        Rows of a single event; must contain a 'time_to_tca' column.

    Returns
    -------
    bool-like
        True only when the event has more than one row, its smallest
        time_to_tca is strictly below 2.0 and its largest is strictly above 2.0.
    """
    tca = event["time_to_tca"].values
    has_several_rows = tca.shape[0] > 1
    straddles_cutoff = (tca.min() < 2.0) & (tca.max() > 2.0)
    return straddles_cutoff & has_several_rows

#Keeping only the events accepted by `conditions`
df = df.groupby('event_id').filter(conditions)

#OHE for c_object_type (5 categories) -> 5 new features
#(without `columns=`, get_dummies encodes every object/category column it finds)
df = pd.get_dummies(df)

#Binary encoder for mission_id (19 categories) -> 5 new features
encoder = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)

#Getting y as a column vector of shape (n_events, 1): the 'risk' of the last row of each event.
#groupby sorts its keys, so row k of y belongs to the k-th smallest event_id.
y_original = df.groupby(["event_id"])["risk"].apply(lambda x: x.iloc[-1]).values.reshape(-1, 1)

#Scaling y
_ = y_scaler.fit(df["risk"].values.reshape(-1, 1)) #using the whole risk feature to scale the target 'y'
y = y_scaler.transform(y_original)

#Getting X as df (keeping only rows with time_to_tca > 2).
#.copy() turns the selection into an independent frame, so adding a column below
#is a plain assignment and does not raise SettingWithCopyWarning.
df = df.loc[df["time_to_tca"]>2].copy()

#`conditions` guarantees every kept event has a row above the cut-off, so X and y
#must describe exactly the same events; fail loudly if that ever stops being true.
assert df["event_id"].nunique() == y.shape[0], "X and y must cover the same events"

#Adding feature 'event_length' for counting how many instances each event has
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

#Scaling X: every column (event_id included) goes through X_scaler and the
#result is rebuilt as a frame with the same column names and a fresh index.
df = pd.DataFrame(X_scaler.fit_transform(df), columns=df.columns)






In [7]:
#Transforming X into a 3D-array: one slot per event, `timestep` rows each, one column per df column
features = df.shape[1] #columns
events = df["event_id"].nunique() #rows
#float64 array pre-filled with the padding value
X = np.full((events, timestep, features), fill_X, dtype=np.float64)
#position of the next event to be written into X
i = 0

def df_to_3darray(event):
    """Write one event into slot `i` of the global array `X`, then advance `i`.

    The event's rows are treated as a time series. If the series has more rows
    than `timestep`, only its last `timestep` rows are stored; otherwise the
    whole series is stored right-aligned, leaving the leading rows of the slot
    at whatever value `X` was pre-filled with.

    Relies on the module-level names `X`, `i` and `timestep`.

    Parameters
    ----------
    event : pandas.DataFrame
        Rows of a single event; its column count must match X.shape[2].

    Returns
    -------
    pandas.DataFrame
        The same `event` object, untouched.
    """
    global X, i
    #Event as a (1, n_rows, n_columns) block
    series = event.values[None, :, :]
    length = series.shape[1]
    if length > timestep:
        #Too long: keep the most recent `timestep` rows only
        X[i:i+1, :, :] = series[:, -timestep:, :]
    else:
        #Fits: right-align it inside the slot
        X[i:i+1, -length:, :] = series
    #Move on to the next slot of X
    i += 1
    return event

#Filling X one event at a time.
#df_to_3darray works through side effects (it writes into X and bumps the counter i), so it
#must run exactly once per event, in sorted event_id order, on groups that still contain the
#event_id column. Plain iteration over the groups guarantees all three; groupby.apply does not
#(depending on the pandas version it may call the function twice on the first group or hand
#it groups without the grouping column).
for event_key, event in df.groupby("event_id"):
    df_to_3darray(event)

#Dropping event_id to remove noise (located by name rather than assumed to be the first column)
event_id_position = df.columns.get_loc("event_id")
X = np.delete(X, event_id_position, axis=2)
#Reshaping again to 2D array but now events are filled
X = X.reshape(X.shape[0], timestep*X.shape[2])

#Naming shifted columns: the reshape lays each event out step by step, oldest step first,
#so names run from "_t-<timestep-1>" down to "_t-0", with all features repeated for every step.
#The step variable is deliberately not called `i`, so the event counter `i` is left untouched.
original_columns = [column for column in df.columns if column != "event_id"] #Dropping event_id
shifted_columns = [
    column+"_t-"+str(lag)
    for lag in range(timestep-1,-1,-1)
    for column in original_columns
]

#Creating df from reshape array and shifted column names
X = pd.DataFrame(X, columns=shifted_columns)

print(X.shape, y.shape)




(7311, 550) (7311, 1)


In [12]:
# Show the four products of the preprocessing as one tuple:
# feature frame, scaled target, and the two fitted scalers.
(X, y, X_scaler, y_scaler)

(      time_to_tca_t-4  mission_id_0_t-4  mission_id_1_t-4  mission_id_2_t-4  \
 0           -0.181972         -0.059159         -0.570809         -0.725025   
 1           -0.543427         -0.059159         -0.570809         -0.725025   
 2           -0.629641         -0.059159         -0.570809         -0.725025   
 3           -0.691877         -0.059159         -0.570809          1.379263   
 4           -0.542515         -0.059159         -0.570809          1.379263   
 ...               ...               ...               ...               ...   
 7306        -0.617534         -0.059159         -0.570809          1.379263   
 7307        -0.671409         -0.059159         -0.570809          1.379263   
 7308         0.000000          0.000000          0.000000          0.000000   
 7309        -0.579888         -0.059159          1.751900         -0.725025   
 7310        -0.635483         -0.059159         -0.570809          1.379263   
 
       mission_id_3_t-4  mission_id_4_