In [17]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import BinaryEncoder

In [18]:
df = pd.read_csv('train_data.csv')
x_scalar = MinMaxScaler()
y_scalar = MinMaxScaler()
# Define the timestep
timestep = 5
# Define the fill value for the X array
fill_X = 0


In [19]:
# Drop any rows with missing values
df.dropna(inplace=True, how='any')
# Filter the dataframe based on conditions
def conditions(event):
    x = event["time_to_tca"].values
    return ((x.min()<2.0) & (x.max()>2.0) & (x.shape[0]>1))
df = df.groupby('event_id').filter(conditions)
# Convert categorical variable into dummy/indicator variables
df = pd.get_dummies(df)
# Encode the 'mission_id' column
encoder  = BinaryEncoder(cols=['mission_id'], drop_invariant=True)
df = encoder.fit_transform(df)
# Get the 'risk' value of the last row of each group
y_original = df.groupby('event_id')['risk'].apply(lambda x: x.iloc[-1]).values.reshape(-1,1)
# Fit and transform the 'risk' column
_ = y_scalar.fit(df["risk"].values.reshape(-1,1))
y = y_scalar.transform(y_original)
# Filter the dataframe based on 'time_to_tca' column
df = df.loc[df['time_to_tca']>3.0]
# Add a new column 'event_length' which is the count of 'event_id' in each group
df["event_length"] = df.groupby('event_id')['event_id'].transform('count')

# Fit and transform the dataframe
df = pd.DataFrame(x_scalar.fit_transform(df), columns=df.columns)





In [20]:
#transforming into 3d array
events = df['event_id'].nunique()
features = len(df.columns)
X = np.zeros((events, timestep, features))
X.fill(fill_X)
i = 0
# Convert the dataframe to a 3D array
def df_to_3darr(event):
    global X, i
    row = event.values.reshape(1, event.shape[0], event.shape[1])
    if timestep >= row.shape[1]:
        X[i:i+1, -row.shape[1]:, :] = row
    else:
        X[i:i+1, :,:] = row[:,-timestep:, :]
    i = i+1
    return event
df.groupby('event_id').apply(df_to_3darr)
# Remove the 'event_id' column
X= X[:,:,1:]
# Reshape the X array
X = X.reshape(X.shape[0], timestep*X.shape[2])
# Create new column names for the X array
Shifted_Columns = []
original_columns = list(df.columns)[1:]
for i in range(timestep):
    for col in original_columns:
        Shifted_Columns.append(col+"_t-"+str(i))
# Convert the X array to a dataframe
X = pd.DataFrame(X, columns=Shifted_Columns)
# Print the shapes of X and y
print(X.shape, y.shape)



(4667, 550) (5015, 1)


In [21]:
# Return X, y, x_scalar, y_scalar
X,y, x_scalar,y_scalar

(      time_to_tca_t-0  mission_id_0_t-0  mission_id_1_t-0  mission_id_2_t-0  \
 0            0.395146               0.0               0.0               0.0   
 1            0.531060               0.0               0.0               0.0   
 2            0.330410               0.0               0.0               0.0   
 3            0.406901               0.0               0.0               1.0   
 4            0.352432               0.0               0.0               0.0   
 ...               ...               ...               ...               ...   
 4662         0.780712               0.0               0.0               1.0   
 4663         0.350776               0.0               0.0               1.0   
 4664         0.330326               0.0               0.0               1.0   
 4665         0.000000               0.0               0.0               0.0   
 4666         0.367598               0.0               1.0               0.0   
 
       mission_id_3_t-0  mission_id_4_