# Preprocessing the data

In [56]:
import pandas as pd
import numpy as np
from tsai.all import SlidingWindow
from sklearn.model_selection import train_test_split

## Loading in the data

In [20]:
# Read the source dataset from its parquet file into a DataFrame.
main_avg_df = pd.read_parquet(path='main_avg_dataset1.parquet')

## Create new columns that are shifted (label columns)

In [21]:
# Display the column labels of the loaded frame.
main_avg_df.columns

Index(['lon', 'lat', 'time', 'lev', 'Ls', 'MY', 'ps', 'tsurf', 'co2ice',
       'dustcol', 'u', 'v', 'temp'],
      dtype='object')

### Select columns to be shifted

In [22]:
# Columns for which a shifted '<col>_shift' label column is created.
shift_cols = [
    'dustcol',
    'u',
    'v',
    'tsurf',
]

### Columns are shifted 84 places, which is equivalent to 7 sols (one Martian week)

In [23]:
# shift the selected colunms by 84 
for col in shift_cols:
    main_avg_df[f'{col}_shift'] = main_avg_df[col].shift(84)

In [24]:
# drop the first 84 rows
main_avg_df = main_avg_df.iloc[84:]


### Select columns to be normalized

In [25]:
# Columns for which a min-max scaled '<col>_norm' column is created.
norm_cols = [
    'ps',
    'tsurf',
    'dustcol',
    'u',
    'v',
    'temp',
]

### Normalize columns

In [26]:
# Min-max scale every column listed in norm_cols and store the result in a
# new '<col>_norm' column: (value - column min) / (column max - column min).
for col in norm_cols:
    values = main_avg_df[col]
    low = values.min()
    high = values.max()
    main_avg_df[f'{col}_norm'] = (values - low) / (high - low)


### Export dataframe to parquet

In [27]:
# Persist the shifted and normalized frame as a parquet file.
main_avg_df.to_parquet(path='main_dataset_shifted_normalized1.parquet')

# Apply sliding window technique and split data into X and Y variables

In [50]:
# Reload the shifted and normalized dataset written by the export step.
main_df = pd.read_parquet(path='main_dataset_shifted_normalized1.parquet')

In [51]:
# Input features: the normalized columns passed to SlidingWindow as get_x.
x_vars = [
    'ps_norm',
    'dustcol_norm',
    'u_norm',
    'v_norm',
    'tsurf_norm',
]
# Targets: the shifted label columns passed to SlidingWindow as get_y.
y_vars = [
    'dustcol_shift',
    'u_shift',
    'v_shift',
    'tsurf_shift',
]

In [52]:
# Configure a 42-row sliding window that reads inputs from x_vars and
# targets from y_vars, then apply it to the frame to get the X and y arrays.
slicer = SlidingWindow(
    window_len=42,
    get_x=x_vars,
    get_y=y_vars,
)
X, y = slicer(main_df)

In [53]:
# Move axis 1 of X to the last position, keeping the other axes in order.
X = np.moveaxis(X, source=1, destination=-1)

In [54]:
# Display the dimensions of X after the axis move.
X.shape

(5994, 42, 5)

In [55]:
# Write the windowed inputs and their targets to .npy files.
np.save(file='X1.npy', arr=X)
np.save(file='y1.npy', arr=y)