In [1]:
# =============================
# library
# =============================
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import polars as pl
from tqdm import tqdm
import pickle

In [2]:
# =============================
# constant
# =============================
# Storage roots. The raw CSV paths are derived from DATA_DIR so the data
# location is spelled out in exactly one place.
DATA_DIR = Path("../storage/leap/data")
OUTPUT_DIR = Path("../storage/leap/output")
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"

In [3]:
# =============================
# settings
# =============================
# Identifier of this feature-engineering run; it names the output folder
# and is reused as a file-name prefix when results are written.
fe = "131"
fe_dir = OUTPUT_DIR / "fe" / f"fe{fe}"
fe_save_dir = fe_dir / "save"
# Create both output folders up front (no error if they already exist).
for _out_dir in (fe_dir, fe_save_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Inputs written by earlier feature-engineering runs (fe129 and fe102).
# Both are built from OUTPUT_DIR so they follow the configured output root.
TRAIN_PATH1 = OUTPUT_DIR / "fe" / "fe129" / "fe129_train.parquet"
train_diff_sc_path = OUTPUT_DIR / "fe" / "fe102" / "fe102_train_diff_mean_std.pkl"

In [4]:
# =============================
# columns
# =============================
N_LEVELS = 60  # number of indexed columns generated for each variable family


def _level_columns(prefix, n_levels=N_LEVELS):
    """Return the column names ``{prefix}_0`` ... ``{prefix}_{n_levels - 1}``.

    Args:
        prefix: Column-name stem, e.g. ``"state_t"``.
        n_levels: How many indexed names to generate.

    Returns:
        A list of ``n_levels`` strings in ascending index order.
    """
    return [f"{prefix}_{i}" for i in range(n_levels)]


state_t = _level_columns("state_t")
state_q0001 = _level_columns("state_q0001")
state_q0002 = _level_columns("state_q0002")
state_q0003 = _level_columns("state_q0003")
state_u = _level_columns("state_u")
state_v = _level_columns("state_v")
# Un-indexed columns: one column per name, no numeric suffix.
other = ['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX',
       'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR',
       'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC',
       'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHLAND']
pbuf_ozone = _level_columns("pbuf_ozone")
pbuf_CH4 = _level_columns("pbuf_CH4")
pbuf_N2O = _level_columns("pbuf_N2O")
# Every column group in one list; `other` is last and is the only group
# whose length is not N_LEVELS.
cols_list = [state_t, state_q0001, state_q0002, state_q0003,
             state_u, state_v, pbuf_ozone, pbuf_CH4, pbuf_N2O, other]

In [5]:
# =============================
# main
# =============================
# Read the parquet file at TRAIN_PATH1 into a polars DataFrame. The columns
# named in `cols_list` are looked up in this frame by the cells below.
df = pl.read_parquet(TRAIN_PATH1)

In [6]:
# Load the per-column scaling statistics pickled at `train_diff_sc_path`.
# NOTE(review): pickle.load can execute arbitrary code, so this file must
# only ever come from a trusted source.
with train_diff_sc_path.open("rb") as stats_file:
    train_diff_sc_dict = pickle.load(stats_file)

In [7]:
# Inspect the loaded statistics: keys look like "state_t_diff_<i>" and each
# value is a two-element list. The scaling cell further down subtracts
# element [0] and divides by element [1].
# NOTE(review): this prints the whole dict; consider showing only a few items.
train_diff_sc_dict

{'state_t_diff_0': [12.19473677257737, 7.264644181576466],
 'state_t_diff_1': [9.308592506211298, 6.146281502720454],
 'state_t_diff_2': [10.556464302056913, 5.1373522687492965],
 'state_t_diff_3': [8.325129867728295, 5.201566733158169],
 'state_t_diff_4': [3.218504554263811, 5.285484789058961],
 'state_t_diff_5': [-4.177604237048966, 3.6866071779181495],
 'state_t_diff_6': [-8.558437638758386, 2.379748878107623],
 'state_t_diff_7': [-9.70294133428214, 2.337023243783227],
 'state_t_diff_8': [-6.726035591063095, 1.8504479176826751],
 'state_t_diff_9': [-5.256081522638857, 1.63672729611797],
 'state_t_diff_10': [-4.173135911097179, 1.6117236622549198],
 'state_t_diff_11': [-3.859679163598914, 1.8813404729373777],
 'state_t_diff_12': [-3.246934147007651, 1.9316416719121765],
 'state_t_diff_13': [-3.3977011650391775, 2.358304470372667],
 'state_t_diff_14': [-3.5342297998580645, 2.8584300225024437],
 'state_t_diff_15': [-4.234526141439417, 3.911137177076367],
 'state_t_diff_16': [-2.8048292

In [8]:
# Build the difference features for every 60-column group in one pass.
# For group columns x_0 .. x_59 the output column "<x_i>_diff" holds
# x_{i+1} - x_i; the last column has no successor, so "<x_59>_diff" is the
# constant 0. Groups whose length is not 60 (i.e. `other`) are skipped.
#
# All expressions are collected first and evaluated by a single `select`,
# instead of growing an empty frame with one `with_columns` call per column.
diff_exprs = []
for group in cols_list:
    if len(group) != 60:
        continue
    for lower, upper in zip(group[:-1], group[1:]):
        diff_exprs.append((pl.col(upper) - pl.col(lower)).alias(f"{lower}_diff"))
    # The literal is broadcast to the frame height by `select`.
    diff_exprs.append(pl.lit(0).alias(f"{group[-1]}_diff"))

new_df = df.select(diff_exprs)

In [9]:
# Standardise each group's diff columns and write one .npy file per group.
for group in tqdm(cols_list):
    # Only the 60-column groups have diff features; skip everything else.
    if len(group) != 60:
        continue

    # "state_t_0" -> "state_t_diff": the stem used in the stats-dict keys
    # and in the output file name.
    prefix = "_".join(group[0].split("_")[:2]) + "_diff"
    diff_names = [f"{i}_diff" for i in group]
    values = new_df[diff_names].to_numpy()

    for i in range(60):
        # stats[0] is subtracted, stats[1] is the divisor.
        # NOTE(review): column 59 is a constant 0; if its stored divisor is 0
        # this yields NaN. Confirm how the fe102 stats handle that column.
        stats = train_diff_sc_dict[f"{prefix}_{i}"]
        values[:, i] = (values[:, i] - stats[0]) / stats[1]

    # Downcast only after scaling, then persist as "fe<fe>_<prefix>.npy".
    scaled = values.astype(np.float32)
    np.save(fe_dir / f"fe{fe}_{prefix}.npy", scaled)

100%|██████████| 10/10 [00:01<00:00,  7.13it/s]
