In [17]:
# =============================
# library
# =============================
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import polars as pl
from sklearn.preprocessing import StandardScaler
import glob
from tqdm import tqdm

In [7]:
# =============================
# constant
# =============================
# All locations are derived from a single storage root so that moving the
# data only requires editing one line.
_LEAP_ROOT = Path("../storage/leap")
DATA_DIR = _LEAP_ROOT / "data"
OUTPUT_DIR = _LEAP_ROOT / "output"
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
TRAIN_DATA8_DIR = DATA_DIR / "train_0008"

In [3]:
# =============================
# settings
# =============================
# Run identifier; it names the output directory <OUTPUT_DIR>/fe/fe<id>.
fe = "055"
fe_dir = OUTPUT_DIR.joinpath("fe", f"fe{fe}")
fe_save_dir = fe_dir.joinpath("save")
# Create the run directory and its "save" subdirectory; an already existing
# directory is not an error.
for _run_dir in (fe_dir, fe_save_dir):
    _run_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# =============================
# columns
# =============================
def _level_columns(prefix, n_levels=60):
    """Return the column names ``<prefix>_0`` ... ``<prefix>_<n_levels - 1>``."""
    return [f"{prefix}_{level}" for level in range(n_levels)]


# Input groups that have one column per index 0..59.
state_t = _level_columns("state_t")
state_q0001 = _level_columns("state_q0001")
state_q0002 = _level_columns("state_q0002")
state_q0003 = _level_columns("state_q0003")
state_u = _level_columns("state_u")
state_v = _level_columns("state_v")
pbuf_ozone = _level_columns("pbuf_ozone")
pbuf_CH4 = _level_columns("pbuf_CH4")
pbuf_N2O = _level_columns("pbuf_N2O")

# Input columns that are not indexed (one column each).
other = [
    'state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX',
    'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS',
    'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR',
    'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC',
    'cam_in_SNOWHLAND',
]

# Every input group, indexed groups first and the unindexed columns last.
cols_list = [
    state_t, state_q0001, state_q0002, state_q0003,
    state_u, state_v,
    pbuf_ozone, pbuf_CH4, pbuf_N2O,
    other,
]

In [5]:
# =============================
# target
# =============================
# Six target groups have one column per index 0..59; build them in one pass
# and unpack them into their individual names.
(ptend_t, ptend_q0001, ptend_q0002,
 ptend_q0003, ptend_u, ptend_v) = (
    [f"{stem}_{level}" for level in range(60)]
    for stem in (
        "ptend_t", "ptend_q0001", "ptend_q0002",
        "ptend_q0003", "ptend_u", "ptend_v",
    )
)

# Target columns that are not indexed (one column each).
other_target = [
    'cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC',
    'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD',
]

# Every target group, indexed groups first and the unindexed columns last.
target_list = [
    ptend_t, ptend_q0001, ptend_q0002, ptend_q0003,
    ptend_u, ptend_v,
    other_target,
]

In [18]:
# =============================
# main
# =============================
# Thin the data to one half: each file is split into consecutive blocks of
# ROWS_PER_BLOCK rows and only the even-numbered blocks (0, 2, 4, ...) are kept.
# Block numbering restarts at 0 for every file.
# NOTE(review): 384 is presumably the number of rows that belong together
# (e.g. one time step) — confirm against the dataset description.
ROWS_PER_BLOCK = 384
KEEP_ONE_BLOCK_IN = 2

df_all = []
for i in tqdm(range(2, 13)):  # reads train_0008_2.parquet .. train_0008_12.parquet
    df = pd.read_parquet(TRAIN_DATA8_DIR / f"train_0008_{i}.parquet")
    # Select rows with a boolean mask instead of writing a scratch "group"
    # column into the wide frame and dropping it again; this also cannot
    # clobber a real column that happens to be called "group".
    block_id = np.arange(len(df)) // ROWS_PER_BLOCK
    df = df[block_id % KEEP_ONE_BLOCK_IN == 0].reset_index(drop=True)
    df_all.append(df)

100%|██████████| 11/11 [01:35<00:00,  8.70s/it]


In [19]:
# Stack the thinned per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_all, ignore_index=True)

In [21]:
# Sanity check: (number of rows, number of columns) of the concatenated frame.
print(df.shape)

(4617216, 925)


In [22]:
# Persist the thinned, concatenated frame next to the part files it was built
# from. The path is derived from TRAIN_DATA8_DIR so it follows the constant if
# the data directory is moved; it resolves to
# ../storage/leap/data/train_0008/train_0008_2_concat.parquet.
df.to_parquet(TRAIN_DATA8_DIR / "train_0008_2_concat.parquet")

In [23]:
# Preview the concatenated frame (the notebook display truncates rows and columns).
df

Unnamed: 0,sample_id,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,...,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
0,train_0,213.015399,222.982504,229.060155,244.718725,256.363167,264.674508,263.137177,253.361152,242.912910,...,-0.000001,1.115369e-06,0.000000,374.326122,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000
1,train_1,212.928805,219.215263,226.877896,245.088931,257.724639,266.303036,265.122610,255.492992,243.913295,...,0.000023,1.271922e-06,0.000000,368.961544,0.000000e+00,2.488945e-09,0.000000e+00,0.000000e+00,0.000000,0.000000
2,train_2,213.879147,230.127471,232.440488,243.798987,253.231005,259.569060,257.504798,249.007208,238.955028,...,-0.000012,4.925583e-06,0.000000,413.329358,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000
3,train_3,212.822607,221.729062,227.381841,242.259876,253.111267,261.355935,259.292877,250.935204,240.321327,...,-0.000038,-4.142567e-05,0.000000,417.334622,0.000000e+00,1.329662e-07,0.000000e+00,0.000000e+00,0.000000,0.000000
4,train_4,214.122860,219.787635,227.753731,244.986192,258.572526,266.770542,265.346858,256.232703,244.570049,...,0.000022,-1.829278e-05,0.000000,365.823816,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4617211,train_856699,221.187103,226.352975,236.535549,244.926666,248.612010,246.259331,238.503070,232.078002,226.674986,...,-0.000008,1.091362e-05,37.786375,321.328592,1.782172e-08,3.415752e-08,1.186082e-07,6.769715e-07,23.788084,16.911529
4617212,train_856700,208.503157,228.572727,232.604764,240.169234,244.444724,245.957780,238.617967,232.626109,227.378977,...,0.000001,-5.317392e-07,15.257785,180.084729,1.019126e-08,1.019126e-08,5.215061e-01,2.195795e+00,15.543283,14.388648
4617213,train_856701,211.498267,233.867707,238.702060,240.938410,244.798610,247.414092,240.465712,234.161907,228.329206,...,0.000002,-2.669192e-06,9.545120,203.905765,6.515243e-09,6.515243e-09,5.748349e-02,3.592968e-01,12.281326,11.617389
4617214,train_856702,213.687595,224.331743,231.563532,241.531658,247.590700,247.361025,239.942349,233.966628,228.329086,...,0.000013,-4.239906e-05,115.202187,268.925340,1.220883e-08,1.558527e-08,1.890876e+01,5.509787e+01,37.744758,21.810664
