In [1]:
# =============================
# library
# =============================
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import polars as pl
from sklearn.preprocessing import StandardScaler
import glob
from tqdm import tqdm

In [2]:
# =============================
# constant
# =============================
# All input locations hang off the shared data directory; outputs go to a
# sibling "output" directory. Paths are relative to the notebook's working dir.
DATA_DIR = Path("../storage/leap/data")
OUTPUT_DIR = Path("../storage/leap/output")
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
TRAIN_DATA6_DIR = DATA_DIR / "train_0006"
TRAIN_DATA7_DIR = DATA_DIR / "train_0007"

In [3]:
# =============================
# settings
# =============================
# Identifier of this feature-engineering run; it names the output folder.
fe = "141"

# <OUTPUT_DIR>/fe/fe<id> holds this run's artifacts, with a "save" subfolder.
fe_dir = OUTPUT_DIR.joinpath("fe", f"fe{fe}")
fe_save_dir = fe_dir.joinpath("save")

# Create both folders; existing folders are left untouched.
fe_dir.mkdir(parents=True, exist_ok=True)
fe_save_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# =============================
# columns
# =============================
def _indexed_names(stem, count=60):
    """Return ``[f"{stem}_0", ..., f"{stem}_{count - 1}"]``."""
    return [f"{stem}_{idx}" for idx in range(count)]


# Input column families that come as 60 index-suffixed columns each.
state_t = _indexed_names("state_t")
state_q0001 = _indexed_names("state_q0001")
state_q0002 = _indexed_names("state_q0002")
state_q0003 = _indexed_names("state_q0003")
state_u = _indexed_names("state_u")
state_v = _indexed_names("state_v")

# Input columns that have no index suffix.
other = [
    'state_ps',
    'pbuf_SOLIN',
    'pbuf_LHFLX',
    'pbuf_SHFLX',
    'pbuf_TAUX',
    'pbuf_TAUY',
    'pbuf_COSZRS',
    'cam_in_ALDIF',
    'cam_in_ALDIR',
    'cam_in_ASDIF',
    'cam_in_ASDIR',
    'cam_in_LWUP',
    'cam_in_ICEFRAC',
    'cam_in_LANDFRAC',
    'cam_in_OCNFRAC',
    'cam_in_SNOWHLAND',
]

pbuf_ozone = _indexed_names("pbuf_ozone")
pbuf_CH4 = _indexed_names("pbuf_CH4")
pbuf_N2O = _indexed_names("pbuf_N2O")

# Every input column group, in a fixed order.
cols_list = [
    state_t,
    state_q0001,
    state_q0002,
    state_q0003,
    state_u,
    state_v,
    pbuf_ozone,
    pbuf_CH4,
    pbuf_N2O,
    other,
]

In [5]:
# =============================
# target
# =============================
# Six target families, each expanded to 60 index-suffixed column names.
ptend_t, ptend_q0001, ptend_q0002, ptend_q0003, ptend_u, ptend_v = (
    [f"{stem}_{idx}" for idx in range(60)]
    for stem in (
        "ptend_t",
        "ptend_q0001",
        "ptend_q0002",
        "ptend_q0003",
        "ptend_u",
        "ptend_v",
    )
)

# Target columns that have no index suffix.
other_target = [
    'cam_out_NETSW',
    'cam_out_FLWDS',
    'cam_out_PRECSC',
    'cam_out_PRECC',
    'cam_out_SOLS',
    'cam_out_SOLL',
    'cam_out_SOLSD',
    'cam_out_SOLLD',
]

# Every target column group, in a fixed order.
target_list = [
    ptend_t,
    ptend_q0001,
    ptend_q0002,
    ptend_q0003,
    ptend_u,
    ptend_v,
    other_target,
]

In [6]:
# =============================
# main
# =============================
# Thin the training data to one half.

# Rows are handled in consecutive blocks of this many rows.
# NOTE(review): presumably 384 rows make up one time step -- confirm.
ROWS_PER_GROUP = 384
# Only this many leading rows of train_0007_8 are used.
# NOTE(review): the reason for this cut-off is not visible here -- confirm.
TRAIN7_LAST_FILE_ROWS = 609024


def _keep_odd_groups(frame, rows_per_group=ROWS_PER_GROUP):
    """Keep every second block of ``rows_per_group`` consecutive rows.

    Rows are numbered from 0 within ``frame`` and split into consecutive
    blocks of ``rows_per_group`` rows; blocks with an odd index (the 2nd,
    4th, ...) are kept and the rest are dropped. The numbering restarts for
    every frame, so the pattern only lines up across files when each file
    holds an even number of blocks.

    Parameters
    ----------
    frame : pl.DataFrame
        Rows to thin, in their stored order.
    rows_per_group : int
        Number of consecutive rows forming one block.

    Returns
    -------
    pl.DataFrame
        The kept rows with the original columns.
    """
    group_parity = (np.arange(frame.height) // rows_per_group) % 2
    return (
        frame.with_columns(pl.Series("group", group_parity))
        .filter(pl.col("group") == 1)
        .drop("group")
    )


df_all = []

# train_0006: files 1..12
for i in tqdm(range(1, 13)):
    df = pl.read_parquet(f"{TRAIN_DATA6_DIR}/train_0006_{i}.parquet")
    print(i, df.shape[0])  # row count before thinning
    df_all.append(_keep_odd_groups(df))

# train_0007: files 1..8, the last one truncated before thinning
for i in tqdm(range(1, 9)):
    df = pl.read_parquet(f"{TRAIN_DATA7_DIR}/train_0007_{i}.parquet")
    if i == 8:
        df = df.head(TRAIN7_LAST_FILE_ROWS)
    print(i, df.shape[0])  # row count before thinning
    df_all.append(_keep_odd_groups(df))

  0%|          | 0/12 [00:00<?, ?it/s]

1 857088


  8%|▊         | 1/12 [00:01<00:16,  1.46s/it]

2 774144


 17%|█▋        | 2/12 [00:02<00:13,  1.34s/it]

3 857088


 25%|██▌       | 3/12 [00:04<00:12,  1.40s/it]

4 829440


 33%|███▎      | 4/12 [00:05<00:10,  1.35s/it]

5 857088


 42%|████▏     | 5/12 [00:06<00:09,  1.30s/it]

6 829440


 50%|█████     | 6/12 [00:07<00:07,  1.25s/it]

7 857088


 58%|█████▊    | 7/12 [00:08<00:06,  1.22s/it]

8 857088


 67%|██████▋   | 8/12 [00:10<00:05,  1.25s/it]

9 829440


 75%|███████▌  | 9/12 [00:12<00:04,  1.40s/it]

10 857088


 83%|████████▎ | 10/12 [00:13<00:03,  1.53s/it]

11 829440


 92%|█████████▏| 11/12 [00:15<00:01,  1.62s/it]

12 857088


100%|██████████| 12/12 [00:17<00:00,  1.45s/it]
  0%|          | 0/8 [00:00<?, ?it/s]

1 857088


 12%|█▎        | 1/8 [00:01<00:12,  1.83s/it]

2 774144


 25%|██▌       | 2/8 [00:03<00:10,  1.78s/it]

3 857088


 38%|███▊      | 3/8 [00:05<00:09,  1.87s/it]

4 829440


 50%|█████     | 4/8 [00:07<00:07,  1.86s/it]

5 857088


 62%|██████▎   | 5/8 [00:09<00:05,  1.83s/it]

6 829440


 75%|███████▌  | 6/8 [00:10<00:03,  1.79s/it]

7 857088


 88%|████████▊ | 7/8 [00:12<00:01,  1.81s/it]

8 609024


100%|██████████| 8/8 [00:14<00:00,  1.81s/it]


In [7]:
# Stack the per-file frames row-wise into a single frame.
# Note: this rebinds ``df_all`` from a list of frames to one DataFrame, so the
# cell is meant to run exactly once after the loading cell.
df_all = pl.concat(df_all, how="vertical")

In [8]:
# Multiply each weighted column by the value found in the first row of the
# sample submission. The file's first column is skipped
# (presumably the row id -- confirm).
sample = pd.read_csv(DATA_DIR / "sample_submission.csv")
sample = sample.iloc[:1,:].reset_index(drop=True)

# Build every scaling expression up front and apply them in one
# ``with_columns`` call instead of one call (and one new frame) per column.
# Each expression reads and overwrites only its own column, so the result is
# the same as scaling the columns one after another.
# Note: re-running this cell scales the already-scaled columns again.
df_all = df_all.with_columns(
    [pl.col(c) * sample[c].values for c in sample.columns[1:]]
)

In [9]:
# Sanity check: (rows, columns) of the combined, weighted frame.
print((df_all.height, df_all.width))

(8280960, 925)


In [10]:
# Save the result as <fe_dir>/fe<id>_train.parquet
# (fe_dir is OUTPUT_DIR / "fe" / f"fe{fe}", created in the settings cell).
train_out_path = fe_dir / f"fe{fe}_train.parquet"
df_all.write_parquet(train_out_path)