In [3]:
import os
import pickle
import numpy as np
import pandas as pd
import xarray as xr

import torch
import torch.utils.data as data
from tqdm import tqdm
from os import listdir
from os.path import join

import datatable as dt
from itertools import product
from sklearn.preprocessing import LabelEncoder

from utils import path_list, coordinate_matching, calc_chan_mean, calc_grid_mean, dummy_and_add_feature, feature_encoding

import warnings
warnings.filterwarnings(action='ignore')


# Root folder handed to path_list() to locate the input files.
folder_path = '/data/COMPUTER_VISION/_wizai/'
# Pickle file that stores the per-column [min, max] pairs used for scaling.
scaler_path = '/data/COMPUTER_VISION/AMSU_PREP/Data/TETE_min-max_scaler.pickle'

# Values of the 'nchan' column that are kept; also passed to the utils helpers.
select_list = list(range(4, 14))

# Two bounds per split, passed to path_list() together with the split name.
# NOTE(review): presumably [start, end] date prefixes (YYYYMM / YYYYMMDD) --
# confirm against utils.path_list.
SET_range = dict(
    TRAIN=["202106", "20210710"],
    VALID=["20210711", "20210720"],
    TEST=["20210721", "20210731"],
)


In [None]:
# Output folders for the exported frames, one per REAL_QC class.
NORMAL_DIR = '/data/COMPUTER_VISION/AMSU_PREP/Data/Normal'
ABNORMAL_DIR = '/data/COMPUTER_VISION/AMSU_PREP/Data/Abnormal'

# Columns left out of the min-max scaling (kept for reference; not read below).
not_scale_col = ['nchan', 'sat_id', 'scanpos', 'lat', 'lon', 'REAL_QC', 'grid', 'chan_lat_mean', 'chan_lon_mean', 'grid_lat_mean', 'grid_lon_mean',
                 'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'sin_month', 'cos_month', 'chqcflag_-999', 'chqcflag_0']
# Columns whose global min/max are collected and later used for min-max scaling.
scale_col = ['bias_pred', 'obsTB', 'innov', 'chan_bias_pred_mean', 'chan_obsTB_mean', 'chan_innov_mean', 'grid_bias_pred_mean', 'grid_obsTB_mean', 'grid_innov_mean']
# Running [min, max] per scaled column; NaN means "no value seen yet".
temp_dict = {key: [np.nan, np.nan] for key in scale_col}

# Columns read from the in4bc NetCDF frame ('nchans' is renamed to 'nchan').
IN4BC_COLS = ['nchans', 'sat_id', 'scanpos', 'lat', 'lon', 'chqcflag', 'bias_pred', 'obsTB', 'innov']
# Time features filled from dummy_and_add_feature().
TIME_FEATURE_COLS = ['sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'sin_month', 'cos_month']
# Pre-thinning columns removed before coordinate matching:
# the bookkeeping columns, ob/bk/cob for (01)..(15), and ck for (01)-(04) and (15).
PRE_THINNING_DROP_COLS = (
    ['irej', 'isat', 'bpos', 'QCflag', 'sfctype', 'obstdif']
    + ['{}({:02d})'.format(prefix, chan) for prefix in ('ob', 'bk', 'cob') for chan in range(1, 16)]
    + ['ck({:02d})'.format(chan) for chan in (1, 2, 3, 4, 15)]
)
# Fixed divisors applied to the latitude / longitude style columns.
COORD_DIVISORS = {
    'lat': 90, 'lon': 360,
    'chan_lat_mean': 90, 'chan_lon_mean': 360,
    'grid_lat_mean': 90, 'grid_lon_mean': 360,
}


def iter_file_pairs(MODE):
    """Return a tqdm iterator over (in4bc_path, InnQC2_path) pairs for one split."""
    in4bc_path_list, thinn_path_list = path_list(folder_path, SET_range, MODE)
    return tqdm(zip(in4bc_path_list, thinn_path_list), total=len(thinn_path_list))


def load_encoded_frame(in4bc_path, InnQC2_path):
    """Load one in4bc / pre-thinning file pair and return the encoded feature frame.

    The in4bc NetCDF file is flattened to a DataFrame, restricted to rows with
    ``npredictors == 0`` and to the channels in ``select_list``.  The
    pre-thinning table is restricted to rows with ``irej == 0``.  Both are then
    passed through the project helpers imported from ``utils``
    (coordinate_matching, calc_chan_mean, calc_grid_mean,
    dummy_and_add_feature, feature_encoding), in that order.
    """
    # in4bc: the context manager closes the NetCDF handle once the data has
    # been copied into a pandas frame.
    with xr.open_dataset(in4bc_path) as in4bc_open_netcdf:
        in4bc_xrdataset = in4bc_open_netcdf.to_dataframe().reset_index()
    in4bc_xrdataset = in4bc_xrdataset[in4bc_xrdataset['npredictors'] == 0][IN4BC_COLS].reset_index(drop=True)
    in4bc_xrdataset.rename(columns={'nchans': 'nchan'}, inplace=True)
    in4bc_xrdataset = in4bc_xrdataset[in4bc_xrdataset['nchan'].isin(select_list)].reset_index(drop=True)

    # pre-thinning: the first column of the table is discarded.
    pre_thinning_df = dt.fread(InnQC2_path, encoding="utf-8").to_pandas().iloc[:, 1:]
    pre_thinning_df = pre_thinning_df[pre_thinning_df['irej'] == 0].reset_index(drop=True)
    pre_thinning_df['lat'] = pre_thinning_df['lat'].astype(np.float32)
    pre_thinning_df['lon'] = pre_thinning_df['lon'].astype(np.float32)
    pre_thinning_df = pre_thinning_df.drop(columns=PRE_THINNING_DROP_COLS)

    in4bc_df_labeling = coordinate_matching(in4bc_xrdataset, pre_thinning_df, select_list)
    in4bc_df_labeling_class = calc_chan_mean(in4bc_df_labeling, select_list)
    in4bc_df_grid = calc_grid_mean(in4bc_df_labeling_class, select_list)
    in4bc_df_grid[TIME_FEATURE_COLS] = dummy_and_add_feature(in4bc_path)
    return feature_encoding(in4bc_df_grid)


def merge_min_max(running, frame_min, frame_max):
    """Fold one file's per-column min/max into ``running`` in place.

    ``running`` maps column name -> ``[min, max]``; ``frame_min`` and
    ``frame_max`` are indexable by the same column names.  NaN is treated as
    "no data" on either side, so an all-NaN column in one file can neither
    erase a bound found earlier nor end up in the saved scaler.
    """
    for key, bounds in running.items():
        low, high = frame_min[key], frame_max[key]
        if not np.isnan(low) and (np.isnan(bounds[0]) or low < bounds[0]):
            bounds[0] = low
        if not np.isnan(high) and (np.isnan(bounds[1]) or high > bounds[1]):
            bounds[1] = high


def scale_frame(frame, min_max_dict):
    """Min-max scale the columns in ``min_max_dict`` and normalise coordinates.

    ``frame`` is modified in place and also returned.  Latitude-like columns
    are divided by 90 and longitude-like columns by 360.
    """
    # NOTE(review): a column with col_max == col_min divides by zero here and
    # yields NaN/inf; NaN is later replaced by 1 in split_by_qc().
    for col, (col_min, col_max) in min_max_dict.items():
        frame[col] = frame[col] - col_min
        frame[col] = frame[col] / (col_max - col_min)

    for col, divisor in COORD_DIVISORS.items():
        frame[col] = frame[col] / divisor
    return frame


def split_by_qc(frame):
    """Return ``(normal, abnormal)`` frames for REAL_QC == 0.0 and == 7.0.

    Missing values are filled with 1, the index is reset and the ``REAL_QC``
    column is dropped from both results.
    """
    def pick(code):
        subset = frame[frame['REAL_QC'] == code].fillna(1).reset_index(drop=True)
        return subset.drop(['REAL_QC'], axis=1)

    return pick(0.0), pick(7.0)


def fit_and_save_scaler(running):
    """Collect global min/max of ``scale_col`` over TRAIN and VALID and pickle them."""
    print('scaler')
    for MODE in ['TRAIN', 'VALID']:
        for in4bc_path, InnQC2_path in iter_file_pairs(MODE):
            in4bc_df_encoding = load_encoded_frame(in4bc_path, InnQC2_path)
            # Per-file min and max of every scaled column, folded into the running bounds.
            merge_min_max(running, in4bc_df_encoding[scale_col].min(), in4bc_df_encoding[scale_col].max())
            del in4bc_df_encoding

    with open(scaler_path, 'wb') as fw:
        pickle.dump(running, fw)
    print("scaler is saved at {}".format(scaler_path))


def export_scaled_sets():
    """Scale every TRAIN/VALID/TEST file with the saved scaler and save the QC splits."""
    # The scaler is the same for every file, so it is read once up front.
    with open(scaler_path, 'rb') as fr:
        min_max_dict = pickle.load(fr)

    for MODE in ['TRAIN', 'VALID', 'TEST']:
        for in4bc_path, InnQC2_path in iter_file_pairs(MODE):
            in4bc_df_encoding = scale_frame(load_encoded_frame(in4bc_path, InnQC2_path), min_max_dict)
            normal_data, abnormal_data = split_by_qc(in4bc_df_encoding)

            # File tag: text after the last "_" of the path, up to its first ".".
            file_tag = in4bc_path.split("_")[-1].split(".")[0]
            torch.save(normal_data, os.path.join(NORMAL_DIR, "{}_{}_Normal_data.pkl".format(file_tag, MODE)), pickle_module=pickle)
            torch.save(abnormal_data, os.path.join(ABNORMAL_DIR, "{}_{}_Abnormal_data.pkl".format(file_tag, MODE)), pickle_module=pickle)
            del in4bc_df_encoding, normal_data, abnormal_data


def main():
    """Create the output folders, then either fit the scaler or export the data.

    When the scaler file does not exist yet, this run only fits and saves it;
    run again afterwards to export the scaled Normal/Abnormal frames.
    """
    for out_dir in ('Data', NORMAL_DIR, ABNORMAL_DIR):
        os.makedirs(out_dir, exist_ok=True)

    if not os.path.exists(scaler_path):
        fit_and_save_scaler(temp_dict)
    else:
        export_scaled_sets()

    print('Done!')


if __name__ == "__main__":
    main()