In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import xgboost as xgb



In [5]:
data = pd.read_csv('/data/coding/chaochuan/TSGym/dataset/electricity/electricity.csv')
data

Unnamed: 0,date,0,1,2,3,4,5,6,7,8,...,311,312,313,314,315,316,317,318,319,OT
0,2016-07-01 02:00:00,14.0,69.0,234.0,415.0,215.0,1056.0,29.0,840.0,226.0,...,676.0,372.0,80100.0,4719.0,5002.0,48.0,38.0,1558.0,182.0,2162.0
1,2016-07-01 03:00:00,18.0,92.0,312.0,556.0,292.0,1363.0,29.0,1102.0,271.0,...,805.0,452.0,95200.0,4643.0,6617.0,65.0,47.0,2177.0,253.0,2835.0
2,2016-07-01 04:00:00,21.0,96.0,312.0,560.0,272.0,1240.0,29.0,1025.0,270.0,...,817.0,430.0,96600.0,4285.0,6571.0,64.0,43.0,2193.0,218.0,2764.0
3,2016-07-01 05:00:00,20.0,92.0,312.0,443.0,213.0,845.0,24.0,833.0,179.0,...,801.0,291.0,94500.0,4222.0,6365.0,65.0,39.0,1315.0,195.0,2735.0
4,2016-07-01 06:00:00,22.0,91.0,312.0,346.0,190.0,647.0,16.0,733.0,186.0,...,807.0,279.0,91300.0,4116.0,6298.0,75.0,40.0,1378.0,191.0,2721.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2019-07-01 21:00:00,11.0,116.0,8.0,844.0,384.0,1590.0,51.0,1412.0,407.0,...,1897.0,1589.0,166500.0,9917.0,10412.0,324.0,21.0,1870.0,162.0,2773.0
26300,2019-07-01 22:00:00,11.0,103.0,8.0,749.0,371.0,1366.0,47.0,1265.0,369.0,...,1374.0,1336.0,158800.0,6812.0,8956.0,302.0,20.0,1506.0,438.0,2755.0
26301,2019-07-01 23:00:00,12.0,93.0,8.0,650.0,346.0,1282.0,48.0,1079.0,308.0,...,938.0,1311.0,154300.0,6602.0,5910.0,302.0,18.0,1864.0,621.0,2650.0
26302,2019-07-02 00:00:00,10.0,92.0,8.0,646.0,349.0,1261.0,48.0,1009.0,288.0,...,833.0,1227.0,141900.0,6546.0,5502.0,259.0,33.0,2623.0,783.0,2719.0


In [2]:
def get_meta_feature(dataset_name):
    meta_feature_dir_path = '/data/coding/chaochuan/cc/meta_learner/meta_feature'
    meta_feature_path = os.path.join(meta_feature_dir_path,f'meta_feature_{dataset_name}.npz')
    meta_feature = np.load(meta_feature_path)['meta_feature']
    return meta_feature

In [3]:
def get_meta_feature_scaler(data):
    meta_feature_list = []
    for dataset_name in list(set(data['dataset'].values)):
        meta_feature = get_meta_feature(dataset_name)
        meta_feature_list.append(meta_feature)
    scaler_meta_feature = MinMaxScaler(clip=True).fit(np.array(meta_feature_list))
    return scaler_meta_feature


In [7]:

def run(test_dataset,metrics):
    result_root_path = '/data/coding/chaochuan/TSGym/resultsGym'
    result_file_path = [os.path.join(result_root_path,x,'metrics.npy') for x in os.listdir(result_root_path)]
    data = pd.DataFrame(
        columns=['series_sampling','series_norm', \
            'series_decomp','channel_independent','input_embed','network_architecture', \
                'attention','encoder_only','dataset','seq_len','label_len','pred_len','mae','mse','rmse','mape','mspe'])

    for i in range(len(result_file_path)):
        setting = result_file_path[i].split('/')[-2]
        gym_row = {}
        _,_,_,gym_row['dataset'],_,_,_, gym_row['series_sampling'], gym_row['series_norm'], gym_row['series_decomp'], \
            gym_row['channel_independent'], gym_row['input_embed'], gym_row['network_architecture'], \
                gym_row['attention'], gym_row['encoder_only'], _,_,gym_row['seq_len'], gym_row['label_len'],gym_row['pred_len'], \
                    _,_,_,_,_,_,_,_,_,_,_,_,_,_  = setting.split('_')

        gym_row['mae'], gym_row['mse'], gym_row['rmse'], gym_row['mape'], gym_row['mspe'] = np.load(result_file_path[i])

        data.loc[i] = gym_row

    rank_metrics = f'{metrics}_rank'

    categorical_columns = [x for x in data.columns.tolist() if x not in {'dataset','mae','mse','rmse','mspe','mape'}]

    # 对每一列进行编码
    for col in categorical_columns:
        label_encoder = LabelEncoder()
        data[col] = label_encoder.fit_transform(data[col])

    data[rank_metrics] = data.groupby('dataset')[metrics].rank(method='min', ascending=True)
    max_rank = data[rank_metrics].max()
    min_rank = data[rank_metrics].min()
    data[rank_metrics] = (data[rank_metrics] - min_rank) / (max_rank - min_rank)


    data_train = data[data.dataset != test_dataset].reset_index(drop=True)
    data_test = data[data.dataset == test_dataset].reset_index(drop=True)
    model_train = data_train.drop(columns=['mae','mse','rmse','mape','dataset',rank_metrics]).values
    data_meta_feature_train = np.vstack([get_meta_feature(dataset_name) for dataset_name in data_train.dataset.values])
    scaler_meta_feature = MinMaxScaler(clip=True).fit(data_meta_feature_train)
    data_meta_feature_train = scaler_meta_feature.transform(data_meta_feature_train)
    X_train = np.concatenate([model_train,data_meta_feature_train],axis=1)
    y_train = data_train[rank_metrics].values

    model_test = data_test.drop(columns=['mae','mse','rmse','mape','dataset',rank_metrics]).values
    data_meta_feature_test = np.vstack([get_meta_feature(dataset_name) for dataset_name in data_test.dataset.values])
    data_meta_feature_test = scaler_meta_feature.transform(data_meta_feature_test)
    X_test = np.concatenate([model_test,data_meta_feature_test],axis=1)
    y_test = data_test[rank_metrics].values

    model = xgb.XGBRegressor(random_state=0).fit(X_train,y_train)

    best_index = np.argmin(model.predict(X_test))
    meta_rank = data_test[metrics].rank(ascending=False)[best_index]
    meta_mse = data_test['mse'].values[best_index]
    meta_mae = data_test['mae'].values[best_index]
    meta_rmse = data_test['rmse'].values[best_index]
    meta_mape = data_test['mape'].values[best_index]

    print(f'test dataset:{test_dataset}, metrics:{metrics}, total sample:{data_test.shape[0]}, meta rank:{meta_rank}, mse:{meta_mse}, mae:{meta_mae}, rmse:{meta_rmse}, mape:{meta_mape}')

In [5]:
metrics = 'mse'
for test_dataset in ['ETTh1','ETTh2', 'ETTm1', 'ETTm2','Exchange', 'weather']:
    run(test_dataset,metrics)

test dataset:ETTh1, metrics:mse, total sample:50, meta rank:11.0, mse:0.6600507497787476, mae:0.5984930992126465, rmse:0.8124350905418396, mape:7.263139724731445
test dataset:ETTh2, metrics:mse, total sample:49, meta rank:7.0, mse:1.9179527759552002, mae:0.959929883480072, rmse:1.384901762008667, mape:2.076181173324585
test dataset:ETTm1, metrics:mse, total sample:36, meta rank:13.0, mse:0.48412421345710754, mae:0.43085214495658875, rmse:0.6957903504371643, mape:2.4281387329101562
test dataset:ETTm2, metrics:mse, total sample:32, meta rank:30.0, mse:0.1780061274766922, mae:0.26539143919944763, rmse:0.4219077229499817, mape:1.1562962532043457
test dataset:Exchange, metrics:mse, total sample:637, meta rank:574.0, mse:0.08475180715322495, mae:0.20222881436347961, rmse:0.29112163186073303, mape:1.248151183128357
test dataset:weather, metrics:mse, total sample:64, meta rank:27.0, mse:0.20686881244182587, mae:0.23969657719135284, rmse:0.45482832193374634, mape:16.576509475708008


In [8]:
metrics = 'mse'
for test_dataset in ['ETTh1','ETTh2', 'ETTm1', 'ETTm2','Exchange', 'weather']:
    run(test_dataset,metrics)

test dataset:ETTh1, metrics:mse, total sample:51, meta rank:8.0, mse:0.7879704833030701, mae:0.6832336187362671, rmse:0.887677013874054, mape:7.202232360839844
test dataset:ETTh2, metrics:mse, total sample:50, meta rank:39.0, mse:0.30681735277175903, mae:0.35813650488853455, rmse:0.5539109706878662, mape:1.5479611158370972
test dataset:ETTm1, metrics:mse, total sample:37, meta rank:14.0, mse:0.48412421345710754, mae:0.43085214495658875, rmse:0.6957903504371643, mape:2.4281387329101562
test dataset:ETTm2, metrics:mse, total sample:33, meta rank:32.0, mse:0.17703893780708313, mae:0.2625526785850525, rmse:0.4207599461078644, mape:1.159519076347351
test dataset:Exchange, metrics:mse, total sample:642, meta rank:553.5, mse:0.08581973612308502, mae:0.2040690779685974, rmse:0.2929500639438629, mape:1.2417428493499756
test dataset:weather, metrics:mse, total sample:64, meta rank:28.0, mse:0.20328393578529358, mae:0.2760900557041168, rmse:0.45087018609046936, mape:14.182808876037598
