In [13]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
from sklearn.model_selection import train_test_split
from tensorflow.random import set_seed
set_seed(42)

import missingno as msno
%matplotlib inline

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from fancyimpute import SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler, SimilarityWeightedAveraging
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# # By default, a module has some hidden variables defined
# print({k: v for k, v in globals().items() if not k.startswith("__")})

In [3]:
''' 
data dim = [8, 8, 13, 8, 4, 11, 6]
train size = [765, 574, 371, 6064, 7077, 1187, 231]
test size = [309, 230, 151, 2457, 2870, 479, 92]
train / test ~= 2.5
'''
# Load the seven datasets (data1 .. data7) and publish them as notebook-level
# globals: `train{n}` holds every column of train.csv except the last,
# `answer{n}` holds that last column, and `test{n}` holds every column of
# test.csv except the last.
for idx in range(1, 8):
    labelled = pd.read_csv(f"data{idx}/train.csv", header=None)
    globals().update({
        f'train{idx}': labelled.iloc[:, :-1],
        f'answer{idx}': labelled.iloc[:, -1],
    })
    held_out = pd.read_csv(f"data{idx}/test.csv", header=None)
    globals()[f'test{idx}'] = held_out.iloc[:, :-1]

In [17]:
# For every training frame, show the number of missing values per feature
# column as a one-row table.
for idx in range(1, 8):
    per_column_nulls = globals()[f'train{idx}'].isnull().sum()
    display(pd.DataFrame(per_column_nulls).T)

Unnamed: 0,0,1,2,3,4,5,6,7
0,234,220,218,225,219,239,249,237


Unnamed: 0,0,1,2,3,4,5,6,7
0,167,161,195,179,151,163,167,173


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,110,124,103,103,105,118,124,122,118,105,104,110,102


Unnamed: 0,0,1,2,3,4,5,6,7
0,1755,1864,1834,1811,1764,1789,1850,1893


Unnamed: 0,0,1,2,3
0,2132,2149,2089,2114


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,367,395,362,357,351,327,342,367,346,366,321


Unnamed: 0,0,1,2,3,4,5
0,68,62,74,63,78,68


In [21]:
# For every training frame, show the number of distinct values per feature
# column (a quick way to spot low-cardinality columns).
for idx in range(1, 8):
    features = pd.DataFrame(globals()[f'train{idx}'])
    distinct_per_column = features.agg(['nunique'])
    display(distinct_per_column)

Unnamed: 0,0,1,2,3,4,5,6,7
nunique,216,141,104,151,89,204,224,14


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,12,12,7,4,2,4,4,6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
nunique,259,23,63,2,74,239,204,226,9,51,40,189,252


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,4013,3960,3962,3982,4010,3999,3958,3900


Unnamed: 0,0,1,2,3
nunique,2302,611,2053,3241


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
nunique,90,127,78,76,121,52,131,98,82,81,55


Unnamed: 0,0,1,2,3,4,5
nunique,5,10,8,17,10,14


In [9]:
def make_upload(imputer, model = None, status = False, test_size = 0.15, seed = 42):
    """Evaluate an imputer + regressor pair on the seven datasets.

    For each dataset ``i`` in 1..7 the notebook globals ``train{i}`` and
    ``answer{i}`` are split into a training and a validation part, missing
    values are imputed, ``model`` is fitted on the imputed training part and
    the validation MAE is printed together with the shape of the imputed
    validation matrix. After the loop the accumulated MAE is printed.

    Parameters
    ----------
    imputer : object
        Must provide ``fit_transform``. Unless its ``__module__`` contains
        ``"fancyimpute"``, it must also provide ``transform``.
    model : object, optional
        Regressor with ``fit`` / ``predict``. When omitted, a new
        ``LGBMRegressor(random_state=42)`` is created for this call, so the
        default is no longer one instance shared (and re-fitted) across calls.
    status : bool
        Only when this is exactly ``True``: the global ``test{i}`` is imputed
        and predicted as well, and the imputed features plus the prediction
        column are written to
        ``upload/<imputer[:3]>_<model[:4]>_upload{i}.csv``.
    test_size, seed
        Forwarded to ``train_test_split`` as ``test_size`` / ``random_state``.

    Returns
    -------
    None
        All results are printed (and optionally written to disk).
    """
    if model is None:
        model = LGBMRegressor(random_state=42)

    # Imputers coming from a fancyimpute module are re-fitted on every matrix
    # they see; all others reuse what was fitted on the training split.
    # NOTE(review): presumably fancyimpute solvers offer no separate
    # `transform` -- confirm.
    refit_each_matrix = "fancyimpute" in imputer.__module__

    def _impute_unseen(matrix):
        """Impute a matrix that was not part of the training split."""
        if refit_each_matrix:
            return imputer.fit_transform(matrix)
        return imputer.transform(matrix)

    # NOTE: despite its name this accumulates the SUM of the seven MAEs; it is
    # never divided by the number of datasets.
    tp_mean = 0.0
    for i in range(1, 8):
        np_train, np_answer = np.array(globals()[f'train{i}']), np.array(globals()[f'answer{i}'])
        X_train, X_valid, y_train, y_valid = train_test_split(np_train, np_answer, test_size=test_size, random_state=seed)
        ft_train = imputer.fit_transform(X_train)
        ft_valid = _impute_unseen(X_valid)

        model.fit(ft_train, y_train)
        output = model.predict(ft_valid)
        mae = mean_absolute_error(y_valid, output)
        tp_mean += mae
        print(mae, ft_valid.shape, end=' ')

        if status is True:
            ft_test = _impute_unseen(globals()[f'test{i}'])
            print(ft_test.shape, end='')
            predict = model.predict(ft_test)
            # Append the predictions as the last column of the imputed test features.
            upload_array = np.concatenate((ft_test, predict[:, np.newaxis]), axis=1)
            upload = pd.DataFrame(upload_array)
            # to_csv does not create missing directories.
            os.makedirs('upload', exist_ok=True)
            upload.to_csv(f'upload/{str(imputer)[:3]}_{str(model)[:4]}_upload{i}.csv', header=None, index=None, sep=',', mode='w')
        print()
    print("TP_mean in valid set = {}".format(tp_mean))

In [59]:
# Distance-weighted KNN imputation + LightGBM; status=False, so only the
# validation scores are printed and no upload files are written.
make_upload(KNNImputer(weights="distance"), LGBMRegressor(random_state=42), status = False)

8.105156441348806 (115, 8) 
2.1523932999874105 (87, 8) 
3.1572365896204713 (56, 13) 
0.17042330951206006 (910, 8) 
5.065000907165189 (1062, 4) 
0.5045179238899965 (179, 11) 
9.575812334679462 (35, 6) 
TP_mean in valid set = 28.730540806203393


In [82]:
# from sklearn.base import TransformerMixin
# class MyBiScaler(TransformerMixin):
#     def __init__(self, *args, **kwargs):
#         self.scaler = BiScaler(*args, **kwargs)
#     def fit(self, x, y = 0):
#         self.scaler.fit(x)
#         return self
#     def transform(self, x, y = 0):
#         return self.scaler.transform(x)

# pipe = Pipeline([('scaler', MyBiScaler()) , ('impute', SoftImpute())])
# make_upload(pipe, LGBMRegressor(random_state=42))

In [17]:
# SoftImpute with a median initial fill, evaluated with make_upload's default
# model and status (validation scores only).
# NOTE(review): the three positional arguments are presumably shrinkage_value,
# convergence_threshold and max_iters -- confirm against the fancyimpute
# SoftImpute signature.
make_upload(SoftImpute(None, 1e-6, 100000, init_fill_method='median', verbose=False))

8.932662685808534 (115, 8) 
2.526380236267248 (87, 8) 
3.788081689133525 (56, 13) 
0.16000249227075466 (910, 8) 
4.545777806777244 (1062, 4) 
0.49509183441914323 (179, 11) 
9.396843384465067 (35, 6) 
TP_mean in valid set = 29.844840129141517


In [12]:
# MatrixFactorization imputation with make_upload's default model.
# status=True additionally imputes each test set, predicts it, and writes the
# per-dataset upload CSV files.
make_upload(MatrixFactorization(learning_rate=1e-4, patience=20, verbose=False), status=True)

9.032761549110756 (115, 8) (309, 8)
2.9157460398919475 (87, 8) (230, 8)
3.631553466895625 (56, 13) (151, 13)
0.1651152089201669 (910, 8) (2457, 8)
4.724872639144784 (1062, 4) (2870, 4)
0.505432653187008 (179, 11) (479, 11)
7.485568667825707 (35, 6) (92, 6)
TP_mean in valid set = 28.461050224975992


In [23]:
# Convert to .txt file
# Export each training feature frame as a space-separated, header-less text
# file under the GRAPE raw-data directory, with missing values written as 0.
# NOTE(review): only train{i} (the features) is exported; answer{i} is not part
# of the file -- confirm this is the layout GRAPE's loader expects.
PATH = "GRAPE/uci/raw_data"
for i in range(1, 8):
    out_dir = f'{PATH}/data{i}/data'
    os.makedirs(out_dir, exist_ok=True)
    # fillna returns a new frame, so the train{i} global keeps its NaNs.
    data = globals()[f'train{i}'].fillna(value=0)
    # mode='w' keeps this cell idempotent: with mode='a' every re-run appended
    # another full copy of the rows to the existing data.txt.
    data.to_csv(f'{out_dir}/data.txt', header=None, index=None, sep=' ', mode='w')

In [24]:
# Run GRAPE/train_mdi.py on data1 (uci domain) for 200 epochs, keeping the
# trained model and its predictions (--save_model, --save_prediction).
# The string below is the author's note on the values accepted by --opt and
# --opt_scheduler.
'''
opt : adam, sgd, rmsprop, adagrad; opt_scheduler : step, cos
'''
!python GRAPE/train_mdi.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
 --valid 0.1 --save_model --save_prediction uci --data data1

Namespace(aggr='mean', auto_known=False, concat_states=False, data='data1', domain='uci', dropout=0.0, edge_dim=64, edge_mode=1, epochs=200, gnn_activation='relu', impute_activation='relu', impute_hiddens='64', known=0.7, log_dir='0', loss_mode=0, lr=0.001, mode='train', model_types='EGSAGE_EGSAGE_EGSAGE', node_dim=64, node_mode=0, norm_embs=None, opt='adam', opt_decay_rate=0.9, opt_decay_step=50, opt_restart=0, opt_scheduler='cos', post_hiddens=None, save_model=True, save_prediction=True, seed=0, split_by='y', split_sample=0.0, split_test=False, split_train=False, train_edge=0.7, train_y=0.7, transfer_dir=None, transfer_extra='', valid=0.1, weight_decay=1e-05)
Using CPU
['EGSAGE', 'EGSAGE', 'EGSAGE'] [True, True, True] [64]
total trainable_parameters:  26
valid mask sum:  tensor(412)
train edge num is 6792, valid edge num is 824, test edge num is input 7616 output 3094
epoch:  0
loss:  0.268662691116333
valid rmse:  0.47830742179600355
valid l1:  0.3350312411785126
test rmse:  0.51036



valid l1:  0.3309483826160431
test rmse:  0.37034583766581564
test l1:  0.34325724840164185
epoch:  14
loss:  0.13191427290439606
valid rmse:  0.3604730540838353
valid l1:  0.33522874116897583
test rmse:  0.3699444893809753
test l1:  0.3447832465171814
epoch:  15
loss:  0.13245777785778046
valid rmse:  0.3641111063128452
valid l1:  0.33940500020980835
test rmse:  0.37125874916838786
test l1:  0.3465489447116852
epoch:  16
loss:  0.13416671752929688
valid rmse:  0.3678514078866456
valid l1:  0.34298738837242126
test rmse:  0.37325219301140344
test l1:  0.3482002913951874
epoch:  17
loss:  0.13619832694530487
valid rmse:  0.37050310613511644
valid l1:  0.34520283341407776
test rmse:  0.3748539600516265
test l1:  0.34924793243408203
epoch:  18
loss:  0.13773325085639954
valid rmse:  0.3713996228953631
valid l1:  0.34590768814086914
test rmse:  0.3754116024701768
test l1:  0.34960252046585083
epoch:  19
loss:  0.1382535845041275
valid rmse:  0.370507389404184
valid l1:  0.3451967239379883


In [25]:
# Run GRAPE/train_y.py on data1 (uci domain) for 200 epochs with the same
# optimiser/scheduler flags as the train_mdi.py run.
# NOTE(review): presumably this is the label-prediction stage -- confirm.
!python GRAPE/train_y.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
--valid 0.1 uci --data data1

Namespace(aggr='mean', concat_states=False, data='data1', domain='uci', dropout=0.0, edge_dim=16, edge_mode=1, epochs=200, gnn_activation='relu', impute_activation='relu', impute_hiddens='', known=0.7, log_dir='y0', lr=0.001, model_types='EGSAGE_EGSAGE', node_dim=16, node_mode=0, norm_embs=None, opt='adam', opt_decay_rate=0.9, opt_decay_step=50, opt_restart=0, opt_scheduler='cos', post_hiddens=None, predict_hiddens='', seed=0, split_by='y', split_sample=0.0, split_test=False, split_train=False, train_edge=0.7, train_y=0.7, valid=0.1, weight_decay=1e-05)
Using CPU
['EGSAGE', 'EGSAGE'] [True, True] [16]
all y num is 765, train num is 454, valid num is 69, test num is 242
epoch:  0
loss:  4414.4296875
valid rmse:  68.76597825687496
valid l1:  34.48906326293945
test rmse:  71.23226890119744
test l1:  34.76678466796875
epoch:  1
loss:  4413.12841796875
valid rmse:  68.75544367652826
valid l1:  34.474510192871094
test rmse:  71.22200999010944
test l1:  34.751747131347656
epoch:  2
loss:  441



valid l1:  34.405189514160156
test rmse:  71.13560924279591
test l1:  34.678565979003906
epoch:  12
loss:  4400.5888671875
valid rmse:  68.65812397874886
valid l1:  34.398555755615234
test rmse:  71.12740618183858
test l1:  34.671592712402344
epoch:  13
loss:  4399.5234375
valid rmse:  68.64979558304417
valid l1:  34.391990661621094
test rmse:  71.1193017269222
test l1:  34.664703369140625
epoch:  14
loss:  4398.4794921875
valid rmse:  68.64143416607057
valid l1:  34.38541030883789
test rmse:  71.11117918223196
test l1:  34.65778732299805
epoch:  15
loss:  4397.421875
valid rmse:  68.63295434319434
valid l1:  34.37873077392578
test rmse:  71.10295270108338
test l1:  34.650787353515625
epoch:  16
loss:  4396.36767578125
valid rmse:  68.6244058774373
valid l1:  34.37189865112305
test rmse:  71.09479051436364
test l1:  34.643802642822266
epoch:  17
loss:  4395.30322265625
valid rmse:  68.61563218734307
valid l1:  34.364906311035156
test rmse:  71.08640758787857
test l1:  34.63666152954101

In [27]:
# Inspect the results saved by the feature-imputation training run
import pickle
# NOTE: pickle.load can execute arbitrary code -- only load result files
# produced by your own training runs.
# The context manager closes the file handle as soon as the pickle is read.
with open('uci/test/data1/0/result.pkl', 'rb') as fr:
    df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])


outputs:  {'best_valid_rmse_pred_test': array([0.23472586, 0.3429135 , 0.49465045, ..., 0.48028755, 0.48039582,
       0.4804244 ], dtype=float32), 'best_valid_l1_pred_test': array([0.08135975, 0.3113982 , 0.68225396, ..., 0.58054477, 0.6309263 ,
       0.6309227 ], dtype=float32), 'final_pred_train': array([0.17951287, 0.0942234 , 0.3507589 , ..., 0.28867045, 0.20371333,
       0.6800189 ], dtype=float32), 'label_train': array([0.        , 0.        , 0.        , ..., 0.39130434, 0.        ,
       0.7671771 ], dtype=float32), 'final_pred_test': array([0.07680242, 0.29278088, 0.6853763 , ..., 0.5381242 , 0.627933  ,
       0.6432092 ], dtype=float32), 'label_test': array([0.       , 0.       , 0.       , ..., 0.7877193, 0.8798246,
       0.7650443], dtype=float32)}


In [36]:
# List the entries stored under 'outputs', then print the shape of each
# entry on a single line.
outputs = df['outputs']
print(outputs.keys())
for values in outputs.values():
    print("{} ".format(values.shape), end='')

dict_keys(['best_valid_rmse_pred_test', 'best_valid_l1_pred_test', 'final_pred_train', 'label_train', 'final_pred_test', 'label_test'])
(1547,) (1547,) (3396,) (3396,) (1547,) (1547,) 

In [37]:
# Inspect the results saved by the label-prediction training run
# NOTE: pickle.load can execute arbitrary code -- only load result files
# produced by your own training runs.
# The context manager closes the file handle as soon as the pickle is read.
with open('uci/test/data1/y0/result.pkl', 'rb') as fr:
    df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])


outputs:  {'pred_train': array([13.014936 , 13.015616 , 13.014899 , 13.015302 , 13.0140085,
       13.015451 , 13.014839 , 13.015679 , 13.015614 , 13.0152645,
       13.015746 , 13.015632 , 13.015731 , 13.015555 , 13.014531 ,
       13.015588 , 13.015418 , 13.013738 , 13.015618 , 13.015361 ,
       13.014699 , 13.01544  , 13.0152645, 13.015592 , 13.015386 ,
       13.015556 , 13.013534 , 13.014003 , 13.015568 , 13.01533  ,
       13.014372 , 13.015334 , 13.015154 , 13.015168 , 13.015094 ,
       13.012885 , 13.015652 , 13.0153265, 13.0153055, 13.015644 ,
       13.015033 , 13.015257 , 13.015002 , 13.015092 , 13.009801 ,
       13.015689 , 13.015589 , 13.015494 , 13.015029 , 13.014236 ,
       13.015471 , 13.01452  , 13.01531  , 13.015338 , 13.014725 ,
       13.014654 , 13.015282 , 13.015189 , 13.014003 , 13.01574  ,
       13.015374 , 13.015165 , 13.015111 , 13.015494 , 13.015279 ,
       13.012542 , 13.01546  , 13.015697 , 13.01562  , 13.015378 ,
       13.015109 , 13.013789 , 13.01

In [38]:
# Show which arrays the result file stores under 'outputs', followed by the
# shape of each one on a single line.
result_outputs = df['outputs']
print(result_outputs.keys())
for _name, stored in result_outputs.items():
    print("{} ".format(stored.shape), end='')

dict_keys(['pred_train', 'label_train', 'pred_test', 'label_test'])
(454,) (454,) (242,) (242,) 