In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display
from sklearn.model_selection import train_test_split
from tensorflow.random import set_seed
set_seed(42)

import missingno as msno
%matplotlib inline

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from fancyimpute import SoftImpute, IterativeSVD, MatrixFactorization, NuclearNormMinimization, BiScaler, SimilarityWeightedAveraging
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# # By default, a module has some hidden variables defined
# print({k: v for k, v in globals().items() if not k.startswith("__")})

In [3]:
''' 
data dim = [8, 8, 13, 8, 4, 11, 6]
train size = [765, 574, 371, 6064, 7077, 1187, 231]
test size = [309, 230, 151, 2457, 2870, 479, 92]
train / test ~= 2.5
'''
for i in range(1,8,1):
    temp0_df = pd.read_csv(f"data{i}/train.csv", header=None)
    globals()[f'train{i}'] = temp0_df.iloc[:,:-1]
    globals()[f'answer{i}'] = temp0_df.iloc[:,-1]
    temp1_df = pd.read_csv(f"data{i}/test.csv", header=None)
    globals()[f'test{i}'] = temp1_df.iloc[:,:-1]

In [17]:
for i in range(1,8,1):
    count_null = pd.DataFrame(globals()[f'train{i}'].isnull().sum()).T
    display(count_null)

Unnamed: 0,0,1,2,3,4,5,6,7
0,234,220,218,225,219,239,249,237


Unnamed: 0,0,1,2,3,4,5,6,7
0,167,161,195,179,151,163,167,173


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,110,124,103,103,105,118,124,122,118,105,104,110,102


Unnamed: 0,0,1,2,3,4,5,6,7
0,1755,1864,1834,1811,1764,1789,1850,1893


Unnamed: 0,0,1,2,3
0,2132,2149,2089,2114


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,367,395,362,357,351,327,342,367,346,366,321


Unnamed: 0,0,1,2,3,4,5
0,68,62,74,63,78,68


In [21]:
for i in range(1,8,1):
    count_unique = pd.DataFrame(globals()[f'train{i}'])
    display(count_unique.agg(['nunique']))

Unnamed: 0,0,1,2,3,4,5,6,7
nunique,216,141,104,151,89,204,224,14


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,12,12,7,4,2,4,4,6


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
nunique,259,23,63,2,74,239,204,226,9,51,40,189,252


Unnamed: 0,0,1,2,3,4,5,6,7
nunique,4013,3960,3962,3982,4010,3999,3958,3900


Unnamed: 0,0,1,2,3
nunique,2302,611,2053,3241


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
nunique,90,127,78,76,121,52,131,98,82,81,55


Unnamed: 0,0,1,2,3,4,5
nunique,5,10,8,17,10,14


In [5]:
def make_upload(imputer, model = LGBMRegressor(random_state=42), scaler = None, status = False, test_size = 0.15, seed = 42):
    tp_mean = 0.0
    for i in range(1,8,1):
        np_train, np_answer = np.array(globals()[f'train{i}']), np.array(globals()[f'answer{i}'])
        X_train, X_valid, y_train, y_valid = train_test_split(np_train, np_answer, test_size=test_size, random_state=seed)
        if scaler is True:
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.fit_transform(X_valid)

        ft_train = imputer.fit_transform(X_train)
        if "fancyimpute" in imputer.__module__:
            ft_valid = imputer.fit_transform(X_valid)
        else: ft_valid = imputer.transform(X_valid)

        model.fit(ft_train, y_train)
        output = model.predict(ft_valid)
        mae = mean_absolute_error(y_valid, output)
        tp_mean += mae
        print(mae, ft_valid.shape, end=' ')

        if status is True:
            X_test = globals()[f'test{i}']
            if scaler is True: scaler.fit_transform(X_test)
            if "fancyimpute" in imputer.__module__:
                ft_test = imputer.fit_transform(X_test)
            else: ft_test = imputer.transform(X_test)
            print(ft_test.shape, end='')
            predict = model.predict(ft_test)
            upload_array = np.concatenate((ft_test, predict[:, np.newaxis]), axis=1)
            upload = pd.DataFrame(upload_array)
            upload.to_csv(f'upload/{str(imputer)[:3]}_{str(model)[:4]}_upload{i}.csv', header=None, index=None, sep=',', mode='w')
        print()
    print("TP_mean in valid set = {}".format(tp_mean))

In [59]:
make_upload(KNNImputer(weights="distance"), LGBMRegressor(random_state=42), status = False)

8.105156441348806 (115, 8) 
2.1523932999874105 (87, 8) 
3.1572365896204713 (56, 13) 
0.17042330951206006 (910, 8) 
5.065000907165189 (1062, 4) 
0.5045179238899965 (179, 11) 
9.575812334679462 (35, 6) 
TP_mean in valid set = 28.730540806203393


In [17]:
make_upload(SoftImpute(None, 1e-5, 10000, max_rank=10, init_fill_method='zero', verbose=False), \
    scaler=BiScaler(scale_rows=False, scale_columns=False, verbose=False))

9.007934660291552 (115, 8) 
2.563748448025553 (87, 8) 
3.794182009673096 (56, 13) 
0.1622370950845623 (910, 8) 
4.619672741376433 (1062, 4) 
0.4897435537636039 (179, 11) 
9.299635389313757 (35, 6) 
TP_mean in valid set = 29.937153897528553


In [12]:
make_upload(MatrixFactorization(learning_rate=1e-4, patience=20, verbose=False), status=True)

9.032761549110756 (115, 8) (309, 8)
2.9157460398919475 (87, 8) (230, 8)
3.631553466895625 (56, 13) (151, 13)
0.1651152089201669 (910, 8) (2457, 8)
4.724872639144784 (1062, 4) (2870, 4)
0.505432653187008 (179, 11) (479, 11)
7.485568667825707 (35, 6) (92, 6)
TP_mean in valid set = 28.461050224975992


In [23]:
# Convert to .txt file
for i in range(1,8,1):
    PATH = "GRAPE/uci/raw_data"
    os.makedirs(f'{PATH}/data{i}/data', exist_ok=True)
    data = globals()[f'train{i}'].copy()
    data.fillna(value = 0, inplace=True)
    data.to_csv(f'{PATH}/data{i}/data/data.txt', header=None, index=None, sep=' ', mode='a')

In [None]:
'''
opt : adam, sgd, rmsprop, adagrad; opt_scheduler : step, cos
'''
!python GRAPE/train_mdi.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
 --valid 0.1 --save_model --save_prediction uci --data data1

In [None]:
!python GRAPE/train_y.py --epochs 200 --opt_scheduler cos --opt_decay_step 50 --opt_decay_rate 0.9 --weight_decay 1e-5 \
--valid 0.1 uci --data data1

In [27]:
# 檢視 Feature imputation 訓練數據
import pickle
fr = open('uci/test/data1/0/result.pkl', 'rb')
df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])


outputs:  {'best_valid_rmse_pred_test': array([0.23472586, 0.3429135 , 0.49465045, ..., 0.48028755, 0.48039582,
       0.4804244 ], dtype=float32), 'best_valid_l1_pred_test': array([0.08135975, 0.3113982 , 0.68225396, ..., 0.58054477, 0.6309263 ,
       0.6309227 ], dtype=float32), 'final_pred_train': array([0.17951287, 0.0942234 , 0.3507589 , ..., 0.28867045, 0.20371333,
       0.6800189 ], dtype=float32), 'label_train': array([0.        , 0.        , 0.        , ..., 0.39130434, 0.        ,
       0.7671771 ], dtype=float32), 'final_pred_test': array([0.07680242, 0.29278088, 0.6853763 , ..., 0.5381242 , 0.627933  ,
       0.6432092 ], dtype=float32), 'label_test': array([0.       , 0.       , 0.       , ..., 0.7877193, 0.8798246,
       0.7650443], dtype=float32)}


In [36]:
print(df['outputs'].keys())
for key in df['outputs'].keys():
    print("{} ".format(df['outputs'][key].shape), end='')

dict_keys(['best_valid_rmse_pred_test', 'best_valid_l1_pred_test', 'final_pred_train', 'label_train', 'final_pred_test', 'label_test'])
(1547,) (1547,) (3396,) (3396,) (1547,) (1547,) 

In [None]:
# 檢視 Label prediction 訓練數據
fr = open('uci/test/data1/y0/result.pkl', 'rb')
df = pickle.load(fr)

# print('Items of result.pkl: ', df.keys())
# print('\nargs: ', df['args'])
print('\noutputs: ', df['outputs'])
# print('\ncurves: ', df['curves'])
# print('\nlr: ', df['lr'])

In [38]:
print(df['outputs'].keys())
for key in df['outputs'].keys():
    print("{} ".format(df['outputs'][key].shape), end='')

dict_keys(['pred_train', 'label_train', 'pred_test', 'label_test'])
(454,) (454,) (242,) (242,) 