In [32]:
import random
import pandas as pd
import numpy as np
import os
import glob
import tensorflow as tf
from tensorflow.data import Dataset
import tensorflow.keras as keras
from tqdm.auto import tqdm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib.ticker import LinearLocator
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor 
datapath='./drive/MyDrive/project'

In [2]:
all_input_list = sorted(glob.glob(datapath+'/train_input/*.csv'))
all_target_list = sorted(glob.glob(datapath+'/train_target/*.csv'))
train_input_list = all_input_list[:25]
train_target_list = all_target_list[:25]
val_input_list = all_input_list[25:]
val_target_list = all_target_list[25:]

<h2>데이터 fetch, Min-Max scaling, concat </h2>

In [38]:

def CustomDataset(input_paths, target_paths,tradition=False):
  data_list = []
  label_list = []
  input_data=pd.DataFrame()
  target_data=pd.DataFrame()
  print('Data Pre-processing..')
  for input_path, target_path in tqdm(zip(input_paths, target_paths)):
    input_df = pd.read_csv(input_path)
    target_df = pd.read_csv(target_path)
    input_df = input_df.drop(columns=['obs_time'])          
    input_length = int(len(input_df)/24)
    target_length = int(len(target_df))
    for idx in range(target_length):
      temp = input_df[24*idx:24*(idx+1)]
      for i in [6,8,10,12,14]:
        if temp.iloc[23][i]==0.0:
          continue
        else:
          temp.iloc[:,i]=temp.iloc[:,i]/temp.iloc[23][i]
      input_df[24*idx:24*(idx+1)]=temp
    input_data=pd.concat([input_data,input_df])
    target_data=pd.concat([target_data,target_df])
  print(input_data.shape)
  min=np.array(input_data.min()).reshape(1,-1)
  max=np.array(input_data.max()).reshape(1,-1)
  for input_path, target_path in tqdm(zip(input_paths, target_paths)):
    input_df = pd.read_csv(input_path)
    target_df = pd.read_csv(target_path)
    input_df = input_df.drop(columns=['obs_time'])          
    input_length = int(len(input_df)/24)
    target_length = int(len(target_df))
    for idx in range(target_length):
      time_series = input_df[24*idx:24*(idx+1)].values
      for i in [6,8,10,12,14]:
        if time_series[-1][i]==0.0:
          continue
        else:
          time_series[:,i]=time_series[:,i]/time_series[-1][i]
      data_list.append(tf.convert_to_tensor((time_series-min)/(max-min)))
    for label in target_df["predicted_weight_g"]:
      label_list.append(label)
  if tradition:
    return ((input_data-min)/(max-min)),target_data
  else:
    return data_list,label_list

  

In [None]:
#input으로 target의 train의 path, target의 path를 문자열 리스트 형태로 넣습니다.
data,target=CustomDataset(train_input_list,train_target_list)

In [None]:
tf.convert_to_tensor(data).shape #하루치 데이터를 2-D 형태로 넣고, 이이 2D 텐서가 700개 있습니다. 

TensorShape([700, 24, 15])

In [9]:
#traditional regression을 위해서 데이터를 하루치로 바꾸었다 
def convert_for_Dense(data_list):
  data=np.zeros((1,360))
  for tensor in data_list:
    temp=np.array([])
    for time in tensor:
      temp=np.hstack([temp,time])
    data=np.vstack([data,temp])
  data=data[1:]
  index = data[:,0]==0
  nums=[0]
  b=0
  for i in range(1,len(index)):
    if index[i]==True:
      b=b+1
      nums.append(b)
    else:
      nums.append(b)
  nums = np.array(nums).reshape(-1,1)
  data=np.hstack([data,nums])
  return data
data_for_dense = convert_for_Dense(data)[:,:360]

<h1>grid search를 통한 성능측정</h1>
<h3>RadnomForest</h3?

In [53]:
# RandomForest, Lasso, GradientBoosting, AdaBoosting
param_rf = {'n_estimators':[50,100,150,200],'n_jobs':[-1],'max_depth':[1,2,3,4]}
grid_rf = GridSearchCV(RandomForestRegressor(),param_grid=param_rf,cv=5,scoring='r2')
grid_rf.fit(data_for_dense,target)

<h3>LASSO</h3>

In [None]:
param_lasso = {'alpha':[0,0.001,0.01,0.05,0.1,0.5,1,5,10]}
grid_lasso = GridSearchCV(Lasso(),param_grid=param_lasso,cv=5,scoring='r2')
grid_lasso.fit(data_for_dense,target)

<h3>AdaBoost</h3>

In [71]:
param_ada = {'n_estimators':[50,100,150],'learning_rate':[0.01,0.05,0.1,0.5,1,2]}
grid_ada = GridSearchCV(AdaBoostRegressor(),param_grid=param_ada,cv=5,scoring='r2')
grid_ada.fit(data_for_dense,target)

0.6103288922657

<h3>GradientBoost</h3>

In [73]:
param_gdr = {'n_estimators':[50,100,150],'learning_rate':[0.01,0.05,0.1,0.5,1,2],'max_depth':[1,2,3]}
grid_gdr = GridSearchCV(GradientBoostingRegressor(),param_grid=param_gdr,cv=5,scoring='r2')
grid_gdr.fit(data_for_dense,target)

0.5802909304075492

<h3>conclusion</h3>

In [76]:
for model, name in zip([grid_lasso,grid_rf,grid_ada,grid_gdr],['LASSO','RandomForest','Adaboost','GradientBoost']):
  print('----------------------------------------')
  print('model:',name)
  print('model_parameters:',model.param_grid)
  print('best_score:',model.best_score_)
  print('best_estimator:',model.best_estimator_)
  print('----------------------------------------')

----------------------------------------
model: LASSO
model_parameters: {'alpha': [0, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
best_score: 0.5068933730226999
best_estimator: Lasso(alpha=0.05)
----------------------------------------
----------------------------------------
model: RandomForest
model_parameters: {'n_estimators': [50, 100, 150, 200], 'n_jobs': [-1], 'max_depth': [1, 2, 3, 4]}
best_score: 0.5824829315169876
best_estimator: RandomForestRegressor(max_depth=4, n_estimators=50, n_jobs=-1)
----------------------------------------
----------------------------------------
model: Adaboost
model_parameters: {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2]}
best_score: 0.6103288922657
best_estimator: AdaBoostRegressor(learning_rate=0.1, n_estimators=150)
----------------------------------------
----------------------------------------
model: GradientBoost
model_parameters: {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2], 'm