In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
%matplotlib inline

# 데이터 불러오기

In [2]:
train_features = pd.read_csv('/content/drive/MyDrive/DataAnalysis/운동 동작 분류 AI 경진대회/data/train_features.csv')
train_labels = pd.read_csv('/content/drive/MyDrive/DataAnalysis/운동 동작 분류 AI 경진대회/data/train_labels.csv')

# 데이터 전처리

간단하게 만들기 위해서 평균, 최대, 최소, 중앙 값만을 활용함

In [3]:
train_val = train_features.drop('time', axis=1).groupby('id').agg(['mean', 'min', 'max', 'median']).values

ag_cols = ['acc_x',	'acc_y',	'acc_z',	'gy_x',	'gy_y',	'gy_z']
des_cols = ['mean', 'min', 'max', 'median']
result_cols = []

for ag_col in ag_cols:
  for des_col in des_cols:
    result_cols.append(ag_col+'_'+des_col)

X = pd.DataFrame(data = train_val, columns = result_cols)
y = train_labels[['label']]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 22, shuffle = True, stratify = y)

In [5]:
train_dummy = pd.get_dummies(y_train['label'])
test_dummy = pd.get_dummies(y_test['label'])

# 개별 모델 만들기

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
import xgboost as xgb

from tqdm import tqdm_notebook

from sklearn.model_selection import StratifiedKFold

n_split = 5
skf = StratifiedKFold(n_splits = n_split, shuffle=True, random_state=22)

rf_train_proba = np.zeros(train_dummy.shape)
lgr_train_proba = np.zeros(train_dummy.shape)
lg_train_proba = np.zeros(train_dummy.shape)
xg_train_proba = np.zeros(train_dummy.shape)

rf_test_proba = np.zeros(test_dummy.shape)
lgr_test_proba = np.zeros(test_dummy.shape)
lg_test_proba = np.zeros(test_dummy.shape)
xg_test_proba = np.zeros(test_dummy.shape)

for trn_idx, val_idx in tqdm_notebook(skf.split(X_train, y_train)):
  trn_data, trn_label = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
  val_data, val_label = X_train.iloc[val_idx], y_train.iloc[val_idx]

  # 랜덤포레스트
  rf = RandomForestClassifier(random_state=22)
  rf.fit(X_train, y_train)

  rf_train_proba[val_idx] = rf.predict_proba(val_data)
  rf_test_proba += rf.predict_proba(X_test) / n_split

  # 로지스틱
  lgr = LogisticRegression(random_state=22)
  lgr.fit(X_train, y_train)

  lgr_train_proba[val_idx] = lgr.predict_proba(val_data)
  lgr_test_proba += lgr.predict_proba(X_test) / n_split

  # lightgbm
  lg = lgbm.LGBMClassifier(random_state=22)
  lg.fit(X_train, y_train)

  lg_train_proba[val_idx] = lg.predict_proba(val_data)
  lg_test_proba += lg.predict_proba(X_test) / n_split

  # xgboost
  xg = xgb.XGBClassifier(random_state=22)
  xg.fit(X_train, y_train)

  xg_train_proba[val_idx] = xg.predict_proba(val_data)
  xg_test_proba += xg.predict_proba(X_test) / n_split

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# 개별 모델 평가

In [8]:
from sklearn.metrics import log_loss

print("RandomForestClassifier        :{} ".format(log_loss(test_dummy, rf_test_proba)))
print("LogisticRegression            :{} ".format(log_loss(test_dummy, lgr_test_proba)))
print("lightgbm                      :{} ".format(log_loss(test_dummy, lg_test_proba)))
print("xgboost                       :{} ".format(log_loss(test_dummy, xg_test_proba)))

RandomForestClassifier        :2.0356720182080115 
LogisticRegression            :2.5661370099627994 
lightgbm                      :1.5651821756741704 
xgboost                       :1.1922718353375807 


성능이 좋지 않은 모델도 스택킹을 하는 것이기 때문에 스택깅을 통한 성능 향상은 어려울 것으로 판단됨

# 메타 모델 만들기

predict_proba를 사용하는 이유는 스택킹을 할때 데이터를 쌓아야 하는데

이미 예측된 값인 경우에는 쌓을 수 없기 때문에 획률 값을 사용하는 것!

In [9]:
meta_train_proba = np.concatenate([rf_train_proba, lgr_train_proba, lg_train_proba, xg_train_proba], axis=1)
meta_test_proba = np.concatenate([rf_test_proba, lgr_test_proba, lg_test_proba, xg_test_proba], axis=1)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
import xgboost as xgb

# 랜덤포레스트
rf = RandomForestClassifier(random_state=22)
rf.fit(meta_train_proba, y_train)

rf_proba = rf.predict_proba(meta_test_proba)

# 로지스틱
lgr = LogisticRegression(random_state=22)
lgr.fit(meta_train_proba, y_train)

lgr_proba = lgr.predict_proba(meta_test_proba)

# lightgbm
lg = lgbm.LGBMClassifier(random_state=22)
lg.fit(meta_train_proba, y_train)

lg_proba = lg.predict_proba(meta_test_proba)

# xgboost
xg = xgb.XGBClassifier(random_state=22)
xg.fit(meta_train_proba, y_train)

xg_proba = xg.predict_proba(meta_test_proba)

# 메타 모델 평가

In [11]:
from sklearn.metrics import log_loss

print("RandomForestClassifier        :{} ".format(log_loss(test_dummy, rf_proba)))
print("LogisticRegression            :{} ".format(log_loss(test_dummy, lgr_proba)))
print("lightgbm                      :{} ".format(log_loss(test_dummy, lg_proba)))
print("xgboost                       :{} ".format(log_loss(test_dummy, xg_proba)))

RandomForestClassifier        :5.530570709349575 
LogisticRegression            :1.62275361301259 
lightgbm                      :2.6617930803017216 
xgboost                       :2.105178712350875 


LogisticRegression은 향상되었지만 다른 모델은 성능이 더 하락했다는 것을 알 수 있음