In [1]:
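# Learning on big data
# : because of memory limits (among other constraints), models of the following kinds are mainly used
# - models for which a prior probability distribution can be set
# - models for which the starting weights can be set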

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from matplotlib import rc
import matplotlib as mpl
import warnings
warnings.filterwarnings(action="ignore")
rc('font', family="AppleGothic")
%matplotlib inline

### Get data
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the Covertype dataset, shuffled with a fixed seed.
covtype = fetch_covtype(shuffle=True, random_state=0)
X_covtype = covtype.data
# Shift every label down by one; later cells compare these labels against
# np.argmax() column indices, so they are used as 0-based class ids.
y_covtype = covtype.target - 1
classes = np.unique(y_covtype)

# train:test = 0.75:0.25 (default). The split is seeded so that the
# accuracies printed by the cells below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_covtype, y_covtype, random_state=0)

scaler = MinMaxScaler()
# Learn the per-feature min and max on the training set only, then rescale.
X_train = scaler.fit_transform(X_train)
# Reuse the training-set min/max so no test-set statistics leak into the scaler.
X_test = scaler.transform(X_test)

def read_Xy(start, end, X_source=None, y_source=None):
    """Return one contiguous chunk of the training data.

    Stands in for reading a batch from storage: rows ``start`` (inclusive)
    through ``end`` (exclusive) are returned, with ``end`` clipped to the
    number of available rows.

    Parameters
    ----------
    start : int
        Index of the first row of the chunk.
    end : int
        Index one past the last row of the chunk; values beyond the data
        length are clipped.
    X_source, y_source : sequence, optional
        Feature rows and labels to slice. When omitted, the module-level
        ``X_train`` / ``y_train`` are used.

    Returns
    -------
    (X, y)
        The sliced features and labels. Slices are used instead of an index
        list, so for NumPy inputs the results are views, not copies.
    """
    if X_source is None:
        X_source = X_train
    if y_source is None:
        y_source = y_train
    # Clip to len(y_source), not len(y_source) - 1: the stop index is already
    # exclusive, so subtracting one would silently drop the last sample.
    stop = min(len(y_source), end)
    return X_source[start:stop], y_source[start:stop]

In [2]:
### SGD
### : 일부데이터를 사용하여 초기 가중치 계산 가능
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

model = SGDClassifier(random_state=0)
n_split = 10
n_X = len(y_train) // n_split
n_epoch = 10
for epoch in range(n_epoch):
    for n in range(n_split):
        X, y = read_Xy(n*n_X, (n+1)*n_X)
        model.partial_fit(X, y, classes=classes)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test  = accuracy_score(y_test, model.predict(X_test))
    print("epoch={:d}, train acc.={:5.3f}, test acc.={:5.3f}".format(epoch, accuracy_train, accuracy_test))
%time

epoch=0, train acc.=0.706, test acc.=0.707
epoch=1, train acc.=0.709, test acc.=0.709
epoch=2, train acc.=0.710, test acc.=0.710
epoch=3, train acc.=0.712, test acc.=0.713
epoch=4, train acc.=0.712, test acc.=0.712
epoch=5, train acc.=0.712, test acc.=0.712
epoch=6, train acc.=0.712, test acc.=0.712
epoch=7, train acc.=0.712, test acc.=0.712
epoch=8, train acc.=0.712, test acc.=0.712
epoch=9, train acc.=0.712, test acc.=0.712
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


In [3]:
### Naive Bayes
### : 일부 데이터를 이용하여 구한 확률분포를 사전확률분포로 사용 가능
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

model = BernoulliNB()
n_split = 10
n_X = len(y_train) // n_split
for n in range(n_split):
    X, y = read_Xy(n*n_X, (n+1)*n_X)
    model.partial_fit(X, y, classes=classes)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test  = accuracy_score(y_test, model.predict(X_test))
    print("n={:d}, train acc.={:5.3f}, test acc.={:5.3f}".format(n, accuracy_train, accuracy_test))
%time

n=0, train acc.=0.630, test acc.=0.629
n=1, train acc.=0.633, test acc.=0.632
n=2, train acc.=0.632, test acc.=0.631
n=3, train acc.=0.633, test acc.=0.632
n=4, train acc.=0.633, test acc.=0.632
n=5, train acc.=0.634, test acc.=0.632
n=6, train acc.=0.634, test acc.=0.632
n=7, train acc.=0.633, test acc.=0.632
n=8, train acc.=0.632, test acc.=0.631
n=9, train acc.=0.633, test acc.=0.632
CPU times: user 11 µs, sys: 2 µs, total: 13 µs
Wall time: 11 µs


In [4]:
### Gradient Boosting
### : 초기 커미티 멤버로 일부 데이터를 사용하여 학습한 모형을 사용할 수 있음
from lightgbm import train, Dataset
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

params = {
    'objective': 'multiclass', 
    'num_class': len(classes),
    'learning_rate': 0.2, 
    'seed':0,    
}

n_split = 10
n_X = len(y_train) // n_split
num_tree = 10
model = None
for n in range(n_split):
    X, y = read_Xy(n*n_X, (n+1)*n_X)
    model = train(params, init_model=model, train_set=Dataset(X,y),
                  keep_training_booster=False, num_boost_round=num_tree)
    accuracy_train = accuracy_score(y_train, np.argmax(model.predict(X_train), axis=1))
    accuracy_test  = accuracy_score(y_test, np.argmax(model.predict(X_test), axis=1))
    print("n={:d}, train acc.={:5.3f}, test acc.={:5.3f}".format(n, accuracy_train, accuracy_test))
%time

n=0, train acc.=0.776, test acc.=0.774
n=1, train acc.=0.796, test acc.=0.792
n=2, train acc.=0.812, test acc.=0.809
n=3, train acc.=0.827, test acc.=0.823
n=4, train acc.=0.835, test acc.=0.830
n=5, train acc.=0.844, test acc.=0.839
n=6, train acc.=0.848, test acc.=0.842
n=7, train acc.=0.854, test acc.=0.847
n=8, train acc.=0.858, test acc.=0.851
n=9, train acc.=0.847, test acc.=0.840
CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 10 µs


In [6]:
### Random Forest
### : Ensemble 계열에서는 일부 데이터를 사용한 모형을 개별 분류기
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

n_split = 10
n_X = len(y_train) // n_split
num_tree_init = 10
num_tree_step = 10
model = RandomForestClassifier(n_estimators=num_tree_init, warm_start=True)
for n in range(n_split):
    X, y = read_Xy(n*n_X, (n+1)*n_X)
    model.fit(X, y)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test = accuracy_score(y_test, model.predict(X_test))
    print("epoch={:d} train accuracy={:5.3f} test accuracy={:5.3f}".format(n, accuracy_train, accuracy_test))
    
    model.n_estimators += num_tree_step
%time
    

epoch=0 train accuracy=0.866 test accuracy=0.852
epoch=1 train accuracy=0.890 test accuracy=0.873
epoch=2 train accuracy=0.899 test accuracy=0.880
epoch=3 train accuracy=0.902 test accuracy=0.883
epoch=4 train accuracy=0.904 test accuracy=0.885
epoch=5 train accuracy=0.906 test accuracy=0.886
epoch=6 train accuracy=0.907 test accuracy=0.887
epoch=7 train accuracy=0.907 test accuracy=0.888
epoch=8 train accuracy=0.908 test accuracy=0.888
epoch=9 train accuracy=0.909 test accuracy=0.888
CPU times: user 13 µs, sys: 1e+03 ns, total: 14 µs
Wall time: 12.9 µs
