## Ensemble Learning
method: bagging & stacking

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
from scipy import stats
from sklearn import preprocessing

In [2]:
def sample(data, size):
    if(size > data.shape[0]):
        print("size greater than the data size!")
        size = data.shape[0]
    idx = random.sample(range(0, data.shape[0]), size)
    return data.iloc[idx, :], idx

def category2num(data):
    num2Cat = {}
    for x in data.columns:
        if data[x].dtype != 'float' and data[x].dtype != 'int64':
            num = preprocessing.LabelEncoder()
            num.fit(data[x].astype(str))
            data[x] = num.transform(data[x].astype(str))
            num2Cat[x] = num
    return data, num2Cat

In [3]:
# simple stacking
class ensembleLearning:
    def __init__(self):
        self.model = []
        pass
    
    def trainForSub(self, feature, label):
        model = DecisionTreeClassifier(criterion = 'entropy')
        model.fit(feature, label)
        self.model.append(model)
        
    def intergrate(self, feature, label):
        result = []
        for m in self.model:
            result.append(m.predict(feature))
        result = np.array(result).transpose()

        model = DecisionTreeClassifier(criterion = 'entropy')
        new_feature = category2num(pd.DataFrame(result[:, :-1]))[0]
        new_label = np.array(result[:, -1])
        model.fit(new_feature, new_label)
        self.sec_model = model    
    
    def predict(self, feature):
        result = []
        for m in self.model:
            result.append(m.predict(feature))
        result = np.array(result).transpose()
        return result
    
    def bagging(self, feature):
        models_result = self.predict(feature)
        print(models_result.shape)
        result = [pd.Series.mode(x)[0] for x in models_result]
        return result

        
    def stacking(self, train_data, feature):
        self.intergrate(train_data.iloc[:, :-1], train_data.iloc[:, -1])
        result = self.predict(feature)
        new_feature = category2num(pd.DataFrame(result[:, :-1]))[0]
        result = self.sec_model.predict(new_feature)   
        return result
        

In [4]:
# revise the file route if necessary
file = "../data/iris.csv"
data = pd.read_csv(file, header = 0)
train_data, idx = sample(data, round(data.shape[0] * 0.7))
test_data = data.drop(idx)

In [5]:
EL = ensembleLearning()

sub_model_num = 10
train_size = round(train_data.shape[0] * 0.3)

for i in range(sub_model_num):
    sample_data = sample(train_data, train_size)
    sample_feature = train_data.iloc[:, :-1]
    sample_label = train_data.iloc[:, -1]
    EL.trainForSub(sample_feature, sample_label)
EL.intergrate(train_data.iloc[:, :-1], train_data.iloc[:, -1])

## bagging

In [6]:
bagging_result = EL.bagging(test_data.iloc[:, :-1])

(45, 10)


In [7]:
check = np.where(bagging_result == test_data.iloc[:, -1])
print(len(check[0])/test_data.shape[0])

0.9555555555555556


## stacking

In [8]:
stacking_result = EL.stacking(train_data, test_data.iloc[:, :-1])

In [9]:
check = np.where(stacking_result == test_data.iloc[:, -1])
print(len(check[0])/test_data.shape[0])

0.9555555555555556


## test on the sigle one model
***model type: decision tree (with entropy as criterion)***

In [10]:
model = DecisionTreeClassifier(criterion = 'entropy')
model.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])
r = model.predict(test_data.iloc[:, :-1])
check = np.where(r == test_data.iloc[:, -1])
print(len(check[0])/test_data.shape[0])

0.9555555555555556
