In [6]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

import random

In [7]:
header = ['Mcg', 'Gvh', 'Alm', 'Mit', 'Erl', 'Pox', 'Vac', 'Nuc', 'class']
df = pd.read_csv('data/yeast-2_vs_4.dat', names=header, skiprows=13)
print(df.head())
df['class'] = df['class'].apply(lambda x:0 if x=='negative' else 1)

df_np = df.to_numpy()
x = df_np[:,:-1]
y = df_np[:,-1]

print('Dataset X shape:', x.shape)
print('Dataset Y shape:', y.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)

    Mcg   Gvh   Alm   Mit  Erl  Pox   Vac   Nuc     class
0  0.51  0.40  0.56  0.17  0.5  0.5  0.49  0.22  negative
1  0.40  0.39  0.60  0.15  0.5  0.0  0.58  0.30  negative
2  0.40  0.42  0.57  0.35  0.5  0.0  0.53  0.25  negative
3  0.46  0.44  0.52  0.11  0.5  0.0  0.50  0.22  negative
4  0.47  0.39  0.50  0.11  0.5  0.0  0.49  0.40  negative
Dataset X shape: (514, 8)
Dataset Y shape: (514,)
x_train shape: (462, 8)
y_train shape: (462,)
x_test shape: (52, 8)
y_test shape: (52,)


In [8]:
class RUSBoost:
    def __init__(self, x, y, n_classifier, base=None, weights=None):
        """
        Initialize RUSBoost
        
        :param x: input feauture in shape of (samples, features)
        :param y: input label in shape of (samples, )
        :param base: base classifier (default Decision Tree)
        :param n_classifier: number of base classifier in ensemble
        :param weights: init model with pretrained weights
        
        :return: A RUSBoost model
        """
        self.x = x
        self.y = y
        self.base = base
        if self.base is None:
            self.base = DecisionTreeClassifier()
        self.n_classifier = n_classifier
        self.classifiers = []
        self.weights = weights
        self.alpha = []
        
        # init ensemble
        for n in range(self.n_classifier):
            self.classifiers.append(self.base)
        
        if self.weights is None:
            # init weights using uniform distrobution
            self.weights = np.ones((len(self.x))) / len(self.x)
            
    def predict(self, x):
        """
        Predict the class of given instance
        
        :param x: input feauture in shape of (samples, features)
        
        :return: a prediction of classes in label encoded form with shape of (samples, )
        """
        
        prediction = np.zeros((len(x),))
        for idx in range(len(x)):
            prediction[idx] = self.__predict_single_instance(x[idx].reshape(1, -1))
        return prediction
            
    def __predict_single_instance(self, x):
        """
        Predict the class of given instance
        
        :param x: input feauture in shape of (1, features)
        
        :return: a prediction of classes in label encoded form with shape of (1, )
        """
        p = np.zeros((1, 2))
        for n in range(self.n_classifier):
            if self.classifiers[n].predict(x) == 1:
                p[0,1] += np.log(1 / self.alpha[n])
            else:
                p[0,0] += np.log(1 / self.alpha[n])
        eq_idx = (p[:,0] == p[:,1]).nonzero()[0]
        p[eq_idx,self.minor] += 0.1
        return np.argmax(p, axis=1)
    
    
    def fit(self):
        """
        Train the ensemble using RUS data balancing and base weak classifiers
        """
        for t in range(self.n_classifier):            
            # random under sampling
            rus_idx = self.__undersample()

            # training weak classifier
            self.classifiers[t].fit(self.x[rus_idx], self.y[rus_idx], self.weights[rus_idx])
            
            # calculating loss = sum of missclassified weights
            
            miss_w = self.weights[(self.classifiers[t].predict(self.x) != self.y).nonzero()[0]]
            loss = np.sum(miss_w) / 2 
            
            # calculating beta
            self.alpha.append(loss / (1 - loss))
            
            # update weights
            
            correct_pred_idx = (self.classifiers[t].predict(self.x) == self.y).nonzero()[0]
            self.weights = self.weights * self.alpha[t]
            
            # normalize weights
            z = np.sum(self.weights)
            self.weights = np.array([w / z for w in self.weights])
             
    
    def score(self, x, y):
        p = self.predict(x)
        return (p == y).nonzero()[0].__len__() / len(y)
           
    def __undersample(self):
        """
        Generates a random unique subset of majority data as same size as minority and return the indices
        
        :return: A sorted list of indices with shape of (2*minority_data, )
        """
        pos_size = len((self.y==1).nonzero()[0])
        neg_size = len((self.y==0).nonzero()[0])
        pos_data = self.x[self.y==1]
        neg_data = self.x[self.y==0]
        
        if pos_size > neg_size:
            self.major_data = pos_data
            self.minor_data = neg_data
            self.minor = 0
        else:
            self.minor_data = pos_data
            self.major_data = neg_data
            self.minor = 1
        # getting index of sampled intances for enabling correct weight update
        minor_idx = (self.y == self.minor).nonzero()[0]
        major_idx = (self.y == int(not self.minor)).nonzero()[0]
        major_idx = np.array(random.sample(list(major_idx), len(self.minor_data)))
        return sorted(np.concatenate((minor_idx, major_idx)))
        
    

In [194]:
model = RUSBoost(x=x_train, y=y_train, n_classifier=400, base=SVC(gamma='scale'))
model.fit()
model.score(x_test, y_test)

0.9423076923076923

In [200]:
from sklearn.ensemble import AdaBoostClassifier

m = AdaBoostClassifier(base_estimator=SVC(gamma='scale'),n_estimators=30, algorithm='SAMME')
m = m.fit(x_train, y_train)
m.score(x_test, y_test)

0.84615384615384615

In [198]:
from sklearn.ensemble import AdaBoostClassifier

m = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),n_estimators=30, algorithm='SAMME')
m = m.fit(x_train, y_train)
m.score(x_test, y_test)

0.92307692307692313

In [11]:
class AdaBoostM2:
    def __init__(self, x, y, n_classifier, base=None, weights=None):
        """
        Initialize AdaBoost M2 (Weight init is same as M1)
        
        :param x: input feauture in shape of (samples, features)
        :param y: input label in shape of (samples, )
        :param base: base classifier (default Decision Tree)
        :param n_classifier: number of base classifier in ensemble
        :param weights: init model with pretrained weights
        
        :return: A AdaBoost model
        """
        self.x = x
        self.y = y
        self.base = base
        if self.base is None:
            self.base = DecisionTreeClassifier()
        self.n_classifier = n_classifier
        self.classifiers = []
        self.weights = weights
        self.alpha = []
        
        # init ensemble
        for n in range(self.n_classifier):
            self.classifiers.append(self.base)
        
        if self.weights is None:
            # init weights using uniform distrobution
            self.weights = np.ones((len(self.x))) / len(self.x)
            
    def predict(self, x):
        """
        Predict the class of given instance
        
        :param x: input feauture in shape of (samples, features)
        
        :return: a prediction of classes in label encoded form with shape of (samples, )
        """
        
        prediction = np.zeros((len(x),))
        for idx in range(len(x)):
            prediction[idx] = self.__predict_single_instance(x[idx].reshape(1, -1))
        return prediction
            
    def __predict_single_instance(self, x):
        """
        Predict the class of given instance
        
        :param x: input feauture in shape of (1, features)
        
        :return: a prediction of classes in label encoded form with shape of (1, )
        """
        p = np.zeros((1, 2))
        for n in range(self.n_classifier):
            if self.classifiers[n].predict(x) == 1:
                p[0,1] += np.log(1 / self.alpha[n])
            else:
                p[0,0] += np.log(1 / self.alpha[n])
        return np.argmax(p, axis=1)
    
    
    def fit(self):
        """
        Train the ensemble using base weak classifiers
        """
        for t in range(self.n_classifier):            
            
            # training weak classifier
            self.classifiers[t].fit(self.x, self.y, self.weights)
            
            # calculating loss = sum of missclassified weights
            miss_w = self.weights[(self.classifiers[t].predict(self.x) != self.y).nonzero()[0]]
            loss = np.sum(miss_w) / 2 
            
            # calculating beta
            self.alpha.append(loss / (1 - loss))
            
            # update weights
            
            correct_pred_idx = (self.classifiers[t].predict(self.x) == self.y).nonzero()[0]
            self.weights = self.weights * self.alpha[t]
            
            # normalize weights
            z = np.sum(self.weights)
            self.weights = np.array([w / z for w in self.weights])
             
    
    def score(self, x, y):
        p = self.predict(x)
        return (p == y).nonzero()[0].__len__() / len(y)   
    

In [12]:
model = AdaBoostM2(x=x_train, y=y_train, n_classifier=30, base=SVC(gamma='scale'))
model.fit()
model.score(x_test, y_test)

0.9038461538461539