In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection  import train_test_split
import numpy as np
from tqdm import tqdm

In [2]:
breast_cancer = load_breast_cancer()

In [3]:
X = breast_cancer.data
y = breast_cancer.target
y[y==0] = -1

In [4]:
# 划分数据
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape,X_test.shape)

(426, 30) (143, 30)


In [5]:
# 构造弱分类器的决策函数g(X)
def _G(fi,fv,direct):
    assert direct in ["positive","nagetive"]
    def _g(X):
        if direct  == "positive":
            predict = (X[:,fi] <= fv) * -1 # which <= value assign -1 else 0
        else:
            predict = (X[:,fi] > fv) * -1 # which > value assign 0 else -1
        predict[predict == 0] = 1
        return predict 
    return _g

In [6]:
#验证是否有效
func = _G(0,5,"positive")
print(func(X_test))
result = np.zeros_like(X_test[:,0],dtype=np.int32)
result[X_test[:,0]<=5] = -1
result[X_test[:,0]>5] = 1
print(result)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [7]:
arr = X_test[:,0]
index = np.argsort(arr)
predict = np.zeros_like(arr,dtype=np.int32)
# direct = postive
predict[arr <= 5] = -1
predict[arr > 5] = 1
# direct = nagetive
predict[arr <= 5] = 1
predict[arr > 5] = -1

In [8]:
#选择最佳的划分点,即求出fi和fv
def best_split(X,y,w):
    best_err = 1e10
    best_fi = None
    best_fv = None
    best_direct = None
    for fi in range(X.shape[1]):
        series = X[:,fi]
        for fv in np.sort(series):
            predict = np.zeros_like(arr,dtype=np.int32)
            # direct = postive
            predict[arr <= fv] = -1
            predict[arr > fv] = 1
            err = np.sum((predict != y)* 1 * w)
            print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"postive"))
            if err < best_err:
                best_err = err
                best_fi = fi
                best_fv = fv
                best_direct = "positive"
            
            # direct = nagetive
            predict = predict * -1
            err = np.sum((predict != y) * 1 * w)
            if err < best_err:
                best_err = err
                best_fi = fi
                best_fv = fv
                best_direct = "nagetive"
            print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"nagetive"))
    return best_err,best_fi,best_fv

In [9]:
w = np.ones_like(y_test) / len(y_test)
best_split(X_test,y_test,w)

err = 0.3916083916083916 ,fi=0,fv=7.691,direct=postive
err = 0.6083916083916083 ,fi=0,fv=7.691,direct=nagetive
err = 0.39860139860139865 ,fi=0,fv=7.729,direct=postive
err = 0.6013986013986014 ,fi=0,fv=7.729,direct=nagetive
err = 0.4055944055944056 ,fi=0,fv=8.196,direct=postive
err = 0.5944055944055944 ,fi=0,fv=8.196,direct=nagetive
err = 0.4125874125874126 ,fi=0,fv=8.597,direct=postive
err = 0.5874125874125873 ,fi=0,fv=8.597,direct=nagetive
err = 0.4195804195804196 ,fi=0,fv=8.888,direct=postive
err = 0.5804195804195803 ,fi=0,fv=8.888,direct=nagetive
err = 0.42657342657342656 ,fi=0,fv=9.0,direct=postive
err = 0.5734265734265733 ,fi=0,fv=9.0,direct=nagetive
err = 0.4335664335664336 ,fi=0,fv=9.029,direct=postive
err = 0.5664335664335663 ,fi=0,fv=9.029,direct=nagetive
err = 0.44055944055944063 ,fi=0,fv=9.042,direct=postive
err = 0.5594405594405594 ,fi=0,fv=9.042,direct=nagetive
err = 0.4475524475524476 ,fi=0,fv=9.173,direct=postive
err = 0.5524475524475524 ,fi=0,fv=9.173,direct=nagetive
er

err = 0.38461538461538464 ,fi=4,fv=0.1425,direct=postive
err = 0.6153846153846153 ,fi=4,fv=0.1425,direct=nagetive
err = 0.38461538461538464 ,fi=4,fv=0.1634,direct=postive
err = 0.6153846153846153 ,fi=4,fv=0.1634,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.03116,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.03116,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.03393,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.03393,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.03398,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.03398,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.03574,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.03574,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.03729,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.03729,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.04052,direct=postive
err = 0.6153846153846153 ,fi=5,fv=0.04052,direct=nagetive
err = 0.38461538461538464 ,fi=5,fv=0.04413,direct=postive
err = 0.6153846153

err = 0.38461538461538464 ,fi=8,fv=0.1809,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1809,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1812,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1812,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1814,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1814,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1816,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1816,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.182,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.182,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.183,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.183,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1842,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1842,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1842,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0.1842,direct=nagetive
err = 0.38461538461538464 ,fi=8,fv=0.1845,direct=postive
err = 0.6153846153846153 ,fi=8,fv=0

err = 0.6153846153846153 ,fi=12,fv=0.8484,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=0.9812,direct=postive
err = 0.6153846153846153 ,fi=12,fv=0.9812,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.094,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.094,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.109,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.109,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.115,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.115,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.126,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.126,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.143,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.143,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.144,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.144,direct=nagetive
err = 0.38461538461538464 ,fi=12,fv=1.231,direct=postive
err = 0.6153846153846153 ,fi=12,fv=1.231,direct=nagetive
err = 0.38461538461538464 ,f

err = 0.6153846153846153 ,fi=15,fv=0.02025,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02075,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02075,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02101,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02101,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02114,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02114,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02172,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02172,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02178,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02178,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02199,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02199,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02219,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02219,direct=nagetive
err = 0.38461538461538464 ,fi=15,fv=0.02222,direct=postive
err = 0.6153846153846153 ,fi=15,fv=0.02222,direct=nageti

err = 0.38461538461538464 ,fi=19,fv=0.002278,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002278,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002281,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002281,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002295,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002295,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002299,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002299,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.00233,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.00233,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002336,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002336,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002365,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002365,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.002422,direct=postive
err = 0.6153846153846153 ,fi=19,fv=0.002422,direct=nagetive
err = 0.38461538461538464 ,fi=19,fv=0.0024

err = 0.6153846153846153 ,fi=22,fv=125.9,direct=postive
err = 0.38461538461538464 ,fi=22,fv=125.9,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=126.3,direct=postive
err = 0.38461538461538464 ,fi=22,fv=126.3,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=127.3,direct=postive
err = 0.38461538461538464 ,fi=22,fv=127.3,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=128.2,direct=postive
err = 0.38461538461538464 ,fi=22,fv=128.2,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=129.0,direct=postive
err = 0.38461538461538464 ,fi=22,fv=129.0,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=129.0,direct=postive
err = 0.38461538461538464 ,fi=22,fv=129.0,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=129.1,direct=postive
err = 0.38461538461538464 ,fi=22,fv=129.1,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=129.2,direct=postive
err = 0.38461538461538464 ,fi=22,fv=129.2,direct=nagetive
err = 0.6153846153846153 ,fi=22,fv=133.3,direct=postive
err = 0.38461538461538464 ,fi=22

err = 0.38461538461538464 ,fi=27,fv=0.02579,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.02579,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.02832,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.02832,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03002,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.03002,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03125,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.03125,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03194,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.03194,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03264,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.03264,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.0377,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.0377,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03846,direct=postive
err = 0.6153846153846153 ,fi=27,fv=0.03846,direct=nagetive
err = 0.38461538461538464 ,fi=27,fv=0.03922,direct=postive

(0.09090909090909091, 0, 14.99)

In [10]:
# 构建Adaboost类
class MyAdaboost:
    def __init__(self,n_estimators):
        self.n_estimators = n_estimators
        self.clfs = [lambda x:0 for i in range(self.n_estimators)]
        self.alphas = [0 for i in range(self.n_estimators)]
        self.weights = None
        
    # 构造弱分类器的决策函数g(X)
    def _G(self,fi,fv,direct):
        assert direct in ["positive","nagetive"]
        def _g(X):
            if direct  == "positive":
                predict = (X[:,fi] <= fv) * -1 # which <= value assign -1 else 0
            else:
                predict = (X[:,fi] > fv) * -1 # which > value assign 0 else -1
            predict[predict == 0] = 1
            return predict 
        return _g
    
    #选择最佳的划分点,即求出fi和fv
    def _best_split(self,X,y,w):
        best_err = 1e10
        best_fi = None
        best_fv = None
        best_direct = None
        for fi in range(X.shape[1]):
            series = X[:,fi]
            for fv in np.sort(series):
                predict = np.zeros_like(series,dtype=np.int32)
                # direct = postive
                predict[series <= fv] = -1
                predict[series > fv] = 1
                err = np.sum((predict != y)* 1 * w)
#                 print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"postive"))
                if err < best_err:
                    best_err = err
                    best_fi = fi
                    best_fv = fv
                    best_direct = "positive"

                # direct = nagetive
                predict = predict * -1
                err = np.sum((predict != y) * 1 * w)
                if err < best_err:
                    best_err = err
                    best_fi = fi
                    best_fv = fv
                    best_direct = "nagetive"
#                 print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"nagetive"))
        return best_err,best_fi,best_fv,best_direct
    
    def fit(self,X_train,y_train):
        self.weights = np.ones_like(y_train) / len(y_train)
        for i in tqdm(range(self.n_estimators)):
            err,fi,fv,direct = self._best_split(X_train,y_train,self.weights)
#             print(i,err,fi,fv,direct)
            
            #计算G(x)的系数alpha
            alpha = 0.5 * np.log((1-err)/err) if err !=0 else 1
#             print("alpha:",alpha)
            self.alphas[i] = alpha
            
            #求出G
            g = self._G(fi,fv,direct)
            self.clfs[i] = g
            
            if err == 0: break
            
            #更新weights
            self.weights = self.weights * np.exp(-1 * alpha * y_train * g(X_train))
            self.weights = self.weights / np.sum(self.weights)
#             print("weights :",self.weights)
    
    def predict(self,X_test):
        y_p = np.array([self.alphas[i] * self.clfs[i](X_test) for i in range(self.n_estimators)])
        y_p = np.sum(y_p,axis=0)
        y_predict = np.zeros_like(y_p,dtype=np.int32)
        y_predict[y_p>=0] = 1
        y_predict[y_p<0] = -1
        return y_predict
    
    def score(self,X_test,y_test):
        y_predict = self.predict(X_test)
        return np.sum(y_predict == y_test)/len(y_predict)

In [11]:
clf = MyAdaboost(100)
clf.fit(X_train,y_train)

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.71it/s]


In [12]:
clf.alphas

[1.2553075032491037,
 1.00223625158219,
 0.7564121376171518,
 0.743179115525454,
 0.6646018343363124,
 0.5623362428956173,
 0.5447046422271071,
 0.4961513784824917,
 0.5116427126579808,
 0.4633005715626341,
 0.363869021998095,
 0.4461213951780325,
 0.38687293824616004,
 0.436469894115532,
 0.4068232618678668,
 0.3431695906974302,
 0.43430881874273386,
 0.46254989755861237,
 0.41176459789049147,
 0.38306726256433205,
 0.4691404180088417,
 0.4002486594548783,
 0.37137239433829505,
 0.44717432844112764,
 0.43655520309501494,
 0.4696359000429163,
 0.46145787113449255,
 0.3708026618339286,
 0.31645394125367204,
 0.36768194747909466,
 0.3651591002732761,
 0.38074049169723584,
 0.49474623012605773,
 0.4279934806377916,
 0.44965506313450665,
 0.36539886522516285,
 0.37868089451834563,
 0.3703446889301952,
 0.30581846153645176,
 0.39848796079213,
 0.4181963638691008,
 0.34565948647975164,
 0.3784020416060692,
 0.3698519900345699,
 0.2985909081952332,
 0.3504957548164121,
 0.3074836636892145,
 0

In [13]:
clf.score(X_test,y_test)

0.958041958041958