In [119]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection  import train_test_split
import numpy as np
from tqdm import tqdm

In [107]:
breast_cancer = load_breast_cancer()

In [110]:
X = breast_cancer.data
y = breast_cancer.target
y[y==0] = -1

In [111]:
# 划分数据
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape,X_test.shape)

(426, 30) (143, 30)


In [64]:
# 构造弱分类器的决策函数g(X)
def _G(fi,fv,direct):
    assert direct in ["positive","nagetive"]
    def _g(X):
        if direct  == "positive":
            predict = (X[:,fi] <= fv) * -1 # which <= value assign -1 else 0
        else:
            predict = (X[:,fi] > fv) * -1 # which > value assign 0 else -1
        predict[predict == 0] = 1
        return predict 
    return _g

In [65]:
#验证是否有效
func = _G(0,5,"positive")
print(func(X_test))
result = np.zeros_like(X_test[:,0],dtype=np.int32)
result[X_test[:,0]<=5] = -1
result[X_test[:,0]>5] = 1
print(result)

[ 1 -1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1  1
 -1]
[ 1 -1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1  1
 -1]


In [39]:
arr = X_test[:,0]
index = np.argsort(arr)
predict = np.zeros_like(arr,dtype=np.int32)
# direct = postive
predict[arr <= 5] = -1
predict[arr > 5] = 1
# direct = nagetive
predict[arr <= 5] = 1
predict[arr > 5] = -1

[5.8 5.1 5.7 5.4 5.7 7.  5.7 4.5 5.  5.3 6.9 5.2 4.9 4.7 4.9 4.8 6.4 5.
 5.1 5.1 4.4 5.  5.2 5.7 4.6]
[20  7 24 13 15 14 12  8 21 17 18 19  1 22 11  9  3  2 23  4  6  0 16 10
  5]


In [100]:
#选择最佳的划分点,即求出fi和fv
def best_split(X,y,w):
    best_err = 1e10
    best_fi = None
    best_fv = None
    best_direct = None
    for fi in range(X.shape[1]):
        series = X[:,fi]
        for fv in np.sort(series):
            predict = np.zeros_like(arr,dtype=np.int32)
            # direct = postive
            predict[arr <= fv] = -1
            predict[arr > fv] = 1
            err = np.sum((predict != y)* 1 * w)
            print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"postive"))
            if err < best_err:
                best_err = err
                best_fi = fi
                best_fv = fv
                best_direct = "positive"
            
            # direct = nagetive
            predict = predict * -1
            err = np.sum((predict != y) * 1 * w)
            if err < best_err:
                best_err = err
                best_fi = fi
                best_fv = fv
                best_direct = "nagetive"
            print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"nagetive"))
    return best_err,best_fi,best_fv

In [63]:
w = np.ones_like(y_test) / len(y_test)
best_split(X_test,y_test,w)

err = 0.39999999999999997 ,fi=0,fv=4.4,direct=postive
err = 0.6000000000000001 ,fi=0,fv=4.4,direct=nagetive
err = 0.4 ,fi=0,fv=4.6,direct=postive
err = 0.6000000000000001 ,fi=0,fv=4.6,direct=nagetive
err = 0.36 ,fi=0,fv=4.7,direct=postive
err = 0.6400000000000001 ,fi=0,fv=4.7,direct=nagetive
err = 0.36 ,fi=0,fv=4.7,direct=postive
err = 0.6400000000000001 ,fi=0,fv=4.7,direct=nagetive
err = 0.4 ,fi=0,fv=4.8,direct=postive
err = 0.6000000000000001 ,fi=0,fv=4.8,direct=nagetive
err = 0.48 ,fi=0,fv=5.1,direct=postive
err = 0.52 ,fi=0,fv=5.1,direct=nagetive
err = 0.48 ,fi=0,fv=5.1,direct=postive
err = 0.52 ,fi=0,fv=5.1,direct=nagetive
err = 0.48 ,fi=0,fv=5.1,direct=postive
err = 0.52 ,fi=0,fv=5.1,direct=nagetive
err = 0.56 ,fi=0,fv=5.4,direct=postive
err = 0.44 ,fi=0,fv=5.4,direct=nagetive
err = 0.56 ,fi=0,fv=5.5,direct=postive
err = 0.44 ,fi=0,fv=5.5,direct=nagetive
err = 0.56 ,fi=0,fv=5.5,direct=postive
err = 0.44 ,fi=0,fv=5.5,direct=nagetive
err = 0.56 ,fi=0,fv=5.5,direct=postive
err = 0.4

(0.36, 0, 4.7)

In [144]:
# 构建Adaboost类
class MyAdaboost:
    def __init__(self,n_estimators):
        self.n_estimators = n_estimators
        self.clfs = []
        self.alphas = []
        self.weights = None
        
    # 构造弱分类器的决策函数g(X)
    def _G(self,fi,fv,direct):
        assert direct in ["positive","nagetive"]
        def _g(X):
            if direct  == "positive":
                predict = (X[:,fi] <= fv) * -1 # which <= value assign -1 else 0
            else:
                predict = (X[:,fi] > fv) * -1 # which > value assign 0 else -1
            predict[predict == 0] = 1
            return predict 
        return _g
    
    #选择最佳的划分点,即求出fi和fv
    def _best_split(self,X,y,w):
        best_err = 1e10
        best_fi = None
        best_fv = None
        best_direct = None
        for fi in range(X.shape[1]):
            series = X[:,fi]
            for fv in np.sort(series):
                predict = np.zeros_like(series,dtype=np.int32)
                # direct = postive
                predict[series <= fv] = -1
                predict[series > fv] = 1
                err = np.sum((predict != y)* 1 * w)
#                 print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"postive"))
                if err < best_err:
                    best_err = err
                    best_fi = fi
                    best_fv = fv
                    best_direct = "positive"

                # direct = nagetive
                predict = predict * -1
                err = np.sum((predict != y) * 1 * w)
                if err < best_err:
                    best_err = err
                    best_fi = fi
                    best_fv = fv
                    best_direct = "nagetive"
#                 print("err = {} ,fi={},fv={},direct={}".format(err,fi,fv,"nagetive"))
        return best_err,best_fi,best_fv,best_direct
    
    def fit(self,X_train,y_train):
        self.weights = np.ones_like(y_train) / len(y_train)
        for i in tqdm(range(self.n_estimators)):
            err,fi,fv,direct = self._best_split(X_train,y_train,self.weights)
#             print(i,err,fi,fv,direct)
            
            #计算G(x)的系数alpha
            alpha = 0.5 * np.log((1-err)/err)
#             print("alpha:",alpha)
            self.alphas.append(alpha)
            
            #求出G
            g = self._G(fi,fv,direct)
            self.clfs.append(g)
            
            #更新weights
            self.weights = self.weights * np.exp(-1 * alpha * y_train * g(X_train))
            self.weights = self.weights / np.sum(self.weights)
#             print("weights :",self.weights)
    
    def predict(self,X_test):
        y_p = np.array([self.alphas[i] * self.clfs[i](X_test) for i in range(self.n_estimators)])
        y_p = np.sum(y_p,axis=0)
        y_predict = np.zeros_like(y_p,dtype=np.int32)
        y_predict[y_p>=0] = 1
        y_predict[y_p<0] = -1
        return y_predict
    
    def score(self,X_test,y_test):
        y_predict = self.predict(X_test)
        return np.sum(y_predict == y_test)/len(y_predict)

In [None]:
clf = MyAdaboost(100)
clf.fit(X_train,y_train)

 15%|██            | 15/100 [00:10<01:02,  1.37it/s]

In [142]:
clf.alphas

[1.2066797492479762,
 0.9550334856259718,
 0.7280236332199983,
 0.574744258124658,
 0.5305756803245693,
 0.5488124453500542,
 0.5164893968884653,
 0.7115468708060274,
 0.5488704763454592,
 0.46235040922771986,
 0.4883250757386486,
 0.5061749527598064,
 0.4240888782159666,
 0.37139973176105384,
 0.43847670325361054,
 0.3689905768000123,
 0.5068965593580707,
 0.43674441980251866,
 0.4399053806155735,
 0.43306302771093685,
 0.44886654167747586,
 0.4007204086507429,
 0.4722227450410171,
 0.35923642958961316,
 0.3633716916478892,
 0.40126177773996724,
 0.37663499088442176,
 0.36401475830098734,
 0.38712120079271817,
 0.39093350186253584,
 0.4186782807358294,
 0.447724528818573,
 0.4312228161738305,
 0.3465552872834859,
 0.39104986965625116,
 0.3416801285781792,
 0.4031125271095423,
 0.37382776743813634,
 0.4874404750589538,
 0.3606102636298511,
 0.3875307104547722,
 0.3313073088126768,
 0.4193157028932447,
 0.4113822401339551,
 0.38751171515359295,
 0.3778569230619695,
 0.39922644445777233,

In [143]:
clf.score(X_test,y_test)

array([0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.        , 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.        , 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.        , 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699301,
       0.00699301, 0.00699301, 0.00699301, 0.00699301, 0.00699