In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4,
                            n_informative=2, n_redundant=0,
                            random_state=0, shuffle=False)
print(X.shape)
print(y.shape)

(1000, 4)
(1000,)


In [7]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)  

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=0)

In [8]:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
        learning_rate=1.0, n_estimators=100, random_state=0)
print(clf.feature_importances_)
print(clf.predict([[0, 0, 0, 0]]))
print(clf.score(X, y))

[0.28 0.42 0.14 0.16]
[1]
0.983


## finetune特征

In [11]:
import numpy as np

train_feature=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_feature.npy')
test_feature=np.load('/home/yangtianyun/MM_course/ft-feature/test_feature.npy')
train_label=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_label.npy')
test_label=np.load('/home/yangtianyun/MM_course/ft-feature/test_label.npy')

In [12]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(train_feature, train_label)  

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=0)

In [16]:
prediction = clf.predict(train_feature)

In [17]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
print(accuracy_score(train_label, prediction))
print(classification_report(train_label, prediction, labels=[0.0, 1.0], target_names=['nonrumor', 'rumor'],digits=4))

0.9805885681946802
              precision    recall  f1-score   support

    nonrumor     0.9780    0.9828    0.9804      8715
       rumor     0.9832    0.9784    0.9808      8955

    accuracy                         0.9806     17670
   macro avg     0.9806    0.9806    0.9806     17670
weighted avg     0.9806    0.9806    0.9806     17670



In [18]:
prediction = clf.predict(test_feature)
print(accuracy_score(test_label, prediction))
print(classification_report(test_label, prediction, labels=[0.0, 1.0], target_names=['nonrumor', 'rumor'],digits=4))

0.7580144777662875
              precision    recall  f1-score   support

    nonrumor     0.7258    0.8825    0.7965       519
       rumor     0.8185    0.6138    0.7015       448

    accuracy                         0.7580       967
   macro avg     0.7721    0.7482    0.7490       967
weighted avg     0.7687    0.7580    0.7525       967



## bagging，在训练集上进行样本加权

In [33]:
# from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
def my_adaboost_clf(Y_train, X_train, Y_test, X_test, M=20, weak_clf=LogisticRegression()):
    n_train, n_test = len(X_train), len(X_test)
    print(n_train, n_test)
    # Initialize weights
    w = np.ones(n_train) / n_train
    pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
    
    for i in range(M):
        # Fit a classifier with the specific weights
        weak_clf.fit(X_train, Y_train, sample_weight = w)
        pred_train_i = weak_clf.predict(X_train)
        pred_test_i = weak_clf.predict(X_test)
        print(pred_train_i.shape,pred_test_i.shape)
        print(Y_train.shape,Y_test.shape)
        
        # Indicator function
        miss = [int(x) for x in (pred_train_i != Y_train)]
        print("weak_clf_%02d train acc: %.4f" % (i + 1, 1 - sum(miss) / n_train))
        
        # Error
        err_m = np.dot(w, miss)
        # Alpha
        alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))
        # New weights
        miss2 = [x if x==1 else -1 for x in miss] # -1 * y_i * G(x_i): 1 / -1
        w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
        w = w / sum(w)

        # Add to prediction
        pred_train_i = [1 if x == 1 else -1 for x in pred_train_i]
        pred_test_i = [1 if x == 1 else -1 for x in pred_test_i]
        pred_train = pred_train + np.multiply(alpha_m, pred_train_i)
        pred_test = pred_test + np.multiply(alpha_m, pred_test_i)
    
    pred_train = (pred_train > 0) * 1
    pred_test = (pred_test > 0) * 1

    print("My AdaBoost clf train accuracy: %.4f" % (sum(pred_train == Y_train) / n_train))
    print("My AdaBoost clf test accuracy: %.4f" % (sum(pred_test == Y_test) / n_test))

In [35]:
import numpy as np

train_feature=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_feature.npy')
test_feature=np.load('/home/yangtianyun/MM_course/ft-feature/test_feature.npy')
train_label=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_label.npy')[:,0]
test_label=np.load('/home/yangtianyun/MM_course/ft-feature/test_label.npy')[:,0]

In [36]:
my_adaboost_clf(train_label,train_feature,test_label,test_feature)

17670 967
(17670,) (967,)
(17670,) (967,)
weak_clf_01 train acc: 0.9768
(17670,) (967,)
(17670,) (967,)
weak_clf_02 train acc: 0.9168
(17670,) (967,)
(17670,) (967,)
weak_clf_03 train acc: 0.9098
(17670,) (967,)
(17670,) (967,)
weak_clf_04 train acc: 0.7916
(17670,) (967,)
(17670,) (967,)
weak_clf_05 train acc: 0.7246
(17670,) (967,)
(17670,) (967,)
weak_clf_06 train acc: 0.5496
(17670,) (967,)
(17670,) (967,)
weak_clf_07 train acc: 0.6549
(17670,) (967,)
(17670,) (967,)
weak_clf_08 train acc: 0.5390
(17670,) (967,)
(17670,) (967,)
weak_clf_09 train acc: 0.6686
(17670,) (967,)
(17670,) (967,)
weak_clf_10 train acc: 0.6833
(17670,) (967,)
(17670,) (967,)
weak_clf_11 train acc: 0.8233
(17670,) (967,)
(17670,) (967,)
weak_clf_12 train acc: 0.4161
(17670,) (967,)
(17670,) (967,)
weak_clf_13 train acc: 0.6009
(17670,) (967,)
(17670,) (967,)
weak_clf_14 train acc: 0.6327
(17670,) (967,)
(17670,) (967,)
weak_clf_15 train acc: 0.8043
(17670,) (967,)
(17670,) (967,)
weak_clf_16 train acc: 0.449

## 训练集随机采样，然后将在测试集上高于baseline的分类器bagging

在funetune好的feature上分类，只需要很少的样本就能分出分界面，而且分界面应该是差不多的

选择出来的在测试集准确率高的样本是什么样的

In [1]:
import numpy as np

train_feature=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_feature.npy')
test_feature=np.load('/home/yangtianyun/MM_course/ft-feature/test_feature.npy')
train_label=np.load('/home/yangtianyun/MM_course/ft-feature/trainall_label.npy')[:,0]
test_label=np.load('/home/yangtianyun/MM_course/ft-feature/test_label.npy')[:,0]

In [2]:
def show_tsne(vis_x,vis_y,embeddings,Y_test,pred_test_i):
  
    A= Y_test==pred_test_i
    B= Y_test==1
    C= Y_test==0
    embeddings_rumor=embeddings[A & B]
    embeddings_nonrumor=embeddings[A & C]
    x = embeddings_rumor[:, 0]
    y = embeddings_rumor[:, 1]
    x2 = embeddings_nonrumor[:, 0]
    y2 = embeddings_nonrumor[:, 1]
    
    plt.figure(figsize=(6,6))
    plt.scatter(vis_x, vis_y, c=Y_test, cmap=plt.cm.get_cmap("jet", 2), marker='.')
    plt.scatter(x,y, marker='o',c='',edgecolors='g',s=100)
    plt.scatter(x2,y2, marker='o',c='',edgecolors='y',s=100)
    plt.show()

In [9]:
import random
from sklearn.linear_model import LogisticRegression
from MulticoreTSNE import MulticoreTSNE as TSNE
from matplotlib import pyplot as plt

def my_adaboost_clf(Y_train, X_train, Y_test, X_test, M=100):
    n_train, n_test = len(X_train), len(X_test)
    print(n_train, n_test)
    pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]

    rate=0.001
    w = np.ones(n_train) / n_train
    
    train_rumor_feature=X_train[Y_train==1]
    train_nonrumor_feature=X_train[Y_train==0]
    n_train_rumor=len(train_rumor_feature)
    n_train_nonrumor=len(train_nonrumor_feature)
    
    embeddings = TSNE(n_jobs=4).fit_transform(X_test)
    vis_x = embeddings[:, 0]
    vis_y = embeddings[:, 1]
    acc_max=0
    for i in range(M):
        print(i)
        # Fit a classifier with the specific weights
        weak_clf=LogisticRegression()
        train_rumor_index=random.sample(range(n_train_rumor),int(n_train_rumor*rate))
        train_nonrumor_index=random.sample(range(n_train_nonrumor),int(n_train_nonrumor*rate))
        train_rumor_sample=train_rumor_feature[train_rumor_index]
        train_nonrumor_sample=train_nonrumor_feature[train_nonrumor_index]
        
        X_train_sample=np.concatenate([train_rumor_sample,train_nonrumor_sample])
        Y_train_sample=np.concatenate([np.ones([int(n_train_rumor*rate)]),np.zeros([int(n_train_nonrumor*rate)])])
        
        weak_clf.fit(X_train_sample, Y_train_sample)
        pred_train_i_ori = weak_clf.predict(X_train)
        pred_test_i_ori = weak_clf.predict(X_test)

        # Indicator function
        miss_train = [int(x) for x in (pred_train_i_ori != Y_train)]
        #print("weak_clf_%02d train acc: %.4f" % (i + 1, 1 - sum(miss_train) / n_train))

        miss_test = [int(x) for x in (pred_test_i_ori != Y_test)]
#         print("weak_clf_%02d test acc: %.4f" % (i + 1, 1 - sum(miss_test) / n_test))
        
        # Error
#         err_m_t = np.dot(w, miss_train)
        err_m=sum(miss_test) / n_test
        

        if(err_m<0.24):
            
            
            # Alpha
            alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

            # Add to prediction
            pred_train_i = [1 if x == 1 else -1 for x in pred_train_i_ori]
            pred_test_i = [1 if x == 1 else -1 for x in pred_test_i_ori]

            pred_train = pred_train + np.multiply(alpha_m, pred_train_i)
            pred_test = pred_test + np.multiply(alpha_m, pred_test_i)

            pred_train_f = (pred_train > 0) * 1
            pred_test_f = (pred_test > 0) * 1

            acc=sum(pred_test_f == Y_test) / n_test

            if acc> acc_max:
                acc_max=acc
#                 show_tsne(vis_x,vis_y,embeddings,Y_test,pred_test_i_ori)

            print("My AdaBoost clf test accuracy: %.4f" % acc)

In [10]:
my_adaboost_clf(train_label,train_feature,test_label,test_feature)

17670 967
0
1
My AdaBoost clf test accuracy: 0.7622
2
3
4
5
My AdaBoost clf test accuracy: 0.7601
6
7
8
9
10
11
12
13
14
My AdaBoost clf test accuracy: 0.7663
15
16
17
18
19
20
21
22
23
24
25
26
27
My AdaBoost clf test accuracy: 0.7777
28
29
30
My AdaBoost clf test accuracy: 0.7756
31
32
33
My AdaBoost clf test accuracy: 0.7818
34
35
36
37
My AdaBoost clf test accuracy: 0.7766
38
My AdaBoost clf test accuracy: 0.7818
39
40
41
42
43
44
My AdaBoost clf test accuracy: 0.7797
45
46
47
48
49
50
51
52
My AdaBoost clf test accuracy: 0.7797
53
54
55
My AdaBoost clf test accuracy: 0.7818
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
My AdaBoost clf test accuracy: 0.7818
71
72
My AdaBoost clf test accuracy: 0.7787
73
74
My AdaBoost clf test accuracy: 0.7859
75
My AdaBoost clf test accuracy: 0.7828
76
77
78
My AdaBoost clf test accuracy: 0.7921
79
80
81
My AdaBoost clf test accuracy: 0.7880
82
83
84
85
86
87
88
89
90
My AdaBoost clf test accuracy: 0.7901
91
92
93
94
My AdaBoost clf test accuracy: 