# 数据加载

In [1]:
import numpy as np

In [2]:
con_se = np.load('./data/content_se_(4663, 55).npy')
com_se = np.load('./data/pure_comments_100_se_(4663, 110).npy')
label = np.load('./data/labels_(4663,).npy')

con_se.shape, com_se.shape, label.shape

((4663, 55), (4663, 110), (4663,))

In [3]:
ratio = 0.8

rumor_sz = 2312
truth_sz = 2351
rumor_sz + truth_sz


def split_dataset(arr):
    assert len(arr) == rumor_sz + truth_sz

    train_pos_sz = int(ratio * rumor_sz)
    train_neg_sz = int(ratio * truth_sz)

    train_pos_arr = arr[:train_pos_sz]
    test_pos_arr = arr[train_pos_sz:rumor_sz]
    train_neg_arr = arr[rumor_sz:(rumor_sz + train_neg_sz)]
    test_neg_arr = arr[(rumor_sz + train_neg_sz):]

    train_arr = np.concatenate([train_pos_arr, train_neg_arr], axis=0)
    test_arr = np.concatenate([test_pos_arr, test_neg_arr], axis=0)

    print(train_arr.shape, test_arr.shape)

    return train_arr, test_arr

In [4]:
con_se.shape, com_se.shape

((4663, 55), (4663, 110))

In [5]:
se = np.concatenate([con_se, com_se], axis=1)
se.shape

(4663, 165)

# RandomForest

In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *

In [7]:
def train(se, label, seed=0):    
    X_train, X_test = split_dataset(se)
    y_train, y_test = split_dataset(label)
    
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_label = y_test
    
    auc = roc_auc_score(test_label, y_pred)

    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0

    accuracy = accuracy_score(test_label, y_pred)
    eval_dict = classification_report(test_label, y_pred, labels=[0, 1],
                                      target_names=['truth', 'rumor'], output_dict=True)

    print()
    print('TEST_sz:', len(test_label))
    print('test: {}+, {}-'.format(int(sum(test_label)), int(len(test_label) - sum(test_label))))
    print()
    print('Accuracy: {}'.format(accuracy))
    print('AUC: {}'.format(auc))
    print('Confusion Matrix:\n {}'.format(confusion_matrix(test_label, y_pred)))
    print()
    print(classification_report(test_label, y_pred, labels=[0, 1],
                                target_names=['truth', 'rumor'], digits=3))
    print()

In [8]:
train(con_se, label, seed=6)

(3729, 55) (934, 55)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.6755888650963597
AUC: 0.6748221925685436
Confusion Matrix:
 [[360 111]
 [192 271]]

              precision    recall  f1-score   support

       truth      0.652     0.764     0.704       471
       rumor      0.709     0.585     0.641       463

    accuracy                          0.676       934
   macro avg      0.681     0.675     0.673       934
weighted avg      0.681     0.676     0.673       934






In [9]:
train(com_se, label, seed=6)

(3729, 110) (934, 110)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.7676659528907923
AUC: 0.767713105244574
Confusion Matrix:
 [[359 112]
 [105 358]]

              precision    recall  f1-score   support

       truth      0.774     0.762     0.768       471
       rumor      0.762     0.773     0.767       463

    accuracy                          0.768       934
   macro avg      0.768     0.768     0.768       934
weighted avg      0.768     0.768     0.768       934






In [10]:
train(se, label, seed=0)

(3729, 165) (934, 165)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.7762312633832976
AUC: 0.776150646801759
Confusion Matrix:
 [[370 101]
 [108 355]]

              precision    recall  f1-score   support

       truth      0.774     0.786     0.780       471
       rumor      0.779     0.767     0.773       463

    accuracy                          0.776       934
   macro avg      0.776     0.776     0.776       934
weighted avg      0.776     0.776     0.776       934






In [11]:
train(com_se[:, :55], label, seed=6)

(3729, 55) (934, 55)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.7387580299785867
AUC: 0.7388305750826558
Confusion Matrix:
 [[344 127]
 [117 346]]

              precision    recall  f1-score   support

       truth      0.746     0.730     0.738       471
       rumor      0.732     0.747     0.739       463

    accuracy                          0.739       934
   macro avg      0.739     0.739     0.739       934
weighted avg      0.739     0.739     0.739       934






In [12]:
train(com_se[:, 55:], label, seed=6)

(3729, 55) (934, 55)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.7237687366167024
AUC: 0.7237484695491877
Confusion Matrix:
 [[342 129]
 [129 334]]

              precision    recall  f1-score   support

       truth      0.726     0.726     0.726       471
       rumor      0.721     0.721     0.721       463

    accuracy                          0.724       934
   macro avg      0.724     0.724     0.724       934
weighted avg      0.724     0.724     0.724       934






# DecisionTree

In [28]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *

In [29]:
def train(se, label, seed=0):    
    X_train, X_test = split_dataset(se)
    y_train, y_test = split_dataset(label)
    
    dt = DecisionTreeClassifier(random_state=seed)
    dt.fit(X_train, y_train)

    y_pred = dt.predict(X_test)
    test_label = y_test
    
    auc = roc_auc_score(test_label, y_pred)

    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0

    accuracy = accuracy_score(test_label, y_pred)
    eval_dict = classification_report(test_label, y_pred, labels=[0, 1],
                                      target_names=['truth', 'rumor'], output_dict=True)

    print()
    print('TEST_sz:', len(test_label))
    print('test: {}+, {}-'.format(int(sum(test_label)), int(len(test_label) - sum(test_label))))
    print()
    print('Accuracy: {}'.format(accuracy))
    print('AUC: {}'.format(auc))
    print('Confusion Matrix:\n {}'.format(confusion_matrix(test_label, y_pred)))
    print()
    print(classification_report(test_label, y_pred, labels=[0, 1],
                                target_names=['truth', 'rumor'], digits=3))
    print()
    
    return dt

In [30]:
con_dt = train(con_se, label, seed=4)

(3729, 55) (934, 55)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.6156316916488223
AUC: 0.6158327715948329
Confusion Matrix:
 [[279 192]
 [167 296]]

              precision    recall  f1-score   support

       truth      0.626     0.592     0.609       471
       rumor      0.607     0.639     0.623       463

    accuracy                          0.616       934
   macro avg      0.616     0.616     0.616       934
weighted avg      0.616     0.616     0.615       934




In [31]:
com_dt = train(com_se, label, seed=4)

(3729, 110) (934, 110)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.721627408993576
AUC: 0.7218270945967635
Confusion Matrix:
 [[329 142]
 [118 345]]

              precision    recall  f1-score   support

       truth      0.736     0.699     0.717       471
       rumor      0.708     0.745     0.726       463

    accuracy                          0.722       934
   macro avg      0.722     0.722     0.722       934
weighted avg      0.722     0.722     0.722       934




In [32]:
dt = train(se, label, seed=0)

(3729, 165) (934, 165)
(3729,) (934,)

TEST_sz: 934
test: 463+, 471-

Accuracy: 0.7312633832976445
AUC: 0.7314179196874442
Confusion Matrix:
 [[336 135]
 [116 347]]

              precision    recall  f1-score   support

       truth      0.743     0.713     0.728       471
       rumor      0.720     0.749     0.734       463

    accuracy                          0.731       934
   macro avg      0.732     0.731     0.731       934
weighted avg      0.732     0.731     0.731       934




In [31]:
np.argsort(com_dt.feature_importances_)

array([109,  86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,
        74,  73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  58,  59,
        60,  61,  62,  30,  31,  28,   3,   4,   5,   6,   7,   8,   9,
        10,  11,  12,  13,  29,  15,  14,  17,  27,  26,  25,  16,  23,
        24,  21,  20,  19,  18,  22,  63,  49,  92,  91, 104,  90,  54,
        56,  47, 101,  53, 103,  45,  42,  88,  35,  95,  94,  93, 100,
        99, 108,  40,  57, 102,  97,  55,   0, 106,  46,  52,  48,  89,
        98,  44,  51,  36,  32,  96, 107,   2,  37,  87,   1,  50,  43,
        33,  34, 105,  38,  41,  39])

In [33]:
com_dt.feature_importances_

array([ 0.00944024,  0.01989879,  0.0171242 ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0134755 ,  0.03236474,  0.04305343,
        0.0065292 ,  0.01289358,  0.01765296,  0.06642155,  0.31316515,
        0.00775281,  0.09453723,  0.00562387,  0.02093466,  0.01134487,
        0.00555912,  0.00987533,  0.00409249,  0.01038419,  0.00080456,
        0.02015801,  0.01237692,  0.00997814,  0.00494156,  0.0030341 ,
        0.00941454,  0.00398543,  0.00799169,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  