# Raw Data Statistics

In [1]:
import pandas as pd

In [2]:
# Load the raw course-project dataset.
# The first CSV column is a saved row index with no header; read as data it
# shows up as a junk "Unnamed: 0" column, so use it as the DataFrame index.
df = pd.read_csv('./course_project_data.csv', index_col=0)
# Number of rows in the raw data.
len(df)

33806

In [3]:
df.head()

Unnamed: 0,id,content,picture_lists,category,fake_label
0,4028762460708675,回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：,,文体娱乐,0
1,4126670854660291,//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....,63ad082a189566eed7c4bb3e4bc55012.jpg,社会生活,0
2,3702441948299962,西宁城管围殴民警扬言要把警察打死|西宁城管围...,4986dc2a5f09a87c7af5dfc57d7775cd.jpg,社会生活,0
3,4239549419245466,【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...,dcfccfc69e90a0007afd6aafa1385e56.jpg,社会生活,0
4,3966337217183260,支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】,,社会生活,0


In [4]:
df['category'].value_counts()

社会生活    20616
医药健康     6320
文体娱乐     2620
财经商业     1455
政治       1241
教育考试      901
军事        372
科技        281
Name: category, dtype: int64

# Imbalanced 8-category dataset

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

# Random seed passed to train_test_split so the split is reproducible.
SEED = 0
# Fraction of each group that is held out as the test set.
TEST_RATIO = 0.1

In [2]:
# Load the pre-computed feature arrays (`*_we`) and their one-hot category
# labels. Each file name records the shape of the array stored in it.
# NOTE(review): the "train" files appear to hold the six main categories and
# the "test" files the other two — confirm against the upstream export.
main6_we, main6_label, other2_we, other2_label = (
    np.load(path)
    for path in (
        '../Semantics/data/train_we_(32193, 120).npy',
        './data/train_category_label_(32193, 8).npy',
        '../Semantics/data/test_we_(1613, 120).npy',
        './data/test_category_label_(1613, 8).npy',
    )
)

# Display all four shapes as a quick sanity check.
tuple(arr.shape for arr in (main6_we, main6_label, other2_we, other2_label))

((32193, 120), (32193, 8), (1613, 120), (1613, 8))

In [3]:
def split_stratified(X, y):
    """Split ``X`` and ``y`` into train and test parts, stratified on ``y``.

    The held-out fraction is the notebook-level ``TEST_RATIO`` and the
    shuffle is seeded with ``SEED``.

    Returns:
        The ``train_test_split`` result for ``(X, y)``, i.e.
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = dict(test_size=TEST_RATIO, stratify=y, random_state=SEED)
    return train_test_split(X, y, **split_options)

In [6]:
# Split the two label groups separately, each stratified on its own labels.
(X_train_main6, X_test_main6,
 y_train_main6, y_test_main6) = split_stratified(X=main6_we, y=main6_label)
(X_train_other2, X_test_other2,
 y_train_other2, y_test_other2) = split_stratified(X=other2_we, y=other2_label)

In [7]:
# Stack the per-group splits row-wise: main6 rows first, then other2 rows.
# The combined arrays are not re-shuffled here.
X_train = np.concatenate((X_train_main6, X_train_other2), axis=0)
y_train = np.concatenate((y_train_main6, y_train_other2), axis=0)
X_test = np.concatenate((X_test_main6, X_test_other2), axis=0)
y_test = np.concatenate((y_test_main6, y_test_other2), axis=0)

# Show the four resulting shapes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((30424, 120), (3382, 120), (30424, 8), (3382, 8))

In [8]:
def check_label(y):
    """Print how many rows of a one-hot label matrix fall into each class.

    Args:
        y: 2-D array with one row per sample and one column per class; the
            class of a row is the index of its largest entry.

    Prints one ``<class>_sz = <count>`` line per column of ``y``. The number
    of classes is read from ``y.shape[1]`` instead of being fixed at 8, so
    the helper also works for label matrices with a different class count.
    """
    class_ids = y.argmax(axis=1)
    for i in range(y.shape[1]):
        print('{}_sz = {}'.format(i, int((class_ids == i).sum())))

In [9]:
check_label(y_train)

0_sz = 5688
1_sz = 811
2_sz = 2358
3_sz = 18554
4_sz = 253
5_sz = 1309
6_sz = 335
7_sz = 1116


In [11]:
check_label(y_test)

0_sz = 632
1_sz = 90
2_sz = 262
3_sz = 2062
4_sz = 28
5_sz = 146
6_sz = 37
7_sz = 125


In [12]:
# Reference: category counts in the raw data (df['category'].value_counts()).
# 社会生活    20616
# 医药健康     6320
# 文体娱乐     2620
# 财经商业     1455
# 政治       1241
# 教育考试      901
# 军事        372
# 科技        281

In [None]:
# Planned composition of the balanced 3-category set:
# 政治 (politics)    400
# 军事 (military)    372
# 其他 (other)       400

In [14]:
# Persist the four arrays; each file name embeds the shape of its array.
for path_template, array in (
    ('./data/category_imbalanced8_X_train_{}.npy', X_train),
    ('./data/category_imbalanced8_X_test_{}.npy', X_test),
    ('./data/category_imbalanced8_y_train_{}.npy', y_train),
    ('./data/category_imbalanced8_y_test_{}.npy', y_test),
):
    np.save(path_template.format(array.shape), array)

# Balanced 3-category dataset: 军事 / 政治 / 其他 (military / politics / other)

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

# Random seed passed to train_test_split so every split below is reproducible.
SEED = 0

In [2]:
# Feature arrays (`*_we`); each file name records the stored array's shape.
main6_we = np.load('../Semantics/data/train_we_(32193, 120).npy')
other2_we = np.load('../Semantics/data/test_we_(1613, 120).npy')
# One-hot category labels (8 columns) matching the feature arrays row by row.
main6_label = np.load('./data/train_category_label_(32193, 8).npy')
other2_label = np.load('./data/test_category_label_(1613, 8).npy')

# Display all four shapes as a quick sanity check.
tuple(np.shape(arr) for arr in (main6_we, main6_label, other2_we, other2_label))

((32193, 120), (32193, 8), (1613, 120), (1613, 8))

In [31]:
def check_label(y):
    """Print the per-class row count of a one-hot label matrix.

    Args:
        y: 2-D array with one row per sample and one column per class; the
            class of a row is the index of its largest entry.

    Prints one ``<class>_sz = <count>`` line for every column of ``y``.
    """
    class_ids = y.argmax(axis=1)
    n_classes = y.shape[1]
    for class_id in range(n_classes):
        n_rows = int((class_ids == class_id).sum())
        print('{}_sz = {}'.format(class_id, n_rows))

In [4]:
check_label(main6_label)

0_sz = 6320
1_sz = 901
2_sz = 2620
3_sz = 20616
4_sz = 281
5_sz = 1455
6_sz = 0
7_sz = 0


In [5]:
check_label(other2_label)

0_sz = 0
1_sz = 0
2_sz = 0
3_sz = 0
4_sz = 0
5_sz = 0
6_sz = 372
7_sz = 1241


In [6]:
def split_stratified(X, y, ratio):
    """Split ``X`` and ``y`` into train and test parts, stratified on ``y``.

    Args:
        X: samples to split.
        y: labels for ``X``; also used as the stratification key.
        ratio: forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        The ``train_test_split`` result for ``(X, y)``, i.e.
        ``X_train, X_test, y_train, y_test``. The shuffle is seeded with the
        notebook-level ``SEED``.
    """
    split_options = dict(test_size=ratio, stratify=y, random_state=SEED)
    return train_test_split(X, y, **split_options)

## 其他6类中按比例抽取400条

In [7]:
# Draw 400 samples from the six main categories, stratified by category.
# The count is passed as an integer: train_test_split treats an int test size
# as an absolute number of samples, so the result no longer depends on the
# float round-trip 400 / len(main6_label) * len(main6_label) being exactly 400.
# Only the "test" side of the split is kept; the rest is discarded.
_, main6_400, _, main6_400_label = split_stratified(main6_we, main6_label, ratio=400)

In [8]:
main6_400.shape, main6_400_label.shape

((400, 120), (400, 8))

In [9]:
check_label(main6_400_label)

0_sz = 79
1_sz = 11
2_sz = 33
3_sz = 256
4_sz = 3
5_sz = 18
6_sz = 0
7_sz = 0


## 在政治类中抽取400条

In [13]:
other2_label[:10]

array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [15]:
other2_label[2].argmax()

7

In [20]:
# Row indices of other2_label whose one-hot label is class 7 (the class this
# section treats as politics).
politics = [row for row, onehot in enumerate(other2_label) if onehot.argmax() == 7]

# Sanity check: number of selected rows.
other2_label[politics].shape

(1241, 8)

In [19]:
# Keep the first 400 politics rows. This is a slice in file order, not a
# random draw.
first_400 = politics[:400]
politics_400 = other2_we[first_400]
politics_400_label = other2_label[first_400]

politics_400.shape, politics_400_label.shape

((400, 120), (400, 8))

## 军事

In [21]:
# Row indices of other2_label whose one-hot label is class 6 (the class this
# section treats as military).
military = [row for row, onehot in enumerate(other2_label) if onehot.argmax() == 6]

# Sanity check: number of selected rows.
other2_label[military].shape

(372, 8)

In [22]:
# Take every military row (no sub-sampling), features and labels alike.
military_all, military_all_label = (
    arr[military] for arr in (other2_we, other2_label)
)

military_all.shape, military_all_label.shape

((372, 120), (372, 8))

## 拼接&导出

In [23]:
main6_400.shape, politics_400.shape, military_all.shape

((400, 120), (400, 120), (372, 120))

In [24]:
# Stack the three groups row-wise: "other" sample, politics, then military.
balanced3 = np.concatenate((main6_400, politics_400, military_all), axis=0)
balanced3.shape

(1172, 120)

In [27]:
# Stack the original 8-column one-hot labels in the same group order as the
# features: "other" sample, politics, then military.
balanced3_label_raw = np.concatenate(
    (main6_400_label, politics_400_label, military_all_label),
    axis=0,
)
balanced3_label_raw.shape

(1172, 8)

In [28]:
balanced3_label_raw[:5]

array([[0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.]])

In [38]:
# Collapse the 8-class one-hot labels into 3-class one-hot labels:
#   column 0 = military (raw class 6)
#   column 1 = politics (raw class 7)
#   column 2 = everything else
balanced3_label = np.zeros((len(balanced3_label_raw), 3))

# Raw classes that get their own column; any other class goes to column 2.
column_for_raw_class = {6: 0, 7: 1}
for new_row, raw_onehot in zip(balanced3_label, balanced3_label_raw):
    # new_row is a view into balanced3_label, so this writes in place.
    new_row[column_for_raw_class.get(int(raw_onehot.argmax()), 2)] = 1

balanced3_label.shape

(1172, 3)

In [39]:
check_label(balanced3_label)

0_sz = 372
1_sz = 400
2_sz = 400


In [40]:
balanced3.shape, balanced3_label.shape

((1172, 120), (1172, 3))

In [47]:
# balanced3_label[-380:-370]
# balanced3_label_raw[-380:-370]

In [49]:
# Hold out 10% of the balanced set as the test split, stratified on the
# 3-class labels.
(X_train, X_test,
 y_train, y_test) = split_stratified(X=balanced3, y=balanced3_label, ratio=0.1)

In [50]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1054, 120), (118, 120), (1054, 3), (118, 3))

In [51]:
# Persist the four arrays; each file name embeds the shape of its array.
for path_template, array in (
    ('./data/category_balanced3_X_train_{}.npy', X_train),
    ('./data/category_balanced3_X_test_{}.npy', X_test),
    ('./data/category_balanced3_y_train_{}.npy', y_train),
    ('./data/category_balanced3_y_test_{}.npy', y_test),
):
    np.save(path_template.format(array.shape), array)