## 加载训练集正负样本，用 SVM + TF-IDF 建模，预测测试集，F1 约为 86%。

### 1. 加载停用词

In [2]:
# Load the stop-word list (one word per line, surrounding whitespace removed).
# The context manager closes the file handle, which the previous bare
# open(...).readlines() call left open.
with open('stop_words.txt', 'r', encoding='utf-8') as stop_words_file:
    stop_words = [word.strip() for word in stop_words_file]

### 2. 加载训练集

In [69]:
'''
    Inspect the label distribution of the training set.

    Every non-blank line is expected to start with its label, followed by a
    tab. Lines labelled "0" and "1" are counted separately; every non-blank
    line counts towards the total.
'''
# NOTE(review): label "0" is reported as the positive class and "1" as the
# negative class -- confirm this matches the dataset's labelling convention.
positive_label = 0
negative_label = 0
total_label = 0
with open("./Total_Train_Data.txt", "r", encoding="utf-8") as data:
    for line in data:
        # Strip before testing for emptiness: a raw line always contains at
        # least its newline, so testing it unstripped never skips blank lines.
        line = line.strip()
        if not line:
            continue
        label = line.split("\t")[0]
        if label == "0":
            positive_label += 1
        elif label == "1":
            negative_label += 1
        total_label += 1

# Guard against an empty file so the ratios do not raise ZeroDivisionError.
positive_ratio = positive_label / total_label if total_label else 0.0
negative_ratio = negative_label / total_label if total_label else 0.0

print("总样本%d个" % total_label)
print("正样本%d个，占比%f" % (positive_label, positive_ratio))
print("负样本%d个，占比%f" % (negative_label, negative_ratio))

总样本99890个
正样本25965个，占比0.259936
负样本73925个，占比0.740064


In [75]:
'''
    Load the training set and tokenize every text with jieba.

    A recorded run took about half an hour when stop words were looked up
    in a list. Entries that do not split into exactly a label and a text
    are printed with their 1-based position and skipped; splitting the file
    content on newlines produced one such empty entry in the recorded run.
'''
import jieba
import time

start = time.time()

# Read with the same explicit encoding used when the file is inspected
# above, and close the handle as soon as the content is in memory.
with open('Total_Train_Data.txt', encoding='utf-8') as train_file:
    data = train_file.read()

# Set membership is O(1); testing every token against the stop-word list
# scanned the whole list for each token.
stop_word_set = set(stop_words)

labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    if len(content) != 2:
        # Report malformed entries instead of silently dropping them.
        print(i+1)
        print(content)
        continue
    labels.append(content[0])
    # Tokens are joined with "/" so later cells can split them again.
    texts.append("/".join(
        word for word in jieba.cut(content[1])
        if word != " " and word not in stop_word_set
    ))

end = time.time()
running_time = end-start

print(running_time)

99891
['']
1803.5141744613647


In [77]:
'''
    Build a DataFrame with the columns "text" and "label" from the
    tokenized training data, then encode the labels as integers.
'''

import pandas
from sklearn import preprocessing

# Column order (text, label) follows the dict's insertion order.
trainDF = pandas.DataFrame({'text': texts, 'label': labels})

train_x = trainDF['text']

# The fitted encoder stays bound to `encoder` for later use.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(trainDF['label'])

In [207]:
# Show the shapes of the training features and the encoded labels.
for split in (train_x, train_y):
    print(split.shape)

(99890,)
(99890,)


In [81]:
'''
    Count the total number of tokens and the number of distinct tokens in
    the tokenized training set (a recorded run printed 55619034 and 1047925).
'''
sums = 0
# Accumulate distinct tokens in a set instead of collecting every token in
# a list first; the printed results are the same with far less memory.
unique_tokens = set()
for doc in trainDF['text']:
    tokens = doc.split("/")
    sums += len(tokens)
    unique_tokens.update(tokens)
print(sums)
print(len(unique_tokens))

55619034
1047925


### 3. 加载测试集

In [208]:
'''
    Load the test set, tokenize it the same way as the training set, and
    encode its labels with the encoder fitted on the training labels.

    Each line is expected to hold a label and a text separated by a tab.
    Lines without both fields are printed with their 1-based line number
    and skipped, mirroring how the training loader reports bad entries.
'''

from sklearn import preprocessing

# Set membership is O(1), unlike scanning the stop-word list per token.
stop_word_set = set(stop_words)

test_texts = []
test_labels = []

# The context manager closes the file, which was previously left open.
with open('TouTiao_testdata1_format.txt', 'r', encoding='utf-8') as test_data:
    for line_no, line in enumerate(test_data, start=1):
        fields = line.strip().split("\t")
        if len(fields) < 2:
            # Previously a blank or tab-less line raised IndexError.
            print(line_no)
            print(fields)
            continue

        test_labels.append(fields[0])
        test_texts.append("/".join(
            word for word in jieba.cut(fields[1])
            if word != " " and word not in stop_word_set
        ))


# Build a DataFrame with the columns "text" and "label".
testDF = pandas.DataFrame()
testDF['text'] = test_texts
testDF['label'] = test_labels
test_x = testDF['text']

# Reuse the encoder fitted on the training labels so that both splits share
# one label -> integer mapping. Fitting a second encoder on the test labels
# only yields the same mapping when both splits contain exactly the same
# label values. transform() raises ValueError on a label not seen in training.
test_y = encoder.transform(testDF['label'])

In [209]:
# Show the shapes of the test features and the encoded labels.
for split in (test_x, test_y):
    print(split.shape)

(199,)
(199,)


### 4. SVM + tf-idf建模

#### 4.1 测试百度的500条数据

In [221]:
'''
    Train a linear SVM on TF-IDF features, persist the vectorizer and the
    model, and evaluate on whatever test_x / test_y currently hold.

    Vectorizer settings:
    - min_df=2: keep only terms that occur in at least two training documents;
    - ngram_range=(1, 2): use unigrams and bigrams;
    - max_features=1000000: keep the 1,000,000 terms with the highest term
      frequency across the corpus. Per the original author's note, leaving
      it unset yields over 5 million features, which takes much longer with
      almost no gain in accuracy.

    NOTE(review): the section heading mentions 500 Baidu samples, but the
    recorded output reports 199 samples, identical to section 4.2 -- confirm
    which test set is meant to be loaded before this cell runs.
'''
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import time

# sklearn.externals.joblib was deprecated and later removed from
# scikit-learn; prefer the standalone package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

start = time.time()

# The default token_pattern r'(?u)\b\w\w+\b' drops single-character tokens;
# pass token_pattern=r'(?u)\b\w+\b' to keep them.
tfidf_vect_ngram = TfidfVectorizer(min_df=2, max_df=1.0,
                                   ngram_range=(1, 2), max_features=1000000)

# fit_transform learns the vocabulary and builds the training matrix in one
# pass instead of a fit followed by a separate transform of the same corpus.
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)

joblib.dump(tfidf_vect_ngram, 'tfidf_vect_ngram.pkl')

xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

print("特征维度：%s" % xtrain_tfidf_ngram.shape[1])

svm = LinearSVC()

svm.fit(xtrain_tfidf_ngram, train_y)

joblib.dump(svm, 'svm.pkl')

y_prediction = svm.predict(xtest_tfidf_ngram)

print(classification_report(y_true=test_y, y_pred=y_prediction))

end = time.time()
print("历时：", (end-start))

特征维度：1000000
             precision    recall  f1-score   support

          0       0.81      0.64      0.72        53
          1       0.88      0.95      0.91       146

avg / total       0.86      0.86      0.86       199

历时： 534.3831670284271


#### 4.2 测试今日头条的199条数据

In [210]:
'''
    Train a linear SVM on TF-IDF features and evaluate it on the loaded
    test set (test_x / test_y).

    Vectorizer settings:
    - min_df=2: keep only terms that occur in at least two training documents;
    - ngram_range=(1, 2): use unigrams and bigrams;
    - max_features=1000000: keep the 1,000,000 terms with the highest term
      frequency across the corpus. Per the original author's note, leaving
      it unset yields over 5 million features, which takes much longer with
      almost no gain in accuracy.
'''
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import time

start = time.time()

# The default token_pattern r'(?u)\b\w\w+\b' drops single-character tokens;
# pass token_pattern=r'(?u)\b\w+\b' to keep them.
tfidf_vect_ngram = TfidfVectorizer(min_df=2, max_df=1.0,
                                   ngram_range=(1, 2), max_features=1000000)

# fit_transform learns the vocabulary and builds the training matrix in one
# pass instead of a fit followed by a separate transform of the same corpus.
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)

# vocabulary_ holds one entry per feature, so its size equals the number of
# feature names. get_feature_names() was removed from newer scikit-learn
# releases, while vocabulary_ is available in old and new versions alike.
print("get_feature_names()的长度：", len(tfidf_vect_ngram.vocabulary_))

xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

print("特征维度：%s" % xtrain_tfidf_ngram.shape[1])

svm = LinearSVC()

svm.fit(xtrain_tfidf_ngram, train_y)

y_prediction = svm.predict(xtest_tfidf_ngram)

print(classification_report(y_true=test_y, y_pred=y_prediction))

end = time.time()
print("历时：", (end-start))

get_feature_names()的长度： 1000000
特征维度：1000000
             precision    recall  f1-score   support

          0       0.81      0.64      0.72        53
          1       0.88      0.95      0.91       146

avg / total       0.86      0.86      0.86       199

历时： 361.1466934680939


In [5]:
'''
    Use part-of-speech tags as features.
'''
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

# Three sample sentences used to demonstrate part-of-speech features.
tests = ['查了很多资料还是不知道怎么实现',
'结巴分词跟中科院分词都能提取出词性',
'可是不知道怎么跟sklearn结合起来用']

def tag_trans(s):
    """Return the part-of-speech flags of *s* as one space-separated string.

    The sentence is segmented with ``pseg.cut``; only the ``flag`` of each
    resulting item is kept, in segmentation order.
    """
    flags = []
    for token in pseg.cut(s):
        flags.append(token.flag)
    return ' '.join(flags)

# Convert every sample sentence into its string of part-of-speech flags.
tag_texts = list(map(tag_trans, tests))

print(tag_texts)


def _split_on_space(doc):
    """Split a tag string on single spaces, one token per flag."""
    return str(doc).split(' ')


# A custom tokenizer keeps every flag, including single-letter ones.
cvec = TfidfVectorizer(tokenizer=_split_on_space)
tag_matrix = cvec.fit_transform(tag_texts)
tag_vec = tag_matrix.toarray()
print(cvec.vocabulary_)
print(tag_vec)

['v ul m n c d v r v', 'n n p nt n d v v v n', 'c d v r p eng v v p']
{'v': 9, 'ul': 8, 'm': 3, 'n': 4, 'c': 0, 'd': 1, 'r': 7, 'p': 6, 'nt': 5, 'eng': 2}
[[0.28297035 0.21975172 0.         0.37207201 0.28297035 0.
  0.         0.28297035 0.37207201 0.65925516]
 [0.         0.15606936 0.         0.         0.80387084 0.26424839
  0.20096771 0.         0.         0.46820807]
 [0.26958373 0.20935581 0.3544702  0.         0.         0.
  0.53916745 0.26958373 0.         0.62806744]]
