In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')

In [3]:
%%time
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1,3),
    max_features=10000)

tfidf.fit(pd.concat([train_df['text'], test_df['text']]))
train_word_features = tfidf.transform(train_df['text'])
test_word_features = tfidf.transform(test_df['text'])

Wall time: 19min 53s


In [4]:
X_train = train_word_features
y_train = train_df['label']
X_test = test_word_features

In [30]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
KF = KFold(n_splits=10, random_state=7)
clf = LinearSVC()
test_pred = np.zeros((X_test.shape[0], 1), int)  # 存储测试集预测结果 行数：len(X_test) ,列数：1列
for KF_index, (train_index,valid_index) in enumerate(KF.split(X_train)):
    print('第', KF_index+1, '折交叉验证开始...')
    # 训练集划分
    x_train_, x_valid_ = X_train[train_index], X_train[valid_index]
    y_train_, y_valid_ = y_train[train_index], y_train[valid_index]
    # 模型构建
    clf.fit(x_train_, y_train_)
    # 模型预测
    val_pred = clf.predict(x_valid_)
    print("LinearSVC准确率为：",f1_score(y_valid_, val_pred, average='macro'))
    # 保存测试集预测结果
    test_pred = np.column_stack((test_pred, clf.predict(X_test)))  # 将矩阵按列合并
# 取测试集中预测数量最多的数
preds = []
for i, test_list in enumerate(test_pred):
    preds.append(np.argmax(np.bincount(test_list)))
preds = np.array(preds)

第 1 折交叉验证开始...
LinearSVC准确率为： 0.9387828343529744
第 2 折交叉验证开始...
LinearSVC准确率为： 0.935905962059219
第 3 折交叉验证开始...
LinearSVC准确率为： 0.9437648412043526
第 4 折交叉验证开始...
LinearSVC准确率为： 0.9416286387242435
第 5 折交叉验证开始...
LinearSVC准确率为： 0.9391949963348126
第 6 折交叉验证开始...
LinearSVC准确率为： 0.9366023492800833
第 7 折交叉验证开始...
LinearSVC准确率为： 0.9406770075236407
第 8 折交叉验证开始...
LinearSVC准确率为： 0.935780601199901
第 9 折交叉验证开始...
LinearSVC准确率为： 0.9429779405504484
第 10 折交叉验证开始...
LinearSVC准确率为： 0.9429735075282978


In [32]:
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('../output/LinearSVC_submission2.csv', index=False)

https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html