### 加载数据，包括文本和label，文本去停用词和分词之后，用不同的特征提取方式和不同的模型，测试'vect__max_df'，'vect__min_df'，'vect__ngram_range'这三个参数的最优值。

#### 1.  加载训练集

In [3]:
# Load the stop-word list (one word per line, UTF-8).
# The context manager closes the file as soon as the list is built.
with open('stop_words.txt', 'r', encoding='utf-8') as stop_words_file:
    stop_words = [word.strip() for word in stop_words_file]

In [4]:
import pandas
import jieba


# Load the training set and tokenize every text with jieba.
# Each non-blank line is expected to be "<label>\t<text>".
stop_word_set = set(stop_words)  # constant-time membership tests inside the token loop
with open('train_data.txt', encoding='utf-8') as train_file:
    data = train_file.read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    if not line.strip():
        # A trailing newline produces an empty final chunk; skip blank lines
        # instead of failing on the missing text field.
        continue
    content = line.split("\t")
    if len(content) < 2:
        # Fail before touching labels/texts so the two lists stay aligned.
        raise ValueError("train_data.txt line %d has no tab-separated text field" % (i + 1))
    labels.append(content[0])
    text = content[1]
    # Keep every token that is neither a bare space nor a stop word.
    lists = [word for word in jieba.cut(text)
             if word != " " and word not in stop_word_set]
    texts.append("/".join(lists))

In [5]:
import pandas
import jieba


# Load the extra 36kr samples (described in the following cell as the remaining
# 36kr positive samples) and tokenize every text with jieba.
# Each non-blank line is expected to be "<label>\t<text>".
stop_word_set = set(stop_words)  # constant-time membership tests inside the token loop
with open('36kr_format3.txt', encoding='utf-8') as extra_file:
    data = extra_file.read()
labels1, texts1 = [], []
for i, line in enumerate(data.split("\n")):
    if not line.strip():
        # A trailing newline produces an empty final chunk; skip blank lines
        # instead of failing on the missing text field.
        continue
    content = line.split("\t")
    if len(content) < 2:
        # Fail before touching labels1/texts1 so the two lists stay aligned.
        raise ValueError("36kr_format3.txt line %d has no tab-separated text field" % (i + 1))
    labels1.append(content[0])
    text = content[1]
    # Keep every token that is neither a bare space nor a stop word.
    lists = [word for word in jieba.cut(text)
             if word != " " and word not in stop_word_set]
    texts1.append("/".join(lists))

In [6]:
# Enlarge the training set in place by appending the remaining 36kr
# positive samples (labels1 / texts1) to labels / texts.
# NOTE: the lists are mutated, so re-running this cell appends the samples again.
texts += texts1
labels += labels1

In [7]:
# Build the training DataFrame with two columns: 'text' and 'label'.
trainDF = pandas.DataFrame({'text': texts, 'label': labels})

In [8]:
# Print the shape of each training column, 'text' first and then 'label'.
for column_name in ('text', 'label'):
    print(trainDF[column_name].shape)

(4127,)
(4127,)


In [11]:
# Features (tokenized text) and labels used for the parameter search.
train_x, train_y = trainDF['text'], trainDF['label']

#### 2. 加载测试集

In [15]:
# Load the Toutiao sample file. Per the original note it holds the first
# thousand records of the full Toutiao data: context only, no labels;
# predictions are to be made later with a threshold.
# NOTE: this rebinds the global `texts`, which earlier held the training texts.
stop_word_set = set(stop_words)  # constant-time membership tests inside the token loop
with open('JinRiTouTiao_format_1000.txt', encoding='utf-8') as data:
    texts = []
    for line in data:
        text = line.strip()
        # Keep every token that is neither a bare space nor a stop word.
        lists = [word for word in jieba.cut(text)
                 if word != " " and word not in stop_word_set]
        texts.append("/".join(lists))

# Build the test DataFrame; it has a single 'text' column (there are no labels).
testDF = pandas.DataFrame()
testDF['text'] = texts

In [16]:
# Print the shape of the test 'text' column (one entry per line read from the file).
print(testDF['text'].shape)

(998,)


In [17]:
# Test features: the tokenized text column of the unlabeled test set.
test_x = testDF['text']

In [4]:
# # Split the training data into a training set and a validation set (currently disabled)
# from sklearn import model_selection
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

In [18]:
# Encode the labels as the integer target variable.

from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()

# NOTE: train_y is overwritten with the encoded values; re-running this cell
# refits the encoder on the already-encoded values.
train_y = encoder.fit_transform(train_y)

#### 3. 定义调参函数
#### model：训练数据的模型，eg: 朴素贝叶斯、逻辑回归、支持向量机、随机森林、xgboost等
#### vect: 特征提取方式，词频或tfidf
#### train_x: 特征
#### train_y: 标签

In [19]:
def bestParameter(model, vect, train_x, train_y, parameters=None, cv=None):
    """Grid-search vectorizer settings for a ``vect`` + ``model`` pipeline.

    Builds a two-step pipeline (step names ``'vect'`` and ``'clf'``), runs a
    cross-validated grid search over it, and prints the elapsed time, the
    best score and the best value found for every searched parameter.

    Args:
        model: Unfitted estimator used as the final pipeline step.
        vect: Unfitted text vectorizer used as the first pipeline step.
        train_x: Training documents.
        train_y: Training labels aligned with ``train_x``.
        parameters: Optional parameter grid passed to ``GridSearchCV``. Keys
            must use the pipeline step prefixes (``vect__...`` / ``clf__...``).
            When ``None``, the notebook's default grid over ``max_df``,
            ``min_df`` and ``ngram_range`` of the vectorizer is used.
        cv: Optional cross-validation setting forwarded to ``GridSearchCV``.
            ``None`` keeps the library default.

    Returns:
        Always ``0``; the results are reported through ``print`` only.
    """
    # Imports stay local so the function works regardless of which cells ran before it.
    from time import time
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    # Feature extraction followed by the classifier under evaluation.
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', model),
    ])

    if parameters is None:
        parameters = {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__min_df': (1, 2, 3),
            'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or unigrams + bigrams
        }

    grid_search = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=-1, verbose=1)
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # best_params_ holds exactly the searched parameters of the winning candidate.
    best_parameters = grid_search.best_params_
    for param_name in sorted(best_parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return 0

#### 4. 测试不同模型下的最优参数

#### 4.1 MultinomialNB

#### 1. CountVectorizer

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Search the vectorizer settings for multinomial naive Bayes on raw term counts.
bestParameter(MultinomialNB(), CountVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   11.8s finished


done in 12.874s
Best score: 0.641
Best parameters set:
	vect__max_df: 0.75
	vect__min_df: 1
	vect__ngram_range: (1, 2)


0

#### 2. TfidfVectorizer

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Search the vectorizer settings for multinomial naive Bayes on TF-IDF features.
bestParameter(MultinomialNB(), TfidfVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   12.3s finished


done in 12.663s
Best score: 0.768
Best parameters set:
	vect__max_df: 0.5
	vect__min_df: 3
	vect__ngram_range: (1, 1)


0

#### 4.2 LogisticRegression

#### 1. CountVectorizer

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Search the vectorizer settings for logistic regression on raw term counts.
bestParameter(LogisticRegression(), CountVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   22.4s finished


done in 23.855s
Best score: 0.778
Best parameters set:
	vect__max_df: 0.5
	vect__min_df: 1
	vect__ngram_range: (1, 2)


0

#### 2. TfidfVectorizer

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Search the vectorizer settings for logistic regression on TF-IDF features.
bestParameter(LogisticRegression(), TfidfVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   14.4s finished


done in 15.517s
Best score: 0.816
Best parameters set:
	vect__max_df: 0.75
	vect__min_df: 3
	vect__ngram_range: (1, 2)


0

#### 4.3 SVC（LinearSVC）

#### 1. CountVectorizer

In [20]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

# Search the vectorizer settings for a linear SVM (LinearSVC) on raw term counts.
bestParameter(LinearSVC(), CountVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.7min finished


done in 117.010s
Best score: 0.943
Best parameters set:
	vect__max_df: 0.75
	vect__min_df: 1
	vect__ngram_range: (1, 2)


0

#### 2. TfidfVectorizer

In [21]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

# Search the vectorizer settings for a linear SVM (LinearSVC) on TF-IDF features.
bestParameter(LinearSVC(), TfidfVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.6min finished


done in 101.131s
Best score: 0.933
Best parameters set:
	vect__max_df: 1.0
	vect__min_df: 3
	vect__ngram_range: (1, 1)


0

#### 4.4 随机森林

#### 1. CountVectorizer

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Search the vectorizer settings for a random forest on raw term counts.
# The fixed random_state makes the forest, and so the reported best score and
# parameters, repeatable from run to run.
bestParameter(RandomForestClassifier(random_state=42), CountVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    9.4s finished


done in 10.501s
Best score: 0.806
Best parameters set:
	vect__max_df: 0.75
	vect__min_df: 1
	vect__ngram_range: (1, 2)


0

#### 2. TfidfVectorizer

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Search the vectorizer settings for a random forest on TF-IDF features.
# The fixed random_state makes the forest, and so the reported best score and
# parameters, repeatable from run to run.
bestParameter(RandomForestClassifier(random_state=42), TfidfVectorizer(), train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   12.0s finished


done in 12.373s
Best score: 0.790
Best parameters set:
	vect__max_df: 1.0
	vect__min_df: 2
	vect__ngram_range: (1, 1)


0

#### 4.5 Xgboost

#### 1. CountVectorizer

In [27]:
import xgboost
from sklearn.feature_extraction.text import CountVectorizer

# Search the vectorizer settings for XGBoost on raw term counts.
bestParameter(xgboost.XGBClassifier(), CountVectorizer(), train_x, train_y)

# NOTE(review): the string literal below looks like a pasted record of an earlier run.
# As the cell's last expression, it is what the notebook displays as the cell output.
'''
done in 32.667s
Best score: 0.808
Best parameters set:
    vect__max_df: 0.5
    vect__min_df: 1
    vect__ngram_range: (1, 1)
'''

'\ndone in 32.667s\nBest score: 0.808\nBest parameters set:\n    vect__max_df: 0.5\n    vect__min_df: 1\n    vect__ngram_range: (1, 1)\n'

#### 2. TfidfVectorizer

In [29]:
import xgboost
from sklearn.feature_extraction.text import TfidfVectorizer

# Search the vectorizer settings for XGBoost on TF-IDF features.
bestParameter(xgboost.XGBClassifier(), TfidfVectorizer(), train_x, train_y)

# NOTE(review): the string literal below looks like a pasted record of an earlier run.
# As the cell's last expression, it is what the notebook displays as the cell output.
'''
done in 35.071s
Best score: 0.823
Best parameters set:
    vect__max_df: 0.75
    vect__min_df: 1
    vect__ngram_range: (1, 1)
'''

'\ndone in 35.071s\nBest score: 0.823\nBest parameters set:\n    vect__max_df: 0.75\n    vect__min_df: 1\n    vect__ngram_range: (1, 1)\n'