In [5]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Root of the mounted Drive; `path` is reassigned to the project folder in the next cell.
path = "/content/drive/My Drive"


Mounted at /content/drive


In [9]:
import os
# Project folder on the mounted Drive. Changing into it lets the later cells
# use relative paths such as './train_set.csv'.
path = "/content/drive/My Drive//NLP/NLP_30Days/"
os.chdir(path)
# Last expression of the cell: displays the folder contents.
os.listdir(path)

['day02.ipynb',
 'train_set.csv',
 'test_a.csv',
 'train_set.csv.zip',
 'day01.ipynb']

### $任务3：使用 TFIDF 提取文本特征$

1. 学习TFIDF的原理

2. 学会使用CountVectorizer

3. 学会使用TfidfVectorizer

#### $TFIDF原理$

TfidfVectorizer()基于TF-IDF算法。此算法包括两部分TF和IDF，两者相乘得到TF-IDF算法。

TF算法统计某训练文本中，某个词的出现次数，计算公式如下：

$$词频TF = \frac{某个词在单个文本中出现的次数}{该文本的总词数}$$

IDF算法用于调整词频的权重系数，如果一个词越常见，那么分母就越大，逆文档频率就越小越接近0.

$$逆文档率IDF = \log\frac{文本总个数}{包含该词的文本个数}$$

sklearn中IDF的计算公式与一般书中介绍的不一样:
$$逆文档率IDF(x) = \log\frac{N+1}{N(x)+1} + 1 $$
N=训练集文本总数, N(x)=包含词x的文本数

#### $CountVectorizer的使用$

Bag of Words（词袋表示），也称为Count Vectors，每个文档的字/词可以使用其出现次数来进行表示。

在sklearn中可以直接CountVectorizer来实现这一步骤：



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Four-sentence toy corpus used to demonstrate CountVectorizer.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Last expression of the cell: echoes the list as the cell output.
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [None]:
# Learn the vocabulary from the toy corpus and build its document-term count
# matrix in one step, using CountVectorizer's default settings.
vectorizer = CountVectorizer()
vec_fit = vectorizer.fit_transform(corpus)

In [None]:
# Show the learned vocabulary twice: the raw token -> column-index mapping,
# then the same (token, index) pairs ordered by column index.
print('语料单词及编号：')
print(vectorizer.vocabulary_)
print(sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1]))

语料单词及编号：
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[('and', 0), ('document', 1), ('first', 2), ('is', 3), ('one', 4), ('second', 5), ('the', 6), ('third', 7), ('this', 8)]


In [None]:
# Sparse view of the counts: each printed line is (document index, token index)
# followed by the count. For example "(0, 8) 1" means the first text,
# 'This is the first document.', contains token 8 ('this') once.
print('每个文本含有的单词情况：')
print(vec_fit)

每个文本含有的单词情况：
  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


In [None]:
# Dense document-term count matrix. The first row has a count of 1 at token
# indices 1, 2, 3, 6 and 8; the remaining rows read the same way.
print('文本单词频数矩阵:')
print(vec_fit.toarray())

文本单词频数矩阵:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


#### $TfidfVectorizer的使用$

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Same four-sentence toy corpus as in the CountVectorizer section, redefined
# here for the TfidfVectorizer walkthrough.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Last expression of the cell: echoes the list as the cell output.
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [None]:
tfidf  = TfidfVectorizer(norm=None)  # norm is the normalisation option: norm='l2' L2-normalises each text vector; None (used here) keeps raw tf * idf
tfidf_fit = tfidf.fit_transform(corpus)


In [None]:
# Vocabulary learned by the TF-IDF vectorizer: the raw token -> index mapping,
# then the (token, index) pairs ordered by index.
print('语料单词及编号：')
print(tfidf.vocabulary_)
print(sorted(tfidf.vocabulary_.items(), key=lambda pair: pair[1]))

语料单词及编号：
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[('and', 0), ('document', 1), ('first', 2), ('is', 3), ('one', 4), ('second', 5), ('the', 6), ('third', 7), ('this', 8)]


In [None]:
import numpy as np
# Per-token IDF weights (rounded to 3 decimals) and the shape of the IDF vector.
print('IDF值：')
print(np.round(tfidf.idf_, 3))
print(tfidf.idf_.shape)

IDF值：
[1.916 1.223 1.511 1.    1.916 1.916 1.    1.916 1.   ]
(9,)


In [None]:
# Dense, un-normalised TF-IDF matrix for the toy corpus, rounded to 3 decimals.
print('TF-IDF值：')
print(np.round(tfidf_fit.toarray(), 3))

TF-IDF值：
[[0.    1.223 1.511 1.    0.    0.    1.    0.    1.   ]
 [0.    2.446 0.    1.    0.    1.916 1.    0.    1.   ]
 [1.916 0.    0.    1.    1.916 0.    1.    1.916 1.   ]
 [0.    1.223 1.511 1.    0.    0.    1.    0.    1.   ]]


In [None]:
# Raw term counts from the CountVectorizer fit, used as the TF factor below.
print('TF矩阵：')
print(vec_fit.toarray())

TF矩阵：
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [None]:
# Check the IDF of 'document' by hand: it appears in 3 of the 4 texts, so the
# formula gives log((4 + 1) / (3 + 1)) + 1.
print('计算 document 的idf值：')
print(np.log((4 + 1) / (3 + 1)) + 1)

计算document的idf值：
1.2231435513142097


In [None]:
# Reproduce the TF-IDF matrix by hand: raw counts multiplied column-wise by the
# IDF weights, which are rounded to 3 decimals first.
print('计算tf * idf:')
print(vec_fit.toarray() * np.round(tfidf.idf_, 3))

计算tf * idf:
[[0.    1.223 1.511 1.    0.    0.    1.    0.    1.   ]
 [0.    2.446 0.    1.    0.    1.916 1.    0.    1.   ]
 [1.916 0.    0.    1.    1.916 0.    1.    1.916 1.   ]
 [0.    1.223 1.511 1.    0.    0.    1.    0.    1.   ]]


In [None]:
# Use norm='l2' to normalise each text vector.
tfidf_l2 = TfidfVectorizer(norm='l2')
tfidf_l2_fit = tfidf_l2.fit_transform(corpus)

In [None]:
# IDF weights of the l2-normalised vectorizer, rounded to 3 decimals.
print(np.round(tfidf_l2.idf_, 3))

[1.916 1.223 1.511 1.    1.916 1.916 1.    1.916 1.   ]


In [None]:
# Dense L2-normalised TF-IDF matrix, rounded to 3 decimals.
print(np.round(tfidf_l2_fit.toarray(), 3))

[[0.    0.47  0.58  0.384 0.    0.    0.384 0.    0.384]
 [0.    0.688 0.    0.281 0.    0.539 0.281 0.    0.281]
 [0.512 0.    0.    0.267 0.512 0.    0.267 0.512 0.267]
 [0.    0.47  0.58  0.384 0.    0.    0.384 0.    0.384]]


In [None]:
# Raw count matrix again, for comparison with the normalised TF-IDF values.
print(vec_fit.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [None]:
# Sparse view of the L2-normalised TF-IDF matrix at full precision.
print(tfidf_l2_fit)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


### $任务4：使用 TFIDF 特征 和 线性模型完成训练和预测$

1. 使用TFIDF提取训练集和测试集特征

2. 使用线性模型（LR等）完成模型的训练和预测

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier,LogisticRegression
from tqdm import tqdm
from sklearn.metrics import f1_score,accuracy_score
import pandas as pd

In [11]:
# Load the tab-separated train and test files, relative to the working
# directory set by os.chdir() earlier in the notebook.
train_df = pd.read_csv('./train_set.csv', sep='\t')
test_df = pd.read_csv('./test_a.csv', sep='\t')

In [12]:
# (rows, columns) of the training frame.
train_df.shape

(200000, 2)

In [13]:
# (rows, columns) of the test frame.
test_df.shape

(50000, 1)

In [15]:
# Pull out the text columns and stack train + test into a single Series; the
# TF-IDF vectorizer in the next cell is fitted on this combined text.
train_text = train_df['text']
test_text = test_df['text']
all_text = pd.concat([train_text, test_text])
all_text.shape

(250000,)

In [16]:
%%time
# Word-level, unigram-only TF-IDF capped at 10,000 features.
# NOTE(review): this rebinds `tfidf` / `tfidf_fit` from the toy example above.
# NOTE(review): stop_words='english' only has an effect if the text contains
# English words — confirm against the data.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1,1),
    max_features=10000)

# The vectorizer is fitted on train + test text combined, so vocabulary and IDF
# statistics include the test set.
# NOTE(review): fit on train_text only if a strict hold-out is required.
tfidf_fit = tfidf.fit(all_text)
train_word_features = tfidf_fit.transform(train_text)
test_word_features = tfidf_fit.transform(test_text)
# Last expression of the cell: displays the training feature matrix summary.
train_word_features

CPU times: user 4min 3s, sys: 1.43 s, total: 4min 4s
Wall time: 4min 4s


In [18]:
# Prepare the training, validation and test sets.
X_train = train_word_features
y_train = train_df['label']

# Hold out 20% of the training data for validation. A fixed random_state keeps
# the split, and therefore the reported scores, reproducible across kernel
# restarts; 42 matches the seed used for the later XGBoost split.
x_train_, x_valid_, y_train_, y_valid_ = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
X_test = test_word_features

In [21]:
#TF-IDF + LR
%%time
clf = LogisticRegression(C=4, n_jobs=16) #设置惩罚项C=4
clf.fit(x_train_, y_train_)

y_pred = clf.predict(x_valid_)
train_scores = clf.score(x_train_, y_train_)
print(train_scores, f1_score(y_pred, y_valid_, average='macro'))

0.94926875 0.917841513930204
CPU times: user 1.57 s, sys: 937 ms, total: 2.51 s
Wall time: 2min 10s


In [27]:
#TF-IDF + Ridge
%%time
clf = RidgeClassifier(alpha=4)
clf.fit(x_train_, y_train_)

y_pred = clf.predict(x_valid_)
train_scores = clf.score(x_train_, y_train_)
print(train_scores, f1_score(y_pred, y_valid_, average='macro'))

0.9237375 0.8912840859180118


### $任务5：使用 TFIDF 特征 和 XGBoost完成训练和预测$

1. 使用TFIDF提取训练集和测试集特征

2. 使用XGBoost完成模型的训练和预测

In [28]:
#TF-IDF + XGBOOST
from xgboost.sklearn import XGBClassifier

In [29]:
class XGB:
  """Small wrapper around XGBClassifier: train, evaluate and grid-search.

  Uses XGBClassifier, GridSearchCV, accuracy_score and f1_score, which are
  imported in earlier cells of this notebook.
  """

  def __init__(self, X_df, y_df):
    """Store the training features ``X_df`` and labels ``y_df``."""
    self.X = X_df
    self.y = y_df

  def train(self, param, eval_set=None):
    """Fit an XGBClassifier built from ``param`` and print training metrics.

    Args:
      param: dict of keyword arguments forwarded to ``XGBClassifier``.
      eval_set: optional list of ``(X, y)`` pairs monitored for early
        stopping. When omitted, the training data itself is monitored (the
        previous behaviour); pass a held-out set for early stopping to
        reflect generalisation.

    The fitted estimator is stored on ``self.model``.
    """
    if eval_set is None:
      eval_set = [(self.X, self.y)]

    self.model = XGBClassifier(**param)
    # NOTE(review): eval_metric / early_stopping_rounds are passed to fit() as
    # before; confirm the installed xgboost version still accepts them there.
    self.model.fit(self.X, self.y, eval_set=eval_set,
            eval_metric=['mlogloss'],
            early_stopping_rounds=10,  # stop once the metric has not improved for 10 consecutive rounds
            verbose=False
            )

    # Model evaluation on the training data. f1_score needs predicted class
    # labels, not the probability matrix from predict_proba.
    train_result = self.model.predict(self.X)
    train_acc = accuracy_score(self.y, train_result)
    train_f1 = f1_score(self.y, train_result, average='macro')

    print("Train acc: %.2f%% Train F1: %.2f" % (train_acc*100.0, train_f1))

  def test(self, X_test, y_test):
    """Print accuracy and macro-F1 (both as percentages) on ``X_test`` / ``y_test``.

    Requires ``train`` to have been called first so that ``self.model`` exists.
    """
    result = self.model.predict(X_test)
    acc = accuracy_score(y_test, result)
    f1 = f1_score(y_test, result, average='macro')

    # Both values are scaled to percent to match the '%%' in the format string.
    print("acc: %.2f%% F1_score: %.2f%%" % (acc*100.0, f1*100.0))

  def grid(self, param_grid):
    """Grid-search ``param_grid`` with 2-fold CV scored by macro-F1.

    Args:
      param_grid: parameter grid passed to ``GridSearchCV``.

    Returns:
      Tuple ``(best_params, best_score)`` from the fitted search.
    """
    self.param_grid = param_grid
    xgb_model = XGBClassifier(nthread=20)
    clf = GridSearchCV(xgb_model, self.param_grid, scoring='f1_macro', cv=2, verbose=1)
    clf.fit(self.X, self.y)
    print("Best score: %f using parms: %s" % (clf.best_score_, clf.best_params_))
    return clf.best_params_, clf.best_score_


In [30]:
# Feed only the first 500 TF-IDF columns to the model and re-split with a fixed
# seed. These are the first 500 columns by column index, not a ranking of the
# features.
x_train_, x_valid_, y_train_, y_valid_ = train_test_split(
    X_train[:, :500],
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
X_test = test_word_features[:, :500]

In [None]:
%%time
# Hyper-parameters forwarded to XGBClassifier by XGB.train().
param = {'learning_rate': 0.05, # xgboost's "eta"
      'objective': 'multi:softmax', 
      'n_jobs': 16,
      'n_estimators': 300, # number of trees
      'max_depth': 10,               
      'gamma': 0.5,   # coefficient on the number of leaves in the penalty term. Increasing this value will make model more conservative.
      'reg_alpha': 0,  # L1 regularization term on weights. Increasing this value will make model more conservative.
      'reg_lambda': 2,  # L2 regularization term on weights. Increasing this value will make model more conservative.
      'min_child_weight' : 1, # minimum weight of a leaf node
      'subsample':0.8,     # randomly sample 80% of the rows to build each tree
      'random_state':1     # random seed
      }
# Fit on the training split, then report metrics on the validation split.
model = XGB(x_train_, y_train_)
model.train(param)
model.test(x_valid_, y_valid_)

In [None]:
# Refit on the full training set for the submission. X_test was reduced to its
# leading TF-IDF columns in an earlier cell, so slice X_train to the same width;
# otherwise the model is trained and asked to predict on different feature counts.
xgb_model = XGB(X_train[:, :X_test.shape[1]], y_train)
xgb_model.train(param)

# NOTE(review): the sample submission file is not in the directory listing shown
# at the top of the notebook — confirm it exists before running this cell.
submission = pd.read_csv('./test_a_sample_submit.csv')
preds = xgb_model.model.predict(X_test)
submission['label'] = preds
submission.to_csv('./xgb_submission.csv', index=False)