## 数据构建

In [1]:
import pandas as pd
data=pd.read_excel('data/chinese/复旦大学中文文本分类语料.xlsx','sheet1') 


In [7]:
data.head()

Unnamed: 0,分类,正文
0,艺术,﻿【 文献号 】1-2432\n【原文出处】出版发行研究\n【原刊地名】京\n【原刊期号】1...
1,艺术,﻿【 文献号 】1-2435\n【原文出处】扬州师院学报：社科版\n【原刊期号】199504...
2,艺术,﻿【 文献号 】1-2785\n【原文出处】南通师专学报：社科版\n【原刊期号】199503...
3,艺术,﻿【 文献号 】1-3021\n【原文出处】社会科学战线\n【原刊地名】长春\n【原刊期号】...
4,艺术,﻿【 文献号 】1-3062\n【原文出处】上海文化\n【原刊期号】199505\n【原刊页...


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9249 entries, 0 to 9248
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   分类      9249 non-null   object
 1   正文      9249 non-null   object
dtypes: object(2)
memory usage: 144.6+ KB


In [9]:
data.分类.unique()

array(['艺术', '文学', '哲学', '通信', '能源', '历史', '矿藏', '空间', '教育', '交通', '计算机',
       '环境', '电子', '农业', '体育', '时政', '医疗', '经济', '法律'], dtype=object)

In [None]:
# 对文本数据的正文字段进行分词，这里是在Linux上运行的，可以开启jieba的并行分词模式，分词速度是平常的好多倍，具体看你的CPU核心数
# 设置可靠的自定义词典，以便分词更精准；采用分词效果更好的分词器，如pyltp、THULAC、Hanlp等；编写预处理类，就像下面要谈到的数字特征归一化，去掉文本中的#@￥%……&等等

In [2]:
import jieba
jieba.enable_parallel(56) #并行分词开启
data['文本分词'] = data['正文'].apply(lambda i:jieba.cut(i) )

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.633 seconds.
Prefix dict has been built successfully.


In [3]:
data['文本分词'] = [' '.join(i) for i in data['文本分词']]

In [5]:
data.head()

Unnamed: 0,分类,正文,文本分词
0,艺术,﻿【 文献号 】1-2432\n【原文出处】出版发行研究\n【原刊地名】京\n【原刊期号】1...,﻿ 【 文献号 】 1 - 2432 \n 【 原文 出处 】 出版发行 研究 \n...
1,艺术,﻿【 文献号 】1-2435\n【原文出处】扬州师院学报：社科版\n【原刊期号】199504...,﻿ 【 文献号 】 1 - 2435 \n 【 原文 出处 】 扬州 师院 学报 ：...
2,艺术,﻿【 文献号 】1-2785\n【原文出处】南通师专学报：社科版\n【原刊期号】199503...,﻿ 【 文献号 】 1 - 2785 \n 【 原文 出处 】 南通 师专 学报 ：...
3,艺术,﻿【 文献号 】1-3021\n【原文出处】社会科学战线\n【原刊地名】长春\n【原刊期号】...,﻿ 【 文献号 】 1 - 3021 \n 【 原文 出处 】 社会科学 战线 \n...
4,艺术,﻿【 文献号 】1-3062\n【原文出处】上海文化\n【原刊期号】199505\n【原刊页...,﻿ 【 文献号 】 1 - 3062 \n 【 原文 出处 】 上海 文化 \n 【...


In [14]:
# 这是一个典型的文本多分类问题 
# 针对该问题，采用kaggle上通用的Multi-Class Log-Loss作为评测指标
import numpy as np
def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量（Logarithmic Loss  Metric）的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [4]:
# 用scikit-learn中的LabelEncoder将文本标签（Text Label）转化为数字(Integer)
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.分类.values)


In [5]:
# 将数据分成训练和验证集
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(data.文本分词.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True) # stratify: 为了保持split前类的分布

In [6]:
print (xtrain.shape)
print (xvalid.shape)

(8324,)
(925,)


## 模型构建

### TFIDF+逻辑回归

In [35]:
# 将scikit-learn中的TfidfVectorizer类稍稍改写下，以便将文本中的数字特征统一表示成"#NUMBER"，达到一定的降噪效果
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def number_normalizer(tokens):
    """ 将所有数字标记映射为一个占位符（Placeholder）。
    对于许多实际应用场景来说，以数字开头的tokens不是很有用，
    但这样tokens的存在也有一定相关性。 通过将所有数字都表示成同一个符号，可以达到降维的目的。
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))

In [7]:
stwlist=[line.strip() for line in open('data/chinese/停用词汇总.txt', 'r', encoding='utf-8').readlines()]

In [36]:
tfv = NumberNormalizingVectorizer(min_df=3,  
                                  max_df=0.5,
                                  max_features=None,                 
                                  ngram_range=(1, 2), 
                                  use_idf=True,
                                  smooth_idf=True,
                                  stop_words = stwlist) # max_df最大文档频率；ngram_range默认为(1, 1)，只有unigrams

# 使用TF-IDF来fit训练集和测试集（半监督学习）
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)
# 下面的提示是由于使用了自定义的tokenizer，当停用词使用了自定义的tokenizer做检查时发现不一致就会出现下面的提示

  'stop_words.' % sorted(inconsistent))


In [13]:
#利用提取的TFIDF特征来fit一个简单的Logistic Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print(classification_report(predictions, yvalid))

logloss: 0.607 


ValueError: Classification metrics can't handle a mix of continuous-multioutput and multiclass targets

### TF+逻辑回归

In [37]:
# 使用词汇计数（Word Counts）
ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2),
                      stop_words = stwlist)

# 使用Count Vectorizer来fit训练集和测试集（半监督学习）
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

  'stop_words.' % sorted(inconsistent))


In [15]:
clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
print(classification_report(predictions, yvalid))

logloss: 0.769 


ValueError: Classification metrics can't handle a mix of continuous-multioutput and multiclass targets

### TFIDF+朴素贝叶斯

In [18]:
#利用提取的TFIDF特征来fitNaive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.841 


### TF+朴素贝叶斯

In [19]:
#利用提取的word counts特征来fitNaive Bayes
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 3.780 


### TFIDF+SVD+Stand+SVM

In [20]:
# 由于SVM需要花费大量时间，因此在应用SVM之前，我们将使用奇异值分解（Singular Value Decomposition ）来减少TF-IDF中的特征数量
# 同时，在使用SVM之前，我们还需要将数据标准化（Standardize Data ）
#使用SVD进行降维，components设为120，对于SVM来说，SVD的components的合适调整区间一般为120~200 
from sklearn.svm import SVC
from sklearn import decomposition

svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

#对从SVD获得的数据进行缩放
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [21]:
# 调用下SVM模型
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.344 


### TFIDF+xgboost

In [22]:
# 基于tf-idf特征，使用xgboost
import xgboost as xgb
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.179 


### TF+xgboost

In [23]:
# 基于word counts特征，使用xgboost
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.157 


### TFIDF+SVD+xgboost

In [24]:
# 基于tf-idf的svd特征，使用xgboost
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.392 


### TFIDF+SVD+Stand+xgboost

In [25]:
# 再对经过数据标准化(Scaling)的tf-idf-svd特征使用xgboost
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))


logloss: 0.428 


### 网格搜索+TFIDF+SVD+逻辑回归

In [26]:
# 网格搜索（Grid Search）: 一种超参数优化的技巧，你可以通过获取最优的参数组合来产生良好的文本分类效果
# 在开始网格搜索之前，我们需要创建一个评分函数，这可以通过scikit-learn的make_scorer函数完成的
from sklearn import metrics, pipeline
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [27]:
# 接下来，我们需要一个pipeline。 为了演示，我将使用由SVD（进行特征缩放）和逻辑回归模型组成的pipeline
#SVD初始化
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD()
    
# Standard Scaler初始化
scl = preprocessing.StandardScaler()

# 再一次使用Logistic Regression
lr_model = LogisticRegression()

# 创建pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])

In [28]:
# 接下来我们需要一个参数网格（A Grid of Parameters）：
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}

In [29]:
# 对于SVD，我们评估120和180个分量（Components），对于逻辑回归，我们评估三个不同的学习率C值，其中惩罚函数为l1和l2
# 网格搜索模型（Grid Search Model）初始化
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

#fit网格搜索模型
model.fit(xtrain_tfv, ytrain)  #为了减少计算量，这里我们仅使用xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 2 folds for each of 12 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   3 out of  24 | elapsed:  2.9min remaining: 20.5min
[Parallel(n_jobs=-1)]: Done   6 out of  24 | elapsed:  3.0min remaining:  9.1min
[Parallel(n_jobs=-1)]: Done   9 out of  24 | elapsed:  3.5min remaining:  5.9min
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:  4.0min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:  4.1min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:  4.4min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:  4.5min remaining:   38.6s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.3min finished
Best score: -0.367
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l1'
	svd__n_co

### 网格搜索+TFIDF+SVD+朴素贝叶斯

In [30]:
nb_model = MultinomialNB()

# 创建pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# 搜索参数设置
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# 网格搜索模型（Grid Search Model）初始化
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# fit网格搜索模型
model.fit(xtrain_tfv, ytrain)  # 为了减少计算量，这里我们仅使用xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    6.0s remaining:   17.9s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    6.7s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    6.7s remaining:    4.8s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    6.7s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    6.8s finished
Best score: -0.783
Best parameters set:
	nb__alpha: 0.01


### 基于word2vec的词嵌入

In [8]:
X=data['文本分词']
X=[i.split() for i in X]

In [9]:
# 训练word2vec词向量:
import gensim

model = gensim.models.Word2Vec(X,min_count =5,window =8,vector_size=100)   # X是经分词后的文本构成的list，也就是tokens的列表的列表
embeddings_index = dict(zip(model.wv.index_to_key, model.wv.vectors))

print('Found %s word vectors.' % len(embeddings_index))

Found 119790 word vectors.


In [37]:
# Word2Vec还有3个值得关注的参数，iter是模型训练时迭代的次数，假如参与训练的文本量较少，就需要把这个参数调大一些；sg是模型训练算法的类别，1 代表 skip-gram，;0代表 CBOW;window控制窗口，它指当前词和预测词之间的最大距离，如果设得较小，那么模型学习到的是词汇间的功能性特征（词性相异），如果设置得较大，会学习到词汇之间的相似性特征（词性相同）的大小，假如语料够多，笔者一般会设置得大一些，8~10
embeddings_index['汽车']

array([-2.0753336 , -3.24562   , -1.7440916 , -1.0641257 ,  0.07990439,
       -1.6532873 ,  0.19504979, -0.45225587,  1.4039006 ,  0.3728696 ,
       -4.3212523 ,  1.2374814 , -2.4979463 , -0.34326115,  0.1271953 ,
       -1.9488493 , -1.8547392 , -1.519167  ,  0.02743584,  1.514501  ,
       -0.8076854 ,  2.236369  ,  1.798642  ,  1.1343637 ,  0.6369065 ,
       -1.9509858 ,  0.9435685 , -3.5797098 , -2.8106194 , -0.39179415,
        1.1827027 , -0.9123246 , -1.3226672 ,  1.6173434 ,  1.5838022 ,
       -0.8958759 ,  1.1502815 ,  0.14905936, -1.1554657 ,  0.94903773,
        2.5181758 ,  1.4764676 , -0.00658648, -1.2471029 , -1.0260888 ,
       -2.9987311 , -0.04199202,  2.9657953 ,  0.05197916, -1.8273337 ,
        2.2278676 , -0.6589001 , -3.083529  , -0.73008853,  2.4676669 ,
        1.7655442 , -0.00957161,  2.2353344 ,  0.3395955 , -0.8379713 ,
        4.40888   , -0.27573213,  1.2394832 ,  2.2964668 ,  0.97332007,
       -0.90037125, -0.42723814,  1.3714274 , -2.3780425 ,  1.92

In [10]:
# 该函数会将语句转化为一个标准化的向量（Normalized Vector）
import numpy as np
def sent2vec(s):
    import jieba
    # jieba.enable_parallel() #并行分词开启
    words = str(s).lower()
    words = jieba.lcut(words) # jieba.cut生成的是一个生成器，jieba.lcut 直接生成的就是一个list
    words = [w for w in words if not w in stwlist]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(100)
    return v / np.sqrt((v ** 2).sum())

In [11]:
# 对训练集和验证集使用上述函数，进行文本向量化处理
from tqdm import tqdm
xtrain_w2v = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_w2v = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 8324/8324 [2:18:37<00:00,  1.00it/s]
100%|██████████| 925/925 [14:59<00:00,  1.03it/s]


In [12]:
xtrain_w2v = np.array(xtrain_w2v)
xvalid_w2v = np.array(xvalid_w2v)

### word2vec+xgboost

In [15]:
# 基于word2vec特征在一个简单的Xgboost模型上进行拟合
import xgboost as xgb
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_w2v, ytrain)
predictions = clf.predict_proba(xvalid_w2v)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


logloss: 0.463 


In [16]:
# 基于word2vec特征在一个简单的Xgboost模型上进行拟合
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_w2v, ytrain)
predictions = clf.predict_proba(xvalid_w2v)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


logloss: 0.409 


### word2vec+Stand+FC

In [17]:
# 在使用神经网络前，对数据进行缩放
from sklearn import preprocessing
scl = preprocessing.StandardScaler()
xtrain_w2v_scl = scl.fit_transform(xtrain_w2v)
xvalid_w2v_scl = scl.transform(xvalid_w2v)

In [20]:
# 对标签进行binarize处理
from keras.utils import np_utils
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [21]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization

In [25]:
#创建1个3层的序列神经网络（Sequential Neural Net）
model = Sequential()

model.add(Dense(300, input_dim=100, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(19))
model.add(Activation('softmax'))

# 模型编译
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
model.fit(xtrain_w2v_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_w2v_scl, yvalid_enc))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f33830db310>

### word2vec+LSTM

In [27]:
# 使用 keras tokenizer
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

#对文本序列进行zero填充
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [28]:
# 基于已有的数据集中的词汇创建一个词嵌入矩阵（Embedding Matrix）
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 381293/381293 [00:00<00:00, 847545.80it/s]


In [29]:
# 基于前面训练的Word2vec词向量，使用1个两层的LSTM模型
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     100,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(19))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f329828ae50>

### word2vec+LSTM+earlystopping

In [31]:
# 基于前面训练的Word2vec词向量，使用1个两层的LSTM模型
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     100,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(19))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

#在模型拟合时，使用early stopping这个回调函数（Callback Function）
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


<keras.callbacks.History at 0x7f32982ef110>

### word2vec+BiLSTM+earlystopping

In [32]:
# 基于前面训练的Word2vec词向量，构建1个2层的Bidirectional LSTM 
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     100,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(19))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

#在模型拟合时，使用early stopping这个回调函数（Callback Function）
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


<keras.callbacks.History at 0x7f3280496f90>

### word2vec+GRU+earlystopping

In [33]:
# 基于前面训练的Word2vec词向量，构建1个2层的GRU模型
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     100,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(100, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(19))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

#在模型拟合时，使用early stopping这个回调函数（Callback Function）
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


<keras.callbacks.History at 0x7f325050ec50>

### 模型集成（Model Ensembling）

In [38]:
#创建一个Ensembling主类，具体使用方法见下一个cell
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S", stream=sys.stdout)
logger = logging.getLogger(__name__)


class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: 模型字典 
        :param num_folds: ensembling所用的fold数量
        :param task_type: 分类（classification） 还是回归（regression）
        :param optimize: 优化函数，比如 AUC, logloss, F1等，必须有2个函数，即y_test 和 y_pred
        :param lower_is_better: 优化函数（Optimization Function）的值越低越好还是越高越好
        :param save_path: 模型保存路径
        """

        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def fit(self, training_data, y, lentrain):
        """
        :param training_data: 二维表格形式的训练数据
        :param y: 二进制的, 多分类或回归
        :return: 用于预测的模型链（Chain of Models）

        """

        self.training_data = training_data
        self.y = y

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)

        self.train_prediction_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros((train_prediction_shape[0],
                                                          train_prediction_shape[1] * len(self.model_dict[level])))

        for level in range(self.levels):

            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                        (model_num * self.num_classes):(model_num * self.num_classes) +
                                                       self.num_classes] = temp_train_predictions

                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions
                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                    foldnum += 1
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            logger.info("Saving predictions for level # %d", level)
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_" + str(level) + ".csv"),
                                        index=False, header=None)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == 'classification':
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)

        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((test_prediction_shape[0],
                                                         test_prediction_shape[1] * len(self.model_dict[level])))
        self.test_data = test_data
        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):

                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    if level == 0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    self.test_prediction_dict[level][:, (model_num * self.num_classes): (model_num * self.num_classes) +
                                                                                        self.num_classes] = temp_test_predictions

                else:
                    if level == 0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                                       index=False, header=None)

        return self.test_prediction_dict

In [39]:
#为每个level的集成指定使用数据：
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv], 1: [xtrain_w2v]}
test_data_dict = {0: [xvalid_tfv, xvalid_ctv, xvalid_tfv, xvalid_ctv], 1: [xvalid_w2v]}

model_dict = {0: [LogisticRegression(), LogisticRegression(), MultinomialNB(alpha=0.1), MultinomialNB()],

              1: [xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)]}

ens = Ensembler(model_dict=model_dict, num_folds=3, task_type='classification',
                optimize=multiclass_logloss, lower_is_better=True, save_path='')

ens.fit(train_data_dict, ytrain, lentrain=xtrain_w2v.shape[0])
preds = ens.predict(test_data_dict, lentest=xvalid_w2v.shape[0])

[10:58:35] INFO Found 19 classes
[10:58:35] INFO Training Level 0 Fold # 1. Model # 0
[10:59:18] INFO Predicting Level 0. Fold # 1. Model # 0
[10:59:18] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.779958
[10:59:18] INFO Training Level 0 Fold # 2. Model # 0
[11:00:04] INFO Predicting Level 0. Fold # 2. Model # 0
[11:00:04] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.781615
[11:00:04] INFO Training Level 0 Fold # 3. Model # 0
[11:00:46] INFO Predicting Level 0. Fold # 3. Model # 0
[11:00:46] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.757153
[11:00:46] INFO Level 0. Model # 0. Mean Score = 0.772909. Std Dev = 0.011162
[11:00:46] INFO Training Level 0 Fold # 1. Model # 1
[11:05:23] INFO Predicting Level 0. Fold # 1. Model # 1
[11:05:23] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.504900
[11:05:23] INFO Training Level 0 Fold # 2. Model # 1
[11:09:49] INFO Predicting Level 0. Fold # 2. Model # 1
[11:09:49] INFO Level 0. Fold # 2. Model # 1. Va

In [40]:
print ("logloss: %0.3f " % multiclass_logloss(yvalid, preds[1]))

logloss: 0.298 
