In [8]:
#-*-coding:utf-8-*-
import sys
import pickle
import traceback
from pprint import pprint
import numpy as np
import pandas as pd
from gensim import corpora,models

from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop
from keras.layers import Activation

Using TensorFlow backend.


In [9]:
from config import *
import jieba

In [10]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

In [11]:
# 加载训练集合
df_train_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')

In [15]:
# 加载测试集
df_test_dataset = pd.read_csv('./data/Preliminary-texting-1.csv', encoding='utf-8')

In [16]:
df_test_dataset.shape

(102024, 2)

In [17]:
df_train_dataset = df_train_dataset[['COMMCONTENT', 'COMMLEVEL']]
df_test_dataset = df_test_dataset[['COMMCONTENT']]

In [18]:
df_train_dataset.shape[0]

20000

In [19]:
df_all_dataset = pd.concat([df_train_dataset, df_test_dataset], ignore_index=True)

In [21]:
df_all_dataset.tail()

Unnamed: 0,COMMCONTENT,COMMLEVEL
122019,主要是人太多了，太挤了,
122020,人少，温泉太旧，池子水咋的，有的没开,
122021,隐形收费，批东西贵得要死，坑人！,
122022,山不高，但是很累，3个小时爬上去，一个小时下山。感受了一下红色旅游景点。,
122023,占地不小，但没什么人气，从世园会结束后就没什么人去了,


In [75]:
MAX_SEQUENCE_LENGTH = 150 # 每篇文章选取150个词

MAX_NB_WORDS = 80000 # 将字典设置为含有1万个词84480

EMBEDDING_DIM = 300 # 词向量维度，300维

VALIDATION_SPLIT = 0.1 # 测试集大小，全部数据的20%

BATCH_SIZE = 128

In [23]:
COMMCONTENT_SEG = []

for sent in df_all_dataset['COMMCONTENT']:

    # Extract Sentence
    sent = str(sent).strip()

    seg_list = jieba.cut(sent, cut_all=False)

    seg_list = [i for i in seg_list if i != ' ']
    
    COMMCONTENT_SEG.append(" ".join(seg_list))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5n/2_by50851fxc4d_snc1d9wf80000gn/T/jieba.cache
Loading model cost 0.822 seconds.
Prefix dict has been built succesfully.


In [24]:
df_all_dataset['COMMCONTENT_SEG'] = pd.DataFrame(COMMCONTENT_SEG,columns=['COMMCONTENT_SEG'])

In [26]:
df_all_dataset['COMMCONTENT_SEG'].tail()

122019                                    主要 是 人太多 了 ， 太挤 了
122020                       人少 ， 温泉 太旧 ， 池子 水 咋 的 ， 有 的 没开
122021                            隐形 收费 ， 批 东西 贵得 要死 ， 坑人 ！
122022    山不高 ， 但是 很累 ， 3 个 小时 爬上去 ， 一个 小时 下山 。 感受 了 一下 ...
122023            占地 不小 ， 但 没什么 人气 ， 从 世园 会 结束 后 就 没什么 人去 了
Name: COMMCONTENT_SEG, dtype: object

In [27]:
text_corpus = df_all_dataset['COMMCONTENT_SEG']

In [28]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # 传入我们词向量的字典

tokenizer.fit_on_texts(text_corpus) # 传入我们的训练数据，得到训练数据中出现的词的字典

In [29]:
dataset_sequences = tokenizer.texts_to_sequences(text_corpus) # 根据训练数据中出现的词的字典，将训练数据转换为sequences

In [30]:
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 100134 unique tokens.


In [31]:
padded_dataset_sequences = pad_sequences(dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH) # 限制每篇文章的长度

In [32]:
padded_dataset_sequences.shape

(122024, 150)

In [33]:
df_test_dataset_seg = df_all_dataset['COMMCONTENT_SEG'][20000:]

In [34]:
test_dataset_sequences = tokenizer.texts_to_sequences(df_test_dataset_seg)

In [35]:
padded_test_dataset_sequences = pad_sequences(test_dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [38]:
padded_test_dataset_sequences.shape

(102024, 150)

In [39]:
# label one hot表示
labels = df_all_dataset['COMMLEVEL'].dropna().map(int)#.values.tolist()
labels = to_categorical(labels-1) 

In [40]:
labels

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [41]:
print('Shape of data tensor:', padded_dataset_sequences.shape)
print('Shape of label tensor:', len(labels))

Shape of data tensor: (122024, 150)
Shape of label tensor: 20000


In [42]:
labels

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [43]:
len(padded_dataset_sequences[:df_train_dataset.shape[0]])

20000

In [44]:
padded_dataset_sequences[:df_train_dataset.shape[0]]

array([[    0,     0,     0, ...,   586,   295,   714],
       [    0,     0,     0, ...,  2558,   230,     7],
       [    0,     0,     0, ...,   332,    59,   742],
       ...,
       [    0,     0,     0, ...,     6, 61456,     3],
       [    0,     0,     0, ..., 43363,     6,     6],
       [    0,     0,     0, ...,   129,    65,  4692]], dtype=int32)

In [45]:
len(df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]])

20000

In [46]:
df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]]

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
5        1.0
6        1.0
7        1.0
8        1.0
9        1.0
10       1.0
11       1.0
12       1.0
13       1.0
14       1.0
15       1.0
16       1.0
17       1.0
18       1.0
19       1.0
20       1.0
21       1.0
22       1.0
23       1.0
24       1.0
25       1.0
26       1.0
27       1.0
28       1.0
29       1.0
        ... 
19970    3.0
19971    3.0
19972    3.0
19973    3.0
19974    3.0
19975    3.0
19976    3.0
19977    3.0
19978    3.0
19979    3.0
19980    3.0
19981    3.0
19982    3.0
19983    3.0
19984    3.0
19985    3.0
19986    3.0
19987    3.0
19988    3.0
19989    3.0
19990    3.0
19991    3.0
19992    3.0
19993    3.0
19994    3.0
19995    3.0
19996    3.0
19997    3.0
19998    3.0
19999    3.0
Name: COMMLEVEL, Length: 20000, dtype: float64

In [47]:
padded_dataset_sequences[:df_train_dataset.shape[0]]

array([[    0,     0,     0, ...,   586,   295,   714],
       [    0,     0,     0, ...,  2558,   230,     7],
       [    0,     0,     0, ...,   332,    59,   742],
       ...,
       [    0,     0,     0, ...,     6, 61456,     3],
       [    0,     0,     0, ..., 43363,     6,     6],
       [    0,     0,     0, ...,   129,    65,  4692]], dtype=int32)

In [76]:
train_X,valid_X,train_y,valid_y =train_test_split(padded_dataset_sequences[:df_train_dataset.shape[0]], 
                                                  df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]], 
                                                  test_size=0.1)

In [77]:
train_X.shape

(18000, 150)

In [78]:
len(valid_X)

2000

In [52]:
vocab,vocab_freqs = build_vocab(df_all_dataset['COMMCONTENT_SEG'])

In [53]:
len(vocab_freqs)

100793

In [54]:
vocab_size = min(MAX_NB_WORDS, len(vocab_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(vocab_freqs.most_common(MAX_NB_WORDS))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v:k for k, v in word2index.items()}

In [55]:
len(word2index)

80002

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

tokenizer.fit_on_texts(df_dataset['COMMCONTENT_SEG'])

In [None]:
dataset_sequences = tokenizer.texts_to_sequences(df_dataset['COMMCONTENT_SEG'])

In [None]:
padded_dataset_sequences = pad_sequences(dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
padded_dataset_sequences[0]

In [56]:
print('Indexing word embeddings.')  
embeddings_index = {}
with open('./embeddings/sgns.weibo.word','r') as f:
    f = f.readlines()
    for i in f[:]:
        values = i.strip().split(' ')
#         print(values)
        word = str(values[0])
        embedding = np.asarray(values[1:],dtype='float')
        embeddings_index[word] = embedding
print('word embedding',len(embeddings_index))

Indexing word embeddings.
word embedding 195202


In [57]:
nb_words = min(MAX_NB_WORDS,len(word2index))
nb_words

80000

In [58]:
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

In [59]:
for word, i in word2index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [60]:
word_embedding_matrix[:10]

array([[ 0.27259 ,  0.244615,  0.032857, ..., -0.199684, -0.084092,
         0.060737],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.094386, -0.200944, -0.030828, ...,  0.003085,  0.023796,
        -0.201742],
       ...,
       [ 0.190794, -0.037967,  0.1013  , ..., -0.302136, -0.126407,
        -0.178464],
       [ 0.175443,  0.239842,  0.210521, ...,  0.071008,  0.177222,
        -0.062866],
       [-0.230501, -0.152982,  0.207998, ...,  0.007232, -0.494047,
        -0.179105]])

In [61]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential


from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM,SimpleRNN
from keras.layers import Activation

from keras.callbacks import Callback
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.models import load_model

from keras.utils.vis_utils import plot_model

from keras.utils.np_utils import to_categorical

array([1, 2, 3])

In [82]:
def text_cnn(maxlen=MAX_SEQUENCE_LENGTH, max_features=2000, embed_size=32):
    
    # Inputs
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Embeddings layers
    emb_comment =  Embedding(input_dim = MAX_NB_WORDS+1, 
                             output_dim = EMBEDDING_DIM, 
#                              weights=[word_embedding_matrix], 
                             input_length=MAX_SEQUENCE_LENGTH, 
                             mask_zero=False,
                             trainable=True
                            )(sequence_input)
        # Embeddings layers
    pre_emb_comment =  Embedding(input_dim = MAX_NB_WORDS+1, 
                             output_dim = EMBEDDING_DIM, 
                             weights=[word_embedding_matrix], 
                             input_length=MAX_SEQUENCE_LENGTH, 
                             mask_zero=False,
                             trainable=True
                            )(sequence_input)

    # conv layers
    convs = []
    
    filter_sizes = [2, 3, 4, 5]
    
    for fsz in filter_sizes:
        l_conv = Conv1D(filters=100, kernel_size=fsz, activation='tanh')(pre_emb_comment)
        
        l_pool = MaxPooling1D(maxlen - fsz + 1)(l_conv)
        
        l_pool = Flatten()(l_pool)
        
        convs.append(l_pool)
    
    merge = concatenate(convs, axis=1)

    out = Dropout(0.25)(merge)
    
    output = Dense(256, activation='tanh')(out)

    output = Dense(len(np.unique(np.unique(df_all_dataset['COMMLEVEL'].dropna().map(int)))), activation='softmax')(output)

    # model = Model([sequence_input], output)
    # model = Model(inputs=sequence_input, output)
    model = Model(sequence_input, output)
    
    #  adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    #  model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    # 优化器我这里用了adadelta，也可以使用其他方法
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [85]:
model = text_cnn()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 150, 300)     24000300    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 149, 100)     60100       embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 148, 100)     90100       embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_7 (

In [152]:
# train_X,valid_X,train_y,valid_y =train_test_split(padded_dataset_sequences, df_dataset['COMMLEVEL'], test_size=0.2)

In [64]:
train_X.shape,valid_X.shape

((16000, 150), (4000, 150))

In [65]:
train_y.shape,valid_y.shape

((16000,), (4000,))

In [185]:
train_X

array([[   0,    0,    0, ...,    1,    8, 1649],
       [   0,    0,    0, ...,   10,   12,  138],
       [   0,    0,    0, ...,   87,   53, 2362],
       ...,
       [   0,    0,    0, ...,   20,   64,   32],
       [   0,    0,    0, ...,   67,  171,    2],
       [   0,    0,    0, ..., 2046, 2046,    3]], dtype=int32)

In [83]:
import keras
#写一个LossHistory类，保存loss和acc
class LossHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = {'batch': [], 'epoch': []}
        self.accuracy = {'batch': [], 'epoch': []}
        self.val_loss = {'batch': [], 'epoch': []}
        self.val_acc = {'batch': [], 'epoch': []}

    def on_batch_end(self, batch, logs={}):
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))
        self.val_loss['batch'].append(logs.get('val_loss'))
        self.val_acc['batch'].append(logs.get('val_acc'))

    def on_epoch_end(self, batch, logs={}):
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type):
        iters = range(len(self.losses[loss_type]))
        #创建一个图
        plt.figure()
        # acc
        plt.plot(iters, self.accuracy[loss_type], 'r', label='train acc')#plt.plot(x,y)，这个将数据画成曲线
        # loss
        plt.plot(iters, self.losses[loss_type], 'g', label='train loss')
        if loss_type == 'epoch':
            # val_acc
            plt.plot(iters, self.val_acc[loss_type], 'b', label='val acc')
            # val_loss
            plt.plot(iters, self.val_loss[loss_type], 'k', label='val loss')
        plt.grid(True)#设置网格形式
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')#给x，y轴加注释
        plt.legend(loc="upper right")#设置图例显示位置
        plt.show()

In [67]:
to_categorical(train_y.map(int)-1, num_classes=None)

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [86]:
# 0.1 valid data
batch_size = 256
epochs = 5

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
#创建一个实例LossHistory
history = LossHistory()

model.fit(x=train_X, y=to_categorical(train_y.map(int)-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y.map(int)-1, num_classes=None)),
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=epochs,
                    verbose=1
         )

Train on 18000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1479e20f0>

In [87]:
all_test_preds2 = model.predict(padded_test_dataset_sequences, batch_size=256)
w2v4 = np.argmax(all_test_preds2,axis=1)[:]+1
len(w2v4)

102024

In [89]:
pd.Series(w2v4).value_counts(normalize=True)

1    0.385037
3    0.315788
2    0.299175
dtype: float64

In [97]:
pd.Series(np.argmax((all_test_preds2*0.7+all_test_preds*0.3),axis=1)+1).value_counts(normalize=True)

1    0.381322
3    0.310496
2    0.308182
dtype: float64

In [68]:
batch_size = 256
epochs = 5

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
#创建一个实例LossHistory
history = LossHistory()

model.fit(x=train_X, y=to_categorical(train_y.map(int)-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y.map(int)-1, num_classes=None)),
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=epochs,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x147688048>

In [91]:
all_test_preds

array([[6.2206060e-01, 8.9762181e-02, 2.8817725e-01],
       [2.5641244e-02, 4.8368481e-01, 4.9067396e-01],
       [8.6228114e-01, 1.3433319e-02, 1.2428550e-01],
       ...,
       [1.8361310e-05, 9.3378540e-04, 9.9904782e-01],
       [3.2784998e-01, 3.5401651e-01, 3.1813359e-01],
       [3.0305763e-03, 7.9328582e-02, 9.1764081e-01]], dtype=float32)

In [69]:
all_test_preds = model.predict(padded_test_dataset_sequences, batch_size=256)
w2v3 = np.argmax(all_test_preds,axis=1)[:]+1

In [88]:
len(w2v3)

102024

In [72]:
pd.Series(w2v3).value_counts(normalize=True)

1    0.364316
2    0.339450
3    0.296234
dtype: float64

In [73]:
np.savetxt("all_testset_preds.txt", w2v3,fmt="%d")

In [74]:
with open('./all_testset_preds.txt', 'r') as f:
    lines = f.readlines()
    print(len(lines))

102024


In [110]:
batch_size = 256
epochs = 5

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
#创建一个实例LossHistory
history = LossHistory()

model.fit(x=train_X, y=to_categorical(train_y.map(int)-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y.map(int)-1, num_classes=None)),
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=epochs,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13a2e0d30>

In [166]:
batch_size = 256
epochs = 10

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
#创建一个实例LossHistory
history = LossHistory()

model.fit(x=train_X, y=to_categorical(train_y.map(int)-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y.map(int)-1, num_classes=None)),
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=8,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x13c9231d0>

In [94]:
history = LossHistory()
model.fit(x=padded_dataset_sequences[:df_train_dataset.shape[0]],
          y=labels, 
          batch_size=128, 
          epochs=10, 
          verbose=1, 
          callbacks=[history],
          validation_split=0.2, 
          shuffle=True
          )

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 1152/16000 [=>............................] - ETA: 2:33 - loss: 0.5806 - acc: 0.7674

KeyboardInterrupt: 

In [215]:
# with emb

In [245]:
batch_size = 256
epochs = 7

# model.fit(x_train, y_train,
#           validation_split=0.1,
#           batch_size=batch_size,
#           epochs=epochs,
#           shuffle=True)
#创建一个实例LossHistory
history = LossHistory()

model.fit(x=train_X, y=to_categorical(train_y.map(int)-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y.map(int)-1, num_classes=None)),
                    batch_size=batch_size, 
                    #callbacks=[checkpoint],
                    callbacks=[history],
                    epochs=epochs,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1ec68c6a0>

In [247]:
score, acc = model.evaluate(valid_X[:],to_categorical(valid_y-1, num_classes=None)[:], batch_size=batch_size)
#     print(score, acc)
print('test_loss: %f, accuracy: %f' % (score, acc))

test_loss: 1.249573, accuracy: 0.649750


In [None]:
import matplotlib.pyplot as plt
history.loss_plot('epoch')

In [None]:
padded_dataset_sequences[df_train_dataset.shape[0]:]

In [138]:
# 对图像进行分类
preds = model.predict(padded_dataset_sequences[df_train_dataset.shape[0]:])

In [144]:
# 输出预测概率
print('Predicted:', preds.view())

Predicted: [[5.2982956e-01 2.2400606e-01 2.4616444e-01]
 [4.1301711e-03 2.1600150e-02 9.7426969e-01]
 [9.4749469e-01 4.7617290e-02 4.8880223e-03]
 ...
 [9.5940363e-01 3.4636229e-02 5.9601241e-03]
 [2.7177039e-01 5.9248245e-01 1.3574722e-01]
 [1.8757084e-04 5.3510573e-02 9.4630182e-01]]


In [139]:
len(preds)

65499

In [127]:
for i in preds:
    print(out_classes(i))

0.22400606


In [None]:
def out_classes(preds):
    if(preds[0])

In [145]:
result = np.argmax(preds,axis=1)+1

In [143]:
np.savetxt("result.txt", result,fmt="%d")

In [175]:
result[:20]

array([1, 3, 1, 2, 1, 1, 2, 2, 3, 2, 1, 2, 2, 2, 2, 1, 3, 2, 1, 2])

In [151]:
preds[:18]

array([[0.52982956, 0.22400606, 0.24616444],
       [0.00413017, 0.02160015, 0.9742697 ],
       [0.9474947 , 0.04761729, 0.00488802],
       [0.28808346, 0.6831014 , 0.02881507],
       [0.4397474 , 0.3284209 , 0.23183167],
       [0.88036036, 0.10871738, 0.01092222],
       [0.00354895, 0.88335264, 0.11309841],
       [0.3168921 , 0.4097239 , 0.273384  ],
       [0.00416835, 0.16871381, 0.8271178 ],
       [0.3726424 , 0.57038784, 0.0569697 ],
       [0.72266513, 0.21734984, 0.05998506],
       [0.22898637, 0.76364267, 0.00737093],
       [0.22595192, 0.7005057 , 0.07354242],
       [0.0170265 , 0.6762607 , 0.3067128 ],
       [0.09329221, 0.74562746, 0.16108032],
       [0.8487967 , 0.14513786, 0.00606542],
       [0.3381511 , 0.31342646, 0.3484225 ],
       [0.22215328, 0.58964825, 0.18819852]], dtype=float32)

In [167]:
# 对图像进行分类
preds = model.predict(padded_dataset_sequences[df_train_dataset.shape[0]:])

In [174]:
np.argmax(preds,axis=1)[:20]+1

array([2, 3, 1, 2, 3, 1, 2, 2, 3, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2])

In [None]:
array([1, 3, 1, 2, 1, 1, 2, 2, 3, 2, 1, 2, 2, 2, 2, 1, 3, 2, 1, 2])

In [179]:
r =np.argmax(preds,axis=1)[:]+1

In [180]:
r

array([2, 3, 1, ..., 1, 2, 3])

In [181]:
np.savetxt("result.txt", r,fmt="%d")

In [186]:
train_preds = model.predict(train_X[:])

In [240]:
dd =np.argmax(train_preds,axis=1)[:20]+1

In [242]:
pd.Series(dd).value_counts(normalize=True)

2    0.45
3    0.35
1    0.20
dtype: float64

In [189]:
train_y.map(int)

6992     1
13919    3
16497    3
13380    3
12713    2
14253    3
6115     1
11491    2
9424     2
19664    3
10442    2
9779     2
2847     1
9388     2
10746    2
5206     1
9625     2
14833    3
2829     1
16004    3
4770     1
18400    3
17905    3
12405    2
7289     2
1062     1
17155    3
4574     1
19107    3
11444    2
        ..
10464    2
17342    3
12516    2
5383     1
6357     1
10304    2
16256    3
2486     1
4933     1
11548    2
14971    3
11549    2
15682    3
12299    2
10772    2
241      1
16998    3
2557     1
5151     1
8596     2
8813     2
12807    2
10826    2
18594    3
19590    3
12876    2
13991    3
4811     1
1626     1
15442    3
Name: COMMLEVEL, Length: 16000, dtype: int64

In [199]:
test_preds = model.predict(padded_test_dataset_sequences, batch_size=256)

In [201]:
np.argmax(test_preds,axis=1)[:20]+1

array([2, 3, 1, 2, 3, 1, 2, 2, 3, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2])

In [217]:
test_preds_w2v = model.predict(padded_test_dataset_sequences, batch_size=256)

In [221]:
w2v = np.argmax(test_preds_w2v,axis=1)[:]+1

In [222]:
len(w2v)

65499

In [223]:
np.savetxt("result-w2v-true.txt", w2v,fmt="%d")

In [257]:
w2v2 = np.argmax(test_preds_w2v2,axis=1)[:]+1

In [248]:
test_preds_w2v2 = model.predict(padded_test_dataset_sequences, batch_size=256)
w2v2 = np.argmax(test_preds_w2v2,axis=1)[:]+1

In [249]:
np.savetxt("result-w2v2-true.txt", w2v2,fmt="%d")

In [263]:
len(test_preds_w2v2)

65499

In [289]:
test_preds_w2v2.shape

(65499, 3)

In [290]:
rr= np.argmax(test_preds_w2v2,axis=0)+1
rr.shape

(3,)

In [291]:
rr= np.argmax(test_preds_w2v2,axis=1)+1
rr.shape

(65499,)

In [269]:
test_preds_w2v2

array([[5.1859379e-01, 2.0351185e-02, 4.6105501e-01],
       [4.7960702e-01, 2.3151167e-02, 4.9724177e-01],
       [9.8482740e-01, 5.6926869e-03, 9.4800284e-03],
       ...,
       [9.9976319e-01, 2.3535574e-04, 1.4494467e-06],
       [7.4019539e-01, 2.5365427e-01, 6.1502913e-03],
       [2.6803804e-07, 4.1422775e-04, 9.9958557e-01]], dtype=float32)

In [227]:
df_pred =pd.read_csv("result-w2v-true.txt",'r', header=None)

In [237]:
df_pred[0].value_counts(normalize=True)

2    0.495916
3    0.288783
1    0.215301
Name: 0, dtype: float64

In [239]:
train_y.value_counts(normalize=True)

3.0    0.350125
1.0    0.349750
2.0    0.300125
Name: COMMLEVEL, dtype: float64

In [254]:
df_pred =pd.read_csv("result-w2v-true.txt",'r', header=None)
df_pred[0].value_counts(normalize=True)

2    0.495916
3    0.288783
1    0.215301
Name: 0, dtype: float64

In [253]:
df_pred =pd.read_csv("result-w2v2-true.txt",'r', header=None)
df_pred[0].value_counts(normalize=True)

1    0.454343
3    0.320035
2    0.225622
Name: 0, dtype: float64

In [252]:
df_pred =pd.read_csv("./submissions/textcnn/dsjyycxds_preliminary.txt",'r', header=None)
df_pred[0].value_counts(normalize=True)

2    0.412006
1    0.302188
3    0.285806
Name: 0, dtype: float64

In [279]:
import numpy as np
a = np.array([[1, 5, 5, 2],
              [9, 6, 2, 8],
              [3, 7, 9, 1]])
a

array([[1, 5, 5, 2],
       [9, 6, 2, 8],
       [3, 7, 9, 1]])

In [282]:
print(np.argmax(a[:2], axis=1))

[1 0]


In [1]:
!ls

[34mBDCI2017-MingLue-master.zip.download[m[m  lr_char_ngram.csv
Emb+MLP.ipynb                         lr_char_ngram.pkl
Embedding+MLP.ipynb                   lr_word_ngram.csv
[34mHierarchical-Attention-Network-master[m[m lr_word_ngram.pkl
Notebook.ipynb                        lstm-text-classifier.ipynb
One-Hot+MLP.ipynb                     main.py
README.md                             [34mmodels[m[m
__init__.py                           requirements.txt
[34m__pycache__[m[m                           result-0.6468.txt
char-ngram-bag-of-words-0.67.txt      result-w2v-true.txt
[34mcheckpoints[m[m                           result-w2v2-true.txt
cnn_text_classifier.ipynb             [34msubmissions[m[m
config.py                             textcnn_model.png
[34mdata[m[m                                  tfidf_text_classifier.ipynb
dataset_inspur 01-45-27-966.csv       [34mtutorial[m[m
dataset_inspur.csv                    [34mutils[m[m
[34membedding

In [7]:
!tail ./data/Preliminary-texting.csv

165490,雨天游蓬莱，人很多，景色没有想象的好，
165491,周末去的人不多，10点一开门就去了，给闺女办了护照25元送了一个小挎包，里面有地图护照还有50元迷币，午餐有38、48的套餐，母女两人差不多够吃。4.5小时体验了8个职业，基本不用排队，小朋友很喜欢吵着还要再来。可能是是地下室，空气流通不好，我待了5个小时出来头疼。临出门墙上挂着孩子的照片，25元一张，本来不想要，无奈看着她的萌照不忍心不要啊，可能大部分家长都是这个心理吧。一大一下200多的门票，没有大人可玩的项目，就是一个陪同票，觉得有点贵了。没带充电宝，给孩子拍了好多照片，最后没电了，服务中心也没有相应的服务，要是大人的服务再做细点就好了，还有出门的照片要是送给小朋友们岂不是更好。一张照片的成本也就1-2元，门票都那么高了，还要说25元，真是有点圈钱了。 查看全部
165492,最喜欢这的水，太清凉了，还有这里啤酒是直接放到水沟里面的，，
165493,特别差。各种导游只是为了赚钱。跟孔子不沾半毛钱关系。尤其是路上的马车特别坑
165494,坐8号线森林公园南门站下直接到。里面有小山有湖有湿地，有专门的跑步道，
165495,性价比不高，国人参观国门还收这么高门票。
165496,要想真正体验青海湖之美，不论是骑车还是自驾，一定要好好沿着湖走，找到自己真正的心仪之地，停下来，才是青海湖的魅力所在。强烈建议不要去二郎剑，没什么可看的，票价天高，人满为患，根本到不了湖边，就一处乱石滩还全是垃圾。推荐去黑马河（最好晚上住一天），这里是离湖边最近的镇子，步行到湖边只需半个小时，人很少，还不要票，水面清澈。湖边会有一些当地人在卖手工艺品，也有牵着马和牦牛给你合影的，但因为不是旅游景点，他们都很和气，不会强买强卖。
165497,值得一去，栈道很有意思，坐索道上下的，但是中途还是要爬，觉得累
165498,主要还是青岛稍微有些冷，要不更好看
165499,真不好玩。很无聊的一个地方。人也不多。
