In [1]:
import sys
import pickle
import traceback
from pprint import pprint
import numpy as np
import pandas as pd
from gensim import corpora,models

from scipy.sparse import *
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import keras
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint, LearningRateScheduler
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop
from keras.layers import Activation

import jieba
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

Using TensorFlow backend.


In [23]:
# 加载训练集合
df_train_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')
# 加载测试集
df_test_dataset = pd.read_csv('./data/Preliminary-texting.csv', encoding='utf-8')
df_train_dataset = df_train_dataset[['COMMCONTENT', 'COMMLEVEL']]
df_test_dataset = df_test_dataset[['COMMCONTENT']]
df_train_dataset.shape[0]

20000

In [24]:
df_all_dataset = pd.concat([df_train_dataset, df_test_dataset], ignore_index=True)

In [80]:
df_all_dataset.tail()

Unnamed: 0,COMMCONTENT,COMMLEVEL,COMMCONTENT_SEG
85494,性价比不高，国人参观国门还收这么高门票。,,
85495,要想真正体验青海湖之美，不论是骑车还是自驾，一定要好好沿着湖走，找到自己真正的心仪之地，停下...,,
85496,值得一去，栈道很有意思，坐索道上下的，但是中途还是要爬，觉得累,,
85497,主要还是青岛稍微有些冷，要不更好看,,
85498,真不好玩。很无聊的一个地方。人也不多。,,


In [83]:
COMMCONTENT_SEG = []

for sent in df_all_dataset['COMMCONTENT']:

    # Extract Sentence
    sent = str(sent).strip()

    # 去除标点符号会导致 Accuracy 降低
    # sent = clean_str(sent)

    seg_list = jieba.cut(sent, cut_all=False)

    seg_list = [i for i in seg_list if i!=' ']
    
    COMMCONTENT_SEG.append(" ".join(seg_list))

In [84]:
df_all_dataset['COMMCONTENT_SEG'] = pd.DataFrame(COMMCONTENT_SEG,columns=['COMMCONTENT_SEG'])

In [85]:
df_all_dataset['COMMCONTENT_SEG'].tail()

85494                       性价比 不高 ， 国人 参观 国门 还收 这么 高 门票 。
85495    要 想 真正 体验 青海湖 之美 ， 不论是 骑车 还是 自驾 ， 一定 要 好好 沿着 湖...
85496    值得 一去 ， 栈道 很 有意思 ， 坐 索道 上下 的 ， 但是 中途 还是 要 爬 ， ...
85497                          主要 还是 青岛 稍微 有些 冷 ， 要 不 更好 看
85498                     真 不好玩 。 很 无聊 的 一个 地方 。 人 也 不 多 。
Name: COMMCONTENT_SEG, dtype: object

In [86]:
text_corpus = df_all_dataset['COMMCONTENT_SEG']

In [87]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # 传入我们词向量的字典

tokenizer.fit_on_texts(text_corpus.map(str)) # 传入我们的训练数据，得到训练数据中出现的词的字典

In [88]:
dataset_sequences = tokenizer.texts_to_sequences(text_corpus.map(str)) # 根据训练数据中出现的词的字典，将训练数据转换为sequences

In [89]:
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 83948 unique tokens.


In [90]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

W2V_DIR = BASE_DIR + '/embeddings/'

TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_SEQUENCE_LENGTH = 100

MAX_NUM_WORDS = 33950

MAX_NB_WORDS = 30000

EMBEDDING_DIM = 300

VALIDATION_SPLIT = 0.2

BATCH_SIZE = 32

In [91]:
vocab,vocab_freqs = build_vocab(df_dataset['COMMCONTENT_SEG'])

In [92]:
vocab_size = min(MAX_NB_WORDS, len(vocab_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(vocab_freqs.most_common(MAX_NB_WORDS))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v:k for k, v in word2index.items()}

In [8]:
word_index

NameError: name 'word_index' is not defined

In [93]:
padded_dataset_sequences = pad_sequences(dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH) # 限制每篇文章的长度

In [94]:
padded_dataset_sequences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  545,  129,   23,  141,   32,    4,  194, 1270,  573,
          1,  613,    1,  109,  363,   69, 4407,    1,  231,  193,   31,
        108,    4,    1,  728,   38,  473,  253,  450, 6739,  608,  302,
        716], dtype=int32)

In [95]:
# label one hot表示
labels = df_all_dataset['COMMLEVEL'].dropna().map(int)#.values.tolist()
labels = to_categorical(labels-1) 

In [40]:
print('Indexing word embeddings.')  
embeddings_index = {}
with open('./embeddings/sgns.weibo.word','r') as f:
    f = f.readlines()
    for i in f[:]:
        values = i.strip().split(' ')
#         print(values)
        word = str(values[0])
        embedding = np.asarray(values[1:],dtype='float')
        embeddings_index[word] = embedding
print('word embedding',len(embeddings_index))

Indexing word embeddings.
word embedding 195202


In [96]:
nb_words = min(MAX_NB_WORDS,len(word2index))
nb_words

30000

In [97]:
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

In [98]:
for word, i in word2index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [99]:
word_embedding_matrix[:10]

array([[ 0.27259 ,  0.244615,  0.032857, ..., -0.199684, -0.084092,
         0.060737],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.094386, -0.200944, -0.030828, ...,  0.003085,  0.023796,
        -0.201742],
       ...,
       [ 0.190794, -0.037967,  0.1013  , ..., -0.302136, -0.126407,
        -0.178464],
       [-0.230501, -0.152982,  0.207998, ...,  0.007232, -0.494047,
        -0.179105],
       [ 0.175443,  0.239842,  0.210521, ...,  0.071008,  0.177222,
        -0.062866]])

In [11]:
# dataset_sequences = tokenizer.texts_to_matrix(list(df_dataset['COMMCONTENT_SEG']), mode='binary')

In [12]:
# train_X,valid_X,train_y,valid_y =train_test_split(dataset_sequences, df_dataset['COMMLEVEL'], test_size=0.2)

In [100]:
train_X,valid_X,train_y,valid_y =train_test_split(padded_dataset_sequences[:df_train_dataset.shape[0]], 
                                                  df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]], 
                                                  test_size=0.2)

In [101]:
train_X.shape

(16000, 100)

In [47]:
train_X

array([[   0,    0,    0, ...,    3,   80,    4],
       [   0,    0,    0, ..., 2386,   40,  802],
       [   0,    0,    0, ...,   11,   92,  607],
       ...,
       [   0,    0,    0, ...,    4,    4,    4],
       [   0,    0,    0, ..., 3136,   38,    4],
       [   0,    0,    0, ..., 3615, 7980,    7]], dtype=int32)

In [None]:
# keras.preprocessing.text.one_hot(text, len(vocab), filters=',!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ', split=' ')

In [16]:
for i in valid_X[1]:
    if(i!=0):
        print(i)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [48]:
print('Found %s unique tokens.' % len(word_index))

Found 33864 unique tokens.


In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K

In [108]:
DROPOUT=0.25

In [109]:
MAX_SEQUENCE_LENGTH

100

In [124]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Embeddings layers
seq =  Embedding(input_dim = MAX_NB_WORDS+1, 
                         output_dim = EMBEDDING_DIM, 
                         weights=[word_embedding_matrix], 
                         input_length=MAX_SEQUENCE_LENGTH, 
                         trainable=True
                        )(sequence_input)

seq = TimeDistributed(Dense(EMBEDDING_DIM, activation='tanh'))(seq)

seq = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(seq)

seq = Dense(128, activation='tanh')(seq)

seq = Dropout(DROPOUT)(seq)

seq = BatchNormalization()(seq)

out = Dense(3, activation='softmax')(seq)

model = Model(inputs=sequence_input, outputs=out)

In [125]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 100, 300)          9000300   
_________________________________________________________________
time_distributed_6 (TimeDist (None, 100, 300)          90300     
_________________________________________________________________
lambda_6 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 128)               38528     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 128)               512       
__________

In [126]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [127]:
batch_size=256
epochs = 10

In [128]:
model.fit(x=train_X, y=to_categorical(train_y-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=batch_size, 
                    epochs=epochs,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x124eec240>

In [123]:
model.fit(x=train_X, y=to_categorical(train_y-1, num_classes=None), 
                    validation_data=(valid_X, to_categorical(valid_y-1, num_classes=None)[:]), 
                    batch_size=batch_size, 
                    epochs=epochs,
                    verbose=1
         )

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x125b23908>

In [40]:
train_X.shape

(16000, 30000)

In [75]:
df_test_dataset_seg = df_all_dataset['COMMCONTENT_SEG'][20000:]
test_dataset_sequences = tokenizer.texts_to_sequences(df_test_dataset_seg.map(str))

In [76]:
padded_test_dataset_sequences = pad_sequences(test_dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [77]:
padded_test_dataset_sequences

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       ...,
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2]], dtype=int32)

In [102]:
preds = model.predict(padded_dataset_sequences[df_train_dataset.shape[0]:])

In [103]:
padded_test_dataset_sequences

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       ...,
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 2]], dtype=int32)

In [104]:
result = np.argmax(preds,axis=1)+1

In [105]:
result

array([2, 1, 2, ..., 2, 3, 2])

In [107]:
pd.Series(result).v14alue_counts(normalize=True)

1    0.399685
2    0.327974
3    0.272340
dtype: float64

In [117]:
preds = model.predict(padded_dataset_sequences[df_train_dataset.shape[0]:])

In [118]:
result = np.argmax(preds,axis=1)+1
pd.Series(result).value_counts(normalize=True)

2    0.360723
1    0.322280
3    0.316997
dtype: float64