In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential


from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM,SimpleRNN
from keras.layers import Activation

from keras.callbacks import Callback
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.models import load_model

from keras.utils.vis_utils import plot_model

from keras.utils.np_utils import to_categorical

import jieba
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Import project-local helper functions
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

In [None]:
# --- Filesystem layout -------------------------------------------------
# NOTE(review): absolute, machine-specific root; consider making it relative.
BASE_DIR = '/Users/tsw/ScenicSpotReviews'
W2V_DIR = '{}/embeddings/'.format(BASE_DIR)
TEXT_DATA_DIR = '{}/data/'.format(BASE_DIR)

# --- Text / vocabulary limits -------------------------------------------
# Length every token sequence is padded/truncated to.
MAX_SEQUENCE_LENGTH = 80
MAX_NUM_WORDS = 33950
# Vocabulary cap handed to the tokenizer and the word->index mapping.
MAX_NB_WORDS = 20000
# Width of each word vector in the embedding matrix.
EMBEDDING_DIM = 300

# --- Training knobs -----------------------------------------------------
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 32

In [None]:
# Load the labelled review dataset; later cells read its COMMCONTENT
# (review text) and COMMLEVEL (label) columns.
df_dataset = pd.read_csv(
    './data/training-inspur.csv',
    encoding='utf-8',
)

In [None]:
# Segment every review with jieba and store the result as a space-joined
# string in df_dataset['COMMCONTENT_SEG'], then drop rows with no tokens.
COMMCONTENT_SEG = []

for sent in df_dataset['COMMCONTENT']:
    # Missing reviews must become the empty string; str(NaN) would otherwise
    # yield the literal token "nan" and the row would survive the filter below.
    sent = '' if pd.isna(sent) else str(sent).strip()

    # Precise-mode segmentation; whitespace-only tokens carry no content.
    tokens = [tok for tok in jieba.cut(sent, cut_all=False) if tok.strip()]

    COMMCONTENT_SEG.append(" ".join(tokens))

# Assign the plain list (positional, one entry per row) instead of wrapping it
# in a DataFrame, which relies on index alignment.
df_dataset['COMMCONTENT_SEG'] = COMMCONTENT_SEG
df_dataset = df_dataset[df_dataset['COMMCONTENT_SEG'] != ""]
# drop=True: do not keep the old index as an extra 'index' column, which also
# made re-running this cell fail because the column already existed.
df_dataset = df_dataset.reset_index(drop=True)

In [None]:
# Build the vocabulary from the segmented reviews using the project helper.
# presumably vocab_freqs is a collections.Counter-like object: a later cell
# calls vocab_freqs.most_common(...) and len(vocab_freqs) — verify in utils.data_utils.
vocab,vocab_freqs = build_vocab(df_dataset['COMMCONTENT_SEG'])

In [None]:
# Vocabulary size: the capped word count plus the two reserved PAD/UNK slots.
vocab_size = 2 + min(MAX_NB_WORDS, len(vocab_freqs))

# Most frequent words get ids starting at 2; ids 0 and 1 are reserved.
word2index = {}
for rank, entry in enumerate(vocab_freqs.most_common(MAX_NB_WORDS), start=2):
    word2index[entry[0]] = rank
word2index["PAD"] = 0
word2index["UNK"] = 1

# Reverse lookup: id -> word.
index2word = {idx: token for token, idx in word2index.items()}

In [None]:
# Fit a Keras tokenizer (capped at MAX_NB_WORDS) on the segmented reviews.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
)
tokenizer.fit_on_texts(texts=df_dataset['COMMCONTENT_SEG'])

In [None]:
# Convert each segmented review into a list of integer token ids.
dataset_sequences = tokenizer.texts_to_sequences(
    texts=df_dataset['COMMCONTENT_SEG'],
)

In [None]:
# Pad/truncate every id sequence to exactly MAX_SEQUENCE_LENGTH entries.
padded_dataset_sequences = pad_sequences(
    sequences=dataset_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [None]:
# Sanity check: display the first padded id sequence.
padded_dataset_sequences[0]

In [None]:
# Row budget for the embedding matrix: the mapping size, capped at MAX_NB_WORDS.
nb_words = MAX_NB_WORDS if MAX_NB_WORDS <= len(word2index) else len(word2index)
nb_words

In [None]:
# Load pretrained word vectors into a {word: vector} dict.
print('Indexing word embeddings.')
embeddings_index = {}
# Explicit UTF-8: the vectors are keyed by Chinese words, so relying on the
# platform default encoding is not portable.
with open('./embeddings/sgns.weibo.word', 'r', encoding='utf-8') as f:
    # Stream the file line by line instead of materialising it with readlines()
    # (which also rebound the file-handle name to a list).
    for line in f:
        values = line.strip().split(' ')
        # Skip the "<vocab_size> <dim>" header and any malformed row; a short
        # vector would otherwise be broadcast across a whole embedding row.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = str(values[0])
        embeddings_index[word] = np.asarray(values[1:], dtype='float')
print('word embedding', len(embeddings_index))

In [None]:
# Zero-initialised embedding table: one row per word id (plus one spare row),
# EMBEDDING_DIM columns. Rows without a pretrained vector stay all-zero.
word_embedding_matrix = np.zeros(
    shape=(nb_words + 1, EMBEDDING_DIM),
)

In [None]:
# Copy pretrained vectors into the embedding table.
#
# The rows must be addressed with the SAME ids the model sees at its input.
# Those ids come from tokenizer.texts_to_sequences, i.e. tokenizer.word_index
# (1-based, by descending frequency). The hand-built word2index mapping uses a
# different numbering (offset by 2 with PAD/UNK reserved), so indexing by it
# would place every pretrained vector on the wrong row.
for word, i in tokenizer.word_index.items():
    # Ignore ids that do not have a row in the table.
    if i >= word_embedding_matrix.shape[0]:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    # Only accept full-width vectors; a shorter array (e.g. a parsed header
    # line) would be silently broadcast across the entire row.
    if embedding_vector is not None and embedding_vector.shape == (word_embedding_matrix.shape[1],):
        word_embedding_matrix[i] = embedding_vector

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out VALIDATION_SPLIT of the rows for validation. A fixed random_state
# makes the split (and everything trained on it) reproducible across reruns.
train_X, valid_X, train_y, valid_y = train_test_split(
    padded_dataset_sequences,
    df_dataset['COMMLEVEL'],
    test_size=VALIDATION_SPLIT,
    random_state=42,
)

In [None]:
# Frozen embedding layer initialised from the pretrained matrix; id 0 is
# treated as padding and masked out for downstream layers.
pre_embedding_layer = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[word_embedding_matrix],
    mask_zero=True,
    trainable=False,
)

In [None]:
# Trainable embedding layer of the same shape, with no pretrained weights
# supplied; id 0 is masked as padding.
embedding_layer = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    mask_zero=True,
    trainable=True,
)

In [None]:
# Assemble the classifier: frozen pretrained embeddings -> LSTM -> small tanh
# layer -> softmax over the label classes.
print('Build model...')
model = Sequential()
model.add(pre_embedding_layer)
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))  # try using a GRU instead, for fun
model.add(Dense(3))
model.add(Activation('tanh'))
# The output width is fixed at 3 to match the num_classes=3 one-hot targets
# used when fitting. Deriving it from np.unique(valid_y) made the architecture
# depend on which labels happened to land in the random validation split.
model.add(Dense(3, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
batch_size = 256
epochs = 10

model.fit(x=train_X, y=to_categorical(train_y-1, num_classes=3), 
                    validation_data=(valid_X, to_categorical(valid_y-1, num_classes=3)[:]), 
                    batch_size=batch_size, 
                    epochs=epochs,
                    verbose=1
         )

In [None]:
# Remove the `model` name from the notebook namespace; later cells that
# reference `model` must rebuild it first.
del model

In [1]:
# Display the validation labels. NOTE(review): this only works after the
# train/validation split cell has been run in the current kernel session.
valid_y

NameError: name 'valid_y' is not defined