In [1]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "12"
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input,  CuDNNLSTM, Embedding, Dropout, Activation, Conv1D, CuDNNGRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import Callback     

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Label column names: every column of `train` after the first two.
columns = train.columns[2:].tolist()

In [4]:
from keras.utils.np_utils import to_categorical
from typing import List
def get_y(df:pd.DataFrame, cols:List[str]=None, label_offset:int=2, num_classes:int=4) -> List[np.ndarray]:
    """One-hot encode each label column of ``df``.

    Every value in a label column is shifted by ``label_offset`` to obtain a
    class index and then expanded to a one-hot row of width ``num_classes``.

    Args:
        df: Frame that contains the label columns.
        cols: Names of the label columns to encode. When omitted, the
            notebook-level ``columns`` list is looked up at call time, so
            re-running the cell that defines ``columns`` is picked up without
            having to re-run this definition.
        label_offset: Value added to the raw labels to get class indices.
        num_classes: Width of each one-hot row.

    Returns:
        One float32 array of shape ``(len(df), num_classes)`` per column, in
        the order of ``cols``.

    Raises:
        ValueError: If a shifted label falls outside ``[0, num_classes)``.
            Such values would otherwise be encoded as the wrong class
            (negative indices wrap around) instead of being reported.
    """
    if cols is None:
        cols = columns
    one_hot_rows = np.eye(num_classes, dtype="float32")
    y_list = []
    for col in cols:
        # Shift first, then cast to int, so integer-valued float columns work too.
        y = np.asarray(df[col].values + label_offset, dtype="int")
        if y.size and (y.min() < 0 or y.max() >= num_classes):
            raise ValueError(
                "column %r has labels outside [%d, %d]"
                % (col, -label_offset, num_classes - 1 - label_offset)
            )
        # Row k of the identity matrix is the one-hot vector for class k.
        y_list.append(one_hot_rows[y])
    return y_list

In [5]:
# One one-hot target array per label column, for the training frame.
y_train = get_y(df=train)

In [6]:
# One one-hot target array per label column, for the validation frame.
y_val = get_y(df=val)

In [7]:
# Text-pipeline settings used by the tokenizer, padding and embedding cells.
max_features = 50000  # Tokenizer num_words; also caps the embedding-matrix rows
maxlen = 200          # length every sequence is padded to
embed_size = 300      # width of each embedding vector

In [8]:
# Build the vocabulary from the train and test texts together.
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train) + list(X_test))

# Write the token sequences to new names instead of overwriting X_train /
# X_test: the raw texts stay available and the cell can be re-run without
# fitting the tokenizer on already-tokenized data.
train_sequences = tok.texts_to_sequences(X_train)
test_sequences = tok.texts_to_sequences(X_test)

# Fixed-length integer matrices consumed by the model.
x_train = pad_sequences(train_sequences, maxlen=maxlen)
x_test = pad_sequences(test_sequences, maxlen=maxlen)

In [9]:
# Parse the pretrained vectors into {word: float32 vector}.
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        # Split from the right and at most embed_size times, so a token that
        # itself contains spaces stays in values[0] instead of breaking the
        # float conversion.
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            # Not a "<token> <embed_size floats>" row (e.g. a "count dim"
            # header line or a truncated line): skip it rather than store a
            # vector of the wrong width.
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
if not embeddings_index:
    # Every row was skipped: most likely embed_size does not match the file.
    raise ValueError("no %d-dimensional vectors found in %s" % (embed_size, EMBEDDING_FILE))

word_index = tok.word_index
# Prepare the embedding matrix; row i holds the vector for tokenizer index i.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
from  tensorflow.keras.optimizers import Adam, RMSprop
from  tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from  tensorflow.keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Checkpoint location; the model-building functions reload the model from here.
file_path = "../ckpt/best_model_bigru_cnn_2.hdf5"

# Save the model whenever the validation loss reaches a new minimum.
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
# Stop after 10 epochs without a lower validation loss.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=10)
# Halve the learning rate after 2 epochs without improvement.
reduce_plateau = ReduceLROnPlateau(factor=0.5, patience=2, verbose=1)

In [13]:
def build_model_0(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0, classes=20):
    """Build, train and reload a BiGRU/BiLSTM + Conv1D multi-output classifier.

    A frozen pretrained embedding feeds two parallel recurrent branches
    (bidirectional CuDNNGRU and bidirectional CuDNNLSTM), each followed by one
    Conv1D layer. Global average and max pooling of both branches are
    concatenated into one feature vector that is shared by ``classes``
    separate 4-way softmax heads.

    Besides building the graph, this function trains the model on the
    notebook-level ``x_train`` / ``y_train`` (5% held out through
    ``validation_split``) using the notebook-level callbacks, and finally
    returns the model reloaded from ``file_path``.

    Args:
        lr: Adam learning rate. The 0.0 default is only a placeholder; pass a
            real value.
        lr_d: Adam ``decay`` argument.
        units: ``units`` argument of both recurrent layers. The 0 default is
            only a placeholder; pass a real value.
        dr: SpatialDropout1D rate applied to the embedded sequence.
        classes: Number of softmax output heads.

    Returns:
        The model loaded from ``file_path`` after training.
    """
    inp = Input(shape = (maxlen,))
    # Size the embedding table from the matrix itself. embedding_matrix has
    # min(max_features, vocabulary size + 1) rows, so hard-coding max_features
    # makes the weight shapes disagree whenever the vocabulary is smaller
    # than max_features.
    embedded = Embedding(embedding_matrix.shape[0], embed_size,
                         weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    gru_branch = Bidirectional(CuDNNGRU(units, return_sequences = True))(embedded)
    gru_branch = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_branch)

    lstm_branch = Bidirectional(CuDNNLSTM(units, return_sequences = True))(embedded)
    lstm_branch = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_branch)

    avg_pool1 = GlobalAveragePooling1D()(gru_branch)
    max_pool1 = GlobalMaxPooling1D()(gru_branch)

    avg_pool2 = GlobalAveragePooling1D()(lstm_branch)
    max_pool2 = GlobalMaxPooling1D()(lstm_branch)

    features = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])
    # One independent softmax head per label, all reading the same features.
    heads = [Dense(4, activation = "softmax")(features) for _ in range(classes)]

    model = Model(inputs = inp, outputs = heads)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    # NOTE(review): check_point is a shared module-level ModelCheckpoint. It
    # appears to keep its best-so-far value across fit() calls, so a second
    # training run in the same kernel may never overwrite the checkpoint --
    # confirm for the installed Keras version.
    model.fit(x_train, y_train, batch_size = 256, epochs = 100, validation_split = 0.05,
              verbose = 1, callbacks = [check_point, early_stop, reduce_plateau])
    # Return the checkpointed model rather than the last-epoch weights.
    model = load_model(file_path)
    return model
# Nothing is shared between the labels from the convolution layer onwards.
def build_model_2(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.4, classes=20):
    """Build, train and reload a classifier with a per-label conv/pool/softmax stack.

    A frozen pretrained embedding feeds one bidirectional CuDNNGRU and one
    bidirectional CuDNNLSTM; these two recurrent layers are shared. Each of
    the ``classes`` outputs then gets its own pair of Conv1D layers (one per
    recurrent branch), its own global average/max pooling and its own 4-way
    softmax layer.

    Besides building the graph, this function trains the model on the
    notebook-level ``x_train`` / ``y_train`` (5% held out through
    ``validation_split``) using the notebook-level callbacks, and finally
    returns the model reloaded from ``file_path``.

    Args:
        lr: Adam learning rate. The 0.0 default is only a placeholder; pass a
            real value.
        lr_d: Adam ``decay`` argument.
        units: ``units`` argument of both recurrent layers. The 0 default is
            only a placeholder; pass a real value.
        dr: SpatialDropout1D rate applied to the embedded sequence.
        classes: Number of per-label output stacks.

    Returns:
        The model loaded from ``file_path`` after training.
    """
    inp = Input(shape = (maxlen,))
    # Size the embedding table from the matrix itself. embedding_matrix has
    # min(max_features, vocabulary size + 1) rows, so hard-coding max_features
    # makes the weight shapes disagree whenever the vocabulary is smaller
    # than max_features.
    embedded = Embedding(embedding_matrix.shape[0], embed_size,
                         weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    gru_seq = Bidirectional(CuDNNGRU(units, return_sequences = True))(embedded)
    lstm_seq = Bidirectional(CuDNNLSTM(units, return_sequences = True))(embedded)

    heads = []
    for _ in range(classes):
        # New Conv1D layers on every iteration: no weights shared across labels.
        gru_conv = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_seq)
        lstm_conv = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_seq)

        avg_pool1 = GlobalAveragePooling1D()(gru_conv)
        max_pool1 = GlobalMaxPooling1D()(gru_conv)

        avg_pool2 = GlobalAveragePooling1D()(lstm_conv)
        max_pool2 = GlobalMaxPooling1D()(lstm_conv)

        features = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])
        heads.append(Dense(4, activation = "softmax")(features))

    model = Model(inputs = inp, outputs = heads)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    # NOTE(review): check_point is a shared module-level ModelCheckpoint. It
    # appears to keep its best-so-far value across fit() calls, so a second
    # training run in the same kernel may never overwrite the checkpoint --
    # confirm for the installed Keras version.
    model.fit(x_train, y_train, batch_size = 256, epochs = 100, validation_split = 0.05,
              verbose = 1, callbacks = [check_point, early_stop, reduce_plateau])
    # Return the checkpointed model rather than the last-epoch weights.
    model = load_model(file_path)
    return model


In [14]:
# Train the variant with a separate conv/pool stack per label.
model = build_model_2(
    lr=1e-3,
    lr_d=0,
    units=128,
    dr=0.2,
)

Train on 99750 samples, validate on 5250 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 13.84357, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 13.84357 to 11.86504, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 11.86504 to 11.06667, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 4/100



Epoch 00004: val_loss improved from 11.06667 to 10.73674, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 10.73674 to 10.52870, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 10.52870 to 10.43381, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 10.43381 to 10.28558, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 8/100



Epoch 00008: val_loss improved from 10.28558 to 10.25694, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 9/100

Epoch 00009: val_loss improved from 10.25694 to 10.20689, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 10/100

Epoch 00010: val_loss improved from 10.20689 to 10.20222, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 11/100

Epoch 00011: val_loss improved from 10.20222 to 10.17509, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 12/100



Epoch 00012: val_loss did not improve from 10.17509
Epoch 13/100

Epoch 00013: val_loss did not improve from 10.17509

Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 14/100

Epoch 00014: val_loss improved from 10.17509 to 10.14208, saving model to ../ckpt/best_model_bigru_cnn_2.hdf5
Epoch 15/100

Epoch 00015: val_loss did not improve from 10.14208
Epoch 16/100



Epoch 00016: val_loss did not improve from 10.14208

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 17/100

Epoch 00017: val_loss did not improve from 10.14208
Epoch 18/100

Epoch 00018: val_loss did not improve from 10.14208

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 19/100

Epoch 00019: val_loss did not improve from 10.14208
Epoch 20/100



Epoch 00020: val_loss did not improve from 10.14208

Epoch 00020: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 21/100

Epoch 00021: val_loss did not improve from 10.14208
Epoch 22/100

Epoch 00022: val_loss did not improve from 10.14208

Epoch 00022: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 23/100

Epoch 00023: val_loss did not improve from 10.14208
Epoch 24/100



Epoch 00024: val_loss did not improve from 10.14208

Epoch 00024: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.


In [16]:
# Tokenize and pad the validation texts into new names. Rebinding x_val to
# its own tokenized form would make this cell non-idempotent: on a second run
# the tokenizer would be fed token ids instead of text.
x_val_sequences = tok.texts_to_sequences(x_val)
x_val_padded = pad_sequences(x_val_sequences, maxlen=maxlen)

# The model has one output per label, so each prediction is a list of arrays.
test_pred = model.predict(x_test)
val_pred = model.predict(x_val_padded)

In [17]:
from sklearn.metrics import f1_score
# Macro-F1 per label column on the validation set.
f1_list = []
for pred, true in zip(val_pred, y_val):
    # f1_score expects (y_true, y_pred). With the arguments swapped, a class
    # the model never predicts is reported as having "no true samples".
    F1 = f1_score(np.argmax(true, axis=1), np.argmax(pred, axis=1), average='macro')
    print(F1)
    f1_list.append(F1)
    

0.5153724755221121
0.4320045948252489
0.5753894453335282
0.48944740648228197
0.6753658018472426
0.5935173098040541
0.5982643124291033
0.6418858567136693
0.5437121954143447
0.5164455002615214
0.5871087837156324
0.6275878989205224
0.6274261156695615
0.6322647808780173
0.5953753005399044
0.6395808698064487
0.3957022440424496
0.6585192386249579
0.5435483732746309
0.6634370845725781


  'recall', 'true', average, warn_for)


In [18]:
# Average of the per-label macro-F1 scores.
np.asarray(f1_list).mean()

0.5775977794338905

In [19]:
def load_data_from_csv(file_name, header=0, encoding="utf-8"):
    """Read a CSV file into a DataFrame.

    Args:
        file_name: Path of the CSV file, forwarded to ``pd.read_csv``.
        header: Forwarded to ``pd.read_csv`` as ``header``.
        encoding: Forwarded to ``pd.read_csv`` as ``encoding``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_name, header=header, encoding=encoding)

In [20]:
# Load the test frame and write the predicted label into each label column.
test = load_data_from_csv("../inputs/sentiment_analysis_testa.csv")
for column, pred in zip(columns, test_pred):
    # argmax gives the class index; subtracting 2 undoes the +2 shift applied
    # when the targets were encoded.
    test[column] = np.argmax(pred, axis=1) - 2

In [21]:
# Preview only the first rows instead of rendering the whole frame.
test.head(10)

Unnamed: 0,id,content,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,...,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
0,0,"""我想说他们家的优惠活动好持久啊，我预售的时候买的券，前两天心血来潮去吃的活动还在继续\n首...",-2,-2,-2,-2,1,-2,-2,0,...,-2,-2,-2,1,1,1,-2,-2,1,1
1,1,"""终于开到心心念念的LAB loft。第一次来就随便点也一些～【香辣虾意面】蛮辣的，但其实一...",-2,-2,-2,-2,-2,-2,-2,0,...,-2,-2,-2,-2,-1,1,-2,-2,1,1
2,2,"""地理位置好，交通方便，就在124车站对面交通方便，很好，我晚上7点多去买的了，已经没有什么...",1,1,1,-2,-1,-2,-2,0,...,-2,-2,-2,-2,-2,1,-2,-2,0,-2
3,3,"""运气很好，抽中了大众点评的霸王餐。这家主题餐厅心仪已久了，种种原因一直未能成行，没想到抽中...",1,1,-2,-2,1,-2,-2,-2,...,1,-2,-2,-2,-2,1,-2,1,1,1
4,4,"""幸运随点评团体验霸王餐，心情好~蜀九香刚进驻泉州不久，招牌大名气响，以至于刚到店门口的我被...",-2,-2,-2,-2,1,-2,-2,-2,...,1,-2,-2,-2,-2,1,-2,-2,1,-2
5,5,"""尽管韩国烤肉店在无锡已经有很多家了，但因为味道好吃再加上在无锡很有人气，依旧挡不住新店的开...",-2,-2,-2,-2,1,-2,-2,-1,...,-2,-2,-2,-2,0,1,-2,1,1,-2
6,6,"""店铺在乙烯生活二区的西北角，旁边是国旅的旗舰店，门口就是红绿灯，挺好找的。因为这是两磅一的...",-2,-2,1,-2,1,-2,-2,1,...,-2,-2,-2,-2,1,1,-2,-2,1,-2
7,7,"""朋友聚会来滴，这个地方真心不错哇，又能撸串，又能唱歌，还能看现场歌手演唱。最主要是可以上台...",-2,-2,-2,-2,-2,-2,-2,-2,...,1,1,-2,-2,-2,1,-2,-2,1,1
8,8,"""超喜欢这家的面食，经常来吃。这家风格及产品都很像京城御面，有时会怀疑是不是同一家的。面条比...",-2,-2,-2,-2,-2,-2,-2,0,...,-2,-2,-2,-2,1,0,-2,-2,0,-2
9,9,"""广西阳朔，前一天晚上吃的特色啤酒鱼，就看到了隔壁有家螺蛳粉，第二天过来吃。个人比较喜欢粉，...",-2,-2,1,-2,-2,1,1,0,...,-2,-2,-2,1,-2,0,-2,-2,1,-2


In [22]:
# Write the filled-in test frame to disk without the index column.
test.to_csv(
    "../output/bigru-cnn-pooling3.csv",
    index=False,
    encoding="utf_8_sig",
)