In [1]:
# classifier for automatic evaluation (sentiment / category)
# note: the length condition does not need this classifier

In [1]:
#!-*- encoding=utf-8 -*-
from keras.models import Sequential, Model
from keras.layers import *
from keras.optimizers import *
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, Callback
import keras.backend as K
from keras import metrics

import numpy as np
import random
import sys
import os
import json

# `tf` is not imported anywhere in the import list above; it could only
# resolve if one of the `from keras... import *` lines happened to leak it.
# Import it explicitly so this cell does not depend on that side effect.
import tensorflow as tf

# Restrict this process to the GPU with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): `sess` is never handed to Keras in the visible code (no
# K.set_session call) — confirm that creating it here is all that is intended.
sess = tf.Session(config=config)

# ---- hyper-parameters ----
# max_len: fixed token length of every model input (also the padding width).
# max_vocab: vocabulary cap; not referenced in the visible code.
max_len, max_vocab = 15, 10000

# Number of samples per mini-batch produced by the generators.
batch_size = 256

# Embedding width, number of Conv1D filters, and Conv1D kernel width.
emb_size, cnn_filter, cnn_kernel = 100, 400, 3

# Reserved token ids. `oov_token` replaces unknown tokens, and `start_token` /
# `end_token` are added by str2id(start_end=True). `pad_token` is not
# referenced in the visible code (padding() uses a literal 0).
pad_token, oov_token, start_token, end_token = 0, 1, 2, 3

# Train and validation files; swap these paths to use another dataset.
# The `.1` files are read with label 1 and the `.0` files with label 0.
train_pos_path, train_neg_path = '../dataset/yelp.train.1', '../dataset/yelp.train.0'
val_pos_path, val_neg_path = '../dataset/yelp.dev.1', '../dataset/yelp.dev.0'


def _load_labeled(path, label):
    """Read one sentence per line from `path` and attach `label` to each.

    Every line is stripped, lower-cased and split on single spaces.

    Args:
        path: UTF-8 text file with one sentence per line.
        label: value stored next to every sentence of this file.

    Returns:
        A list of `[tokens, label]` pairs in file order.
    """
    samples = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            samples.append([line.strip().lower().split(' '), label])
    return samples


# Positive samples (label 1) come first, then negative samples (label 0),
# exactly as when the four files were read by separate loops.
train = _load_labeled(train_pos_path, 1) + _load_labeled(train_neg_path, 0)
val = _load_labeled(val_pos_path, 1) + _load_labeled(val_neg_path, 0)

print('sequences:', len(train))
sys.stdout.flush()

# load vocab file
vocab_path = 'yelp-vocab.json'
if not os.path.exists(vocab_path):
    # Without the vocabulary nothing below can run; fail here with a clear
    # message instead of a NameError on `char2id` further down.
    raise FileNotFoundError(
        "vocabulary file %r not found; build it before running this notebook"
        % vocab_path)

# The JSON file holds a 3-item list: chars, id2char, char2id.
with open(vocab_path, 'r', encoding='utf-8') as f:
    chars, id2char, char2id = json.load(f)
# JSON object keys are always strings, so restore the integer ids.
id2char = {int(i): j for i, j in id2char.items()}

print('vocab size:', len(char2id))
sys.stdout.flush()

def str2id(s, start_end = False):
    """Map each item of `s` to its id in `char2id`.

    Items missing from `char2id` become `oov_token`. When `start_end` is
    true the result is wrapped with `start_token` and `end_token`.

    Args:
        s: iterable of tokens.
        start_end: whether to add the start/end marker ids.

    Returns:
        A new list of integer ids.
    """
    token_ids = []
    for tok in s:
        token_ids.append(char2id.get(tok, oov_token))

    if not start_end:
        return token_ids
    return [start_token] + token_ids + [end_token]

def padding(x, ml=None, pad_value=0):
    """Right-pad (and, if needed, truncate) id sequences to a fixed length.

    Sequences shorter than `ml` are extended with `pad_value`. Sequences
    longer than `ml` are cut to their first `ml` ids; without this the rows
    would have different lengths and could not form the rectangular array
    that the fixed-width model input requires.

    Args:
        x: list of id sequences.
        ml: target length; defaults to the module-level `max_len`.
        pad_value: id used for padding; defaults to 0.

    Returns:
        A numpy array with one row of exactly `ml` ids per input sequence.
    """
    if ml is None:
        ml = max_len
    # `[pad_value] * n` is empty for n <= 0, so truncated rows get no padding.
    rows = [list(i[:ml]) + [pad_value] * (ml - len(i)) for i in x]
    return np.array(rows)

def train_generator(data):
    """Endlessly yield padded mini-batches built from `data`.

    `data` is shuffled in place at the start of every pass. Each item is read
    as `item[0]` (tokens) and `item[1]` (label). A batch is emitted as soon as
    `batch_size` samples have been collected; samples left over at the end of
    a pass stay in the buffer and open the first batch of the next pass.

    Args:
        data: mutable sequence of `[tokens, label]` items.

    Yields:
        `([ids, labels], None)` where `ids` is the padded id array and
        `labels` is the array of the matching labels.
    """
    batch_ids, batch_labels = [], []

    while True:
        np.random.shuffle(data)
        for sample in data:
            tokens, label = sample[0], sample[1]
            batch_ids.append(str2id(tokens, start_end=False))
            batch_labels.append(label)

            # Keep collecting until the batch is full.
            if len(batch_ids) != batch_size:
                continue

            padded = padding(batch_ids)
            yield [np.array(padded), np.array(batch_labels)], None
            batch_ids, batch_labels = [], []

def get_batch_num(data):
    """Return how many complete batches of `batch_size` fit into `data`.

    Any remainder smaller than one batch is not counted.
    """
    full_batches, _ = divmod(len(data), batch_size)
    return full_batches

# Number of complete batches in one pass over each split.
train_bs_num, val_bs_num = get_batch_num(train), get_batch_num(val)

# Endless batch generators. The validation split goes through the same
# generator as the training split, so it is shuffled on every pass as well.
train_gen, val_gen = train_generator(train), train_generator(val)

print(train_bs_num, val_bs_num)
sys.stdout.flush()
    

Using TensorFlow backend.
  return f(*args, **kwds)


sequences: 444101
vocab size: 8904
1734 247


In [2]:
# Model architecture of classifier:
# token ids -> embedding -> 1-D convolution (ReLU) -> global max pooling
# -> one sigmoid unit.
x_in = Input(shape=(max_len, ))

# Feature extractor, applied in list order.
feature_layers = [
    Embedding(len(char2id), emb_size),
    Conv1D(cnn_filter, cnn_kernel, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
]
x = x_in
for layer in feature_layers:
    x = layer(x)

# Single sigmoid output for the binary label.
x_out = Dense(1, activation='sigmoid')(x)

cls = Model(x_in, x_out)
cls.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 15, 100)           890400    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 13, 400)           120400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 1,011,201
Trainable params: 1,011,201
Non-trainable params: 0
_________________________________________________________________


In [3]:
# define a Model to train the classifier
# The label is fed as a second model input so the loss can be attached with
# add_loss(); that is why compile() below receives no `loss` argument.
y_true = Input(shape=(1, ))
x_in = Input(shape=(max_len, ))
y_pred = cls(x_in)

# Binary cross-entropy averaged over the batch.
ce_loss = K.mean(K.binary_crossentropy(y_true, y_pred))
cls_train = Model([x_in, y_true], y_pred)
cls_train.add_loss(ce_loss)

# Batch accuracy. Taking the mean (instead of sum / batch_size) gives the same
# value for full batches and stays correct for a batch of any other size.
acc = K.mean(metrics.binary_accuracy(y_true, y_pred))

# NOTE(review): the second positional argument of Adam is presumably beta_1
# in Keras 2.x — confirm against the installed version. The training cell
# overrides this learning rate with K.set_value.
cls_train.compile(optimizer=Adam(5e-4, 0.5))

# Register the accuracy tensor as an extra reported metric. This is done after
# compile(); presumably compile() rebuilds these lists — confirm for the
# installed Keras version.
cls_train.metrics_names.append('acc')
cls_train.metrics_tensors.append(acc)
cls_train.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 15)                0         
_________________________________________________________________
model_1 (Model)              (None, 1)                 1011201   
Total params: 1,011,201
Trainable params: 1,011,201
Non-trainable params: 0
_________________________________________________________________


  # Remove the CWD from sys.path while we load stuff.


In [4]:
# train classifier
iters_per_sample = train_bs_num  # not referenced in the visible code
total_iter = train_bs_num * 10   # ten passes over the training batches

best_val = 100000.0  # lowest validation loss seen so far
best_result = []     # not referenced in the visible code

# Make sure the checkpoint directory exists before the first save.
weights_path = 'pretrain/yelp/cls.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# The learning rate is the same for every step, so set it once up front
# instead of on every iteration.
K.set_value(cls_train.optimizer.lr, 1e-4)

for i in range(total_iter):
    text, label = next(train_gen)[0]
    # The loss is attached to the model itself, hence no separate targets.
    loss = cls_train.train_on_batch(
        [text, label], None)

    if i % 50 == 0:
        print ('iter: %s, loss: %s' % (i, loss))
        sys.stdout.flush()

    # Validate at the first step of every pass, and once more after the very
    # last step so the weights of the final pass are evaluated as well.
    if i % train_bs_num == 0 or i == total_iter - 1:
        val_loss = cls_train.evaluate_generator(val_gen, steps=val_bs_num)
        print('val loss,', val_loss)
        sys.stdout.flush()
        # val_loss[0] is the loss; keep the weights with the lowest value.
        if val_loss[0] <= best_val:
            best_val = val_loss[0]
            print('saving weights with best val:', val_loss)
            sys.stdout.flush()
            cls.save_weights(weights_path)


iter: 0, loss: [0.6959983, 0.3984375]
val loss, [0.694147700964198, 0.4563986589068826]
saving weights with best val: [0.694147700964198, 0.4563986589068826]
iter: 50, loss: [0.6654447, 0.56640625]
iter: 100, loss: [0.64249057, 0.57421875]
iter: 150, loss: [0.5790658, 0.7265625]
iter: 200, loss: [0.4871399, 0.82421875]
iter: 250, loss: [0.40324253, 0.87109375]
iter: 300, loss: [0.35704044, 0.86328125]
iter: 350, loss: [0.24630098, 0.9296875]
iter: 400, loss: [0.22908537, 0.921875]
iter: 450, loss: [0.21124913, 0.90625]
iter: 500, loss: [0.17588261, 0.953125]
iter: 550, loss: [0.168956, 0.94140625]
iter: 600, loss: [0.13130724, 0.95703125]
iter: 650, loss: [0.12646672, 0.953125]
iter: 700, loss: [0.11043687, 0.96875]
iter: 750, loss: [0.16374522, 0.9453125]
iter: 800, loss: [0.15667908, 0.94140625]
iter: 850, loss: [0.10772397, 0.96875]
iter: 900, loss: [0.15970168, 0.94140625]
iter: 950, loss: [0.108915284, 0.96875]
iter: 1000, loss: [0.09980579, 0.9765625]
iter: 1050, loss: [0.0972269

iter: 8850, loss: [0.0551171, 0.98046875]
iter: 8900, loss: [0.017857458, 1.0]
iter: 8950, loss: [0.05244284, 0.98046875]
iter: 9000, loss: [0.061896898, 0.98828125]
iter: 9050, loss: [0.029807704, 0.98828125]
iter: 9100, loss: [0.03438906, 0.99609375]
iter: 9150, loss: [0.042761177, 0.984375]
iter: 9200, loss: [0.043924175, 0.98046875]
iter: 9250, loss: [0.13694458, 0.953125]
iter: 9300, loss: [0.037539743, 0.9921875]
iter: 9350, loss: [0.062118966, 0.98046875]
iter: 9400, loss: [0.030923488, 0.98828125]
iter: 9450, loss: [0.047887776, 0.9765625]
iter: 9500, loss: [0.058277555, 0.9765625]
iter: 9550, loss: [0.04712882, 0.98828125]
iter: 9600, loss: [0.083666965, 0.9765625]
iter: 9650, loss: [0.09757673, 0.98046875]
iter: 9700, loss: [0.06122382, 0.98828125]
iter: 9750, loss: [0.086851284, 0.96875]
iter: 9800, loss: [0.061638728, 0.98046875]
iter: 9850, loss: [0.032365676, 0.98828125]
iter: 9900, loss: [0.058945823, 0.984375]
iter: 9950, loss: [0.034353685, 0.98828125]
iter: 10000, los