Sentiment analysis on [z17176 dataset](https://github.com/z17176/Chinese_conversation_sentiment).

This dataset was used in the following research.  They have built a 3m corpus for the research but only released the 30k dataset.

* [1]L. Zhang and C. Chen, “Sentiment Classification with Convolutional Neural Networks: An Experimental Study on a Large-Scale Chinese Conversation Corpus,” in 2016 12th International Conference on Computational Intelligence and Security (CIS), 2016, pp. 165–169. http://ieeexplore.ieee.org/abstract/document/7820437/

In [1]:
path = "data/conversation_sentiment"

In [2]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import os, math, re, pickle
#import jieba
from keras.models import Model, Sequential
from keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D, BatchNormalization, Dropout

#jieba.set_dictionary("data/dict.txt.big")

Using TensorFlow backend.
  return f(*args, **kwds)


# Setup

In [3]:
_train = None
_valid = None

def load_train_valid():
    global _train, _valid
    if _train is None:
        _train = pd.read_csv(os.path.join(path, "sentiment_XS_30k.txt"))
    if _valid is None:
        _valid = pd.read_csv(os.path.join(path, "sentiment_XS_test.txt"))
    return _train, _valid

Load word embedding dictionary.

In [4]:
dictionary_path = os.path.join(path, "dictionary.pkl")

def create_dictionary(*data):
    phrases = {}
    for d in data:
        for sentence in d:
            for ph in sentence.split(" "):
                phrases[ph] = True
    with open(os.path.join(path, "dictionary.txt"), "w") as fh:
        fh.writelines([ ph + "\n" for ph in phrases.keys() ])
    !cd $path; ../../../bin/fasttext print-word-vectors models/wiki.zh.bin < dictionary.txt > dictionary.vec
    dictionary = pd.read_csv(os.path.join(path, "dictionary.vec"), 
                             delim_whitespace=True, engine="python", header=None, index_col=0)
    with open(dictionary_path, "wb") as fh:
        pickle.dump([{ ph: i for i, ph in enumerate(dictionary.index) }, dictionary], fh)

def load_dictionary():
    with open(dictionary_path, "rb") as fh:
        [ dict_index, dictionary ] = pickle.load(fh)
        return dict_index, dictionary
    
if not os.path.exists(dictionary_path):
    train, valid = load_train_valid()
    create_dictionary(train.text, valid.text)

dict_index, dictionary = load_dictionary()
phrases_n = len(dictionary)
latent_n = len(dictionary.columns)

Encode lables and embed phrases.

In [5]:
# phrase-length (min, max, mean, std) = (1, 23, 4.7941782325330093, 2.0175720386692686)
input_length = 8

data_path = os.path.join(path, "data.pkl")

if not os.path.exists(data_path):
    def get_label(df):
        labels = df["labels"].values
        labels[labels == "positive"] = 1
        labels[labels == "negative"] = 0
        return labels

    def get_text(df):
        texts = np.zeros((len(df), input_length))
        for i, text in enumerate(df.text.values):
            for j, ph in enumerate(text.split(" ")[:input_length]):
                if ph in dict_index:
                    texts[i, j] = dict_index[ph]
        return texts
    
    train, valid = load_train_valid()
    train_x, train_y = get_text(train), get_label(train)
    valid_x, valid_y = get_text(valid), get_label(valid)
    
    with open(data_path, "wb") as fh:
        pickle.dump([(train_x, train_y), (valid_x, valid_y)], fh)
else:
    with open(data_path, "rb") as fh:
        [(train_x, train_y), (valid_x, valid_y)] = pickle.load(fh)

# Simple CNN

In [18]:
def simple_cnn_model():
    model = Sequential()
    model.add(Embedding(phrases_n, latent_n, input_length=input_length, weights=[dictionary]))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 3, border_mode="same", activation="relu"))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(100, activation="relu"))
    model.add(Dropout(0.7))
    model.add(Dense(1, activation="sigmoid"))
    return model
    
simple_cnn = simple_cnn_model()
simple_cnn.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])
simple_cnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 8, 300)        6644400     embedding_input_3[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 8, 300)        0           embedding_3[0][0]                
____________________________________________________________________________________________________
convolution1d_3 (Convolution1D)  (None, 8, 64)         57664       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 8, 64)         0           convolution1d_3[0][0]            
___________________________________________________________________________________________

In [19]:
def train_simple_cnn(lr=None, epoch=1):
    if lr is not None:
        simple_cnn.optimizer.lr = lr
    simple_cnn.fit(train_x, train_y, nb_epoch=epoch, validation_data=(valid_x, valid_y))
    
train_simple_cnn(1e-4)
train_simple_cnn(1e-1, 2)
train_simple_cnn(1e-2, 8)
train_simple_cnn(1e-3, 8)

Train on 29613 samples, validate on 11562 samples
Epoch 1/1
Train on 29613 samples, validate on 11562 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
 4480/29613 [===>..........................] - ETA: 106s - loss: 0.6893 - acc: 0.5339

KeyboardInterrupt: 

# Evaluation

In [8]:
model = simple_cnn

In [9]:
r = model.predict(valid_x)

In [15]:
r[:, 0], valid_y

(array([ 0.54696983,  0.54946762,  0.53444225, ...,  0.55052435,
         0.54280049,  0.54720104], dtype=float32),
 array([0, 0, 0, ..., 1, 1, 1], dtype=object))