# Data training

In this python, we input the id-list data into an RNN and a LSTM model, train the model and calculate the accuracy

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras as kr
from sklearn import preprocessing
import tensorflow as tf
import re

Using TensorFlow backend.


In [2]:
# open and read the id-list
df = pd.read_csv('code.csv')
print(df)

                                              text_code      sent
0     [1069    2    1 6227    7  159    0  125    5 ...  negative
1     [   2   82   27    8   63    0 1654   14  278 ...  negative
2     [   0    9    0 3954    0   16 1175   15    0 ...  negative
3     [ 300    0    2    0   17    9    0 4333  103 ...  negative
4     [   0    0   63    0   52    0    0    6 5692 ...   neutral
...                                                 ...       ...
5995  [   2    0   75    0   66    0 3415    0  182 ...  positive
5996  [ 270   65   91  281    0    0  182   20    0 ...   neutral
5997  [  44 1062    0 2784  308  104  280    0    0 ...   neutral
5998  [  911    58     0   415     7    11  6207    ...  positive
5999  [4377  484    0    0    0  681    0    0  182 ...   neutral

[6000 rows x 2 columns]


the id-list is a currently in String type, we have to change it into a numeric array 

In [3]:
# type-change function
def textcode_to_array(textcode):
    listOfTokens = re.split(r'\W+',textcode)
    # decrease the length of array since there are spaces at the start and end of the splitted list
    codes = np.zeros(len(listOfTokens)-2, dtype=np.int) 
    for i in range(len(codes)):
        codes[i] = int(listOfTokens[i+1])
    return codes

In [4]:
# do the type change
df['code'] = ""
for i in range(df.shape[0]):
    df.iloc[i,2] =  textcode_to_array(df.iloc[i,0])
print(df)

                                              text_code      sent  \
0     [1069    2    1 6227    7  159    0  125    5 ...  negative   
1     [   2   82   27    8   63    0 1654   14  278 ...  negative   
2     [   0    9    0 3954    0   16 1175   15    0 ...  negative   
3     [ 300    0    2    0   17    9    0 4333  103 ...  negative   
4     [   0    0   63    0   52    0    0    6 5692 ...   neutral   
...                                                 ...       ...   
5995  [   2    0   75    0   66    0 3415    0  182 ...  positive   
5996  [ 270   65   91  281    0    0  182   20    0 ...   neutral   
5997  [  44 1062    0 2784  308  104  280    0    0 ...   neutral   
5998  [  911    58     0   415     7    11  6207    ...  positive   
5999  [4377  484    0    0    0  681    0    0  182 ...   neutral   

                                                   code  
0     [1069, 2, 1, 6227, 7, 159, 0, 125, 5, 23, 0, 1...  
1     [2, 82, 27, 8, 63, 0, 1654, 14, 278, 0, 990, 0...

Further processing of data

In [5]:
num_classes = 3 # positive, negative, neutral
# use labelencoder to change the type of classes from String to integer
le = preprocessing.LabelEncoder() 
labels = le.fit_transform(df.iloc[:,1])
# and then, change the type from integer to one-hot code
labels = kr.utils.to_categorical(labels, num_classes) 
print(labels)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [6]:
# divide the data into train, validation and test sets in a ratio 6:2:2
X_trainval,X_test,y_trainval,y_test=train_test_split(df.iloc[:,2],labels,test_size=0.2,random_state=3) 
X_train,X_val,y_train,y_val=train_test_split(X_trainval,y_trainval,test_size=0.25,random_state=3) 

In [7]:
# find the max length of id-lists
maxlen = 0;
for i in range(6000):
    length = len(df.iloc[i,2])
    if length>maxlen:
        maxlen = length
print(maxlen)

53


We have processed the data, now we should prepare our model

In [8]:
num_units = 64 # number of cells in a hidden layer
batch_size=10 # size of batch
n_batch=360 # we have 3600 train data, so 3600/10 = 360 batches
train_rate = 0.001 # training rate of optimizer
embedding_size = 32 # dimension of embedding set
vocabulary_size = 10509 # length of dictionary

# placeholders for input data and output probabilities
x=tf.placeholder(tf.int32,shape=[None,maxlen]) #
y=tf.placeholder(tf.float32,[None,num_classes])

In [9]:
# weights and biases which transform the last output of RNN model into a num_classes dimensional probability vector
weights=tf.Variable(tf.truncated_normal([num_units,num_classes],stddev=0.1))
biases=tf.Variable(tf.constant(0.1,shape=[num_classes]))

In [10]:
# generate an embedding matrix for the dictionary
embedding = tf.get_variable('embedding', [vocabulary_size, embedding_size])

In [11]:
# keep probability for dropout process
keep_prob = tf.placeholder(tf.float32)

Define and train our model

In [12]:
# Simple RNN model
'''
def RNN(x,weights,bias):
    embedding_inputs = tf.nn.embedding_lookup(embedding, x) # replace the integers in id list into embedding vectors
    basic_cell=tf.nn.rnn_cell.BasicRNNCell(num_units) # generate a basic RNN model with num_units cells in each layer
    
    outputs, states = tf.nn.dynamic_rnn(basic_cell,embedding_inputs, dtype=tf.float32) # run the model and get the last output
    outputs = tf.nn.dropout(outputs, keep_prob) # dropout process
    result = tf.nn.softmax(tf.matmul(outputs[:,-1,:],weights)+bias,1) # calculate the probability vector
    return result

prediction=RNN(x,weights,biases) # get the probability vector
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y)) # calculate the cost
train_step=tf.train.AdamOptimizer(train_rate).minimize(cost) # use optimizer to improve the model in a given rate
correct_predict=tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # generate accuracy vector
accuracy=tf.reduce_mean(tf.cast(correct_predict,tf.float32)) # calculate accuracy

init=tf.global_variables_initializer() # initialize all variables

# run the training function
with tf.Session() as sess:
    sess.run(init)
    for i in range(n_batch):
        TrainData_label=y_train[i*batch_size:(i+1)*n_batch]
        TrainData_batch = kr.preprocessing.sequence.pad_sequences(X_train.iloc[i*batch_size:(i+1)*n_batch], maxlen) # increase the sequence length to maxlen
        sess.run(train_step,feed_dict={x:TrainData_batch,y:TrainData_label, keep_prob: 0.75})
        print("batch: ")
        print(i)
    print("train accuracy: ")
    TrainData = kr.preprocessing.sequence.pad_sequences(X_train, maxlen)
    print(sess.run(accuracy, feed_dict={x: TrainData, y: y_train, keep_prob: 1})) # Prediction does not need dropout, so 1 is set
    print("val accuracy: ")
    ValData = kr.preprocessing.sequence.pad_sequences(X_val, maxlen)
    print(sess.run(accuracy, feed_dict={x: ValData, y: y_val, keep_prob: 1}))
'''

'\ndef RNN(x,weights,bias):\n    embedding_inputs = tf.nn.embedding_lookup(embedding, x) # replace the integers in id list into embedding vectors\n    basic_cell=tf.nn.rnn_cell.BasicRNNCell(num_units) # generate a basic RNN model with num_units cells in each layer\n    \n    outputs, states = tf.nn.dynamic_rnn(basic_cell,embedding_inputs, dtype=tf.float32) # run the model and get the last output\n    outputs = tf.nn.dropout(outputs, keep_prob) # dropout process\n    result = tf.nn.softmax(tf.matmul(outputs[:,-1,:],weights)+bias,1) # calculate the probability vector\n    return result\n\nprediction=RNN(x,weights,biases) # get the probability vector\ncost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y)) # calculate the cost\ntrain_step=tf.train.AdamOptimizer(train_rate).minimize(cost) # use optimizer to improve the model in a given rate\ncorrect_predict=tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # generate accuracy vector\naccuracy=tf.reduce_mean(

In [13]:
# LSTM model
def LSTM(x,weights,bias):
    embedding_inputs = tf.nn.embedding_lookup(embedding, x) # replace the integers in id list into embedding vectors
    lstm_cell=tf.contrib.rnn.BasicLSTMCell(num_units) # generate a LSTM model with num_units cells in each layer
    
    outputs, states = tf.nn.dynamic_rnn(lstm_cell,embedding_inputs, dtype=tf.float32) # run the model and get the last output
    outputs = tf.nn.dropout(outputs, keep_prob)     # dropout process
    result = tf.nn.softmax(tf.matmul(outputs[:,-1,:],weights)+bias,1)  # calculate the probability vector

    return result

prediction=LSTM(x,weights,biases) # get the probability vector
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))  # calculate the cost
train_step=tf.train.AdamOptimizer(train_rate).minimize(cost) # use optimizer to improve the model in a given rate
correct_predict=tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # generate accuracy vector
accuracy=tf.reduce_mean(tf.cast(correct_predict,tf.float32)) # calculate accuracy

init=tf.global_variables_initializer() # initialize all variables

# run the training function
with tf.Session() as sess:
    sess.run(init)
    for i in range(n_batch):
        TrainData_label=y_train[i*batch_size:(i+1)*n_batch]
        TrainData_batch = kr.preprocessing.sequence.pad_sequences(X_train.iloc[i*batch_size:(i+1)*n_batch], maxlen) # increase the sequence length to maxlen
        sess.run(train_step,feed_dict={x:TrainData_batch,y:TrainData_label, keep_prob: 0.75})
        print("batch: ")
        print(i)
    print("train accuracy: ")
    TrainData = kr.preprocessing.sequence.pad_sequences(X_train, maxlen)
    print(sess.run(accuracy, feed_dict={x: TrainData, y: y_train, keep_prob: 1})) # Prediction does not need dropout, so 1 is set
    print("val accuracy: ")
    ValData = kr.preprocessing.sequence.pad_sequences(X_val, maxlen)
    print(sess.run(accuracy, feed_dict={x: ValData, y: y_val, keep_prob: 1}))
    print("test accuracy: ")
    TestData = kr.preprocessing.sequence.pad_sequences(X_test, maxlen)
    print(sess.run(accuracy, feed_dict={x: TestData, y: y_test, keep_prob: 1}))

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

batch: 
0
batch: 
1
batch: 
2
batch: 
3
batch: 
4
batch: 
5
batch: 
6
batch: 
7
batch: 
8
batch: 
9
batch: 
10
batch: 
11
batch: 
12
batch: 
13
batch: 
14
batch: 
15
batch: 
16
batch: 
17
batch: 
18
batch: 
19
batch: 
20
batch: 
21
batch: 
22
batch: 
23
batch: 
24
batch: 
25
batch: 
26
batch: 
27
batch: 
28
batch: 
29
batch: 
30
batch: 
31
batch: 
32
batch: 
33
batch: 
34
batch: 
35
batch: 
36
batch: 
37
batch: 
38
batch: 
39
batch: 
40
batch: 
41
batch: 
42
batch: 
43
batch: 
44
batch: 
45
batch: 
46
batch: 
47
batch: 
48
batch: 
49
batch: 
50
batch: 
51
batch: 
52
batch: 
53
batch: 
54
batch: 
55
batch: 
56
batch: 
57
batch: 
58
batch: 
59
batch: 
60
batch: 
61
batch: 
62
batch: 
63
batch: 
64
batch: 
65
batch: 
66
batch: 
67
