In [27]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn
import re
import pandas as pd
from tensorflow.contrib.layers import fully_connected,batch_norm
from keras.utils import to_categorical

In [28]:
def preprocess_data(string):
    
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()



In [29]:
def list_convert(dataframe):
    
    list_of_lists = []
    
    for _,row in dataframe.iterrows():
        
        list_of_lists.append(str(row["Phrase"]))
        
    return list_of_lists


def load_data_and_labels(train_path,test_path):
    
    train_data = pd.read_table(train_path,sep="\t")
    test_data = pd.read_table(test_path,sep="\t")
    
    train_sentences = list_convert(train_data.drop_duplicates(subset="SentenceId"))
    #test_sentences = list_convert(test_data.drop_duplicates(subset="SentenceId"))
    
    corpus_sentences = train_sentences #+ test_sentences
    
    corpus_sentences = list([preprocess_data(sentence.strip()) for sentence in corpus_sentences])
    
    y_true = to_categorical(np.array(train_data['Sentiment']))
    
    return [corpus_sentences,y_true]
    
    
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [30]:

corpus_sentences,y_true = load_data_and_labels("./Sentiment_Analysis_Challenge/train.tsv","./Sentiment_Analysis_Challenge/test.tsv")

max_document_length = max([len(text_string.split()) for text_string in corpus_sentences])

vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

x_train = np.array(list(vocab_processor.fit_transform(corpus_sentences)))


In [39]:
# class SentimentAnalysis():
#     
#     def __init__(self,embedding_dim,vocab_size,sequence_length):
#         
#         self.input_x = tf.placeholder(tf.int32,shape=[None,sequence_length])
#         
#         with tf.device("/cpu:0"),tf.name_scope("embedding"):
#             
#             self.W = tf.Variable(tf.random_uniform(shape=[vocab_size,embedding_dim],minval=-1.0,maxval=1.0))
#             self.embedded_chars = tf.nn.embedding_lookup(self.W,self.input_x)
#             self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

tf.reset_default_graph()

embedding_dim = 300
batch_size = 32
num_classes = 5
n_cell = 100
reshape_size = 100
n_layers = 2
hidden_1 = 50
learning_rate = 0.01
num_epochs = 10

vocab_size = len(vocab_processor.vocabulary_)
print("VOCAB SIZE : " + str(vocab_size))
sequence_length = max_document_length


#input_x = tf.placeholder(tf.int32,shape=[None,sequence_length])


X_batch = tf.placeholder(tf.int32,shape=[None,sequence_length],name="x_batch")
y_batch = tf.placeholder(tf.int64,shape=[None,num_classes],name="y_true")

with tf.device("/cpu:0"),tf.name_scope("embedding"):
    
    word_vec = tf.Variable(tf.random_uniform(shape=[vocab_size,embedding_dim],minval=-1.0,maxval=1.0),trainable=True)
    embedded_chars = tf.nn.embedding_lookup(word_vec,X_batch)
    #embedded_chars_expanded = tf.expand_dims(embedded_chars,-1)


lstm_cell = tf.contrib.rnn.OutputProjectionWrapper(tf.contrib.rnn.LSTMCell(num_units = n_cell),output_size = reshape_size)
output,states = tf.nn.dynamic_rnn(lstm_cell,embedded_chars,dtype=tf.float32)

print(output)

#reshaped_output = tf.reshape(output,shape=[-1,n_cell])


######## Fully connected layer ##########

is_training = tf.placeholder(tf.bool,name="is_training")


batch_params = {"is_training" : is_training,
                "decay":0.99,
                "updates_collections": None}

with tf.contrib.framework.arg_scope([fully_connected],normalizer_fn=batch_norm,normalizer_params=batch_params,activation_fn=tf.nn.relu,
                                    weights_initializer=tf.variance_scaling_initializer(mode="fan_avg")):
        
    hidden1 = fully_connected(output,hidden_1)
    logits = fully_connected(hidden1,num_classes)


with tf.name_scope("loss"):
    
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y_batch,name="cross_entropy")
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_step = optimizer.minimize(cross_entropy)


with tf.name_scope("accuracy_measure"):
    
    correct = tf.equal(tf.argmax(logits,1),tf.argmax(y_batch,1))
    accuracy = tf.reduce_mean(tf.cast(correct,tf.float32),name="accuracy")


saver = tf.train.Saver()
init = tf.global_variables_initializer()


with tf.Session() as sess:
    
    init.run()    
    batches = batch_iter(list(zip(x_train,y_true)),batch_size,num_epochs)
    
    batch_iteration = 1
    
    #sess.run([embedded_chars],feed_dict={input_x : x_train})
    
    for batch in batches:
        
        x_batch , y_true = zip(*batch)
        sess.run([train_step],feed_dict={X_batch:x_batch,y_batch:y_true,is_training:True})
        accuracy = sess.run([accuracy],feed_dict={X_batch:x_batch,y_batch:y_true,is_training:False})
        
        print("BATCH : " + str(batch_iteration) + ", ACCURACY : " + str(accuracy))
        batch_iter+=1

VOCAB SIZE : 15214
Tensor("rnn/transpose_1:0", shape=(?, 52, 100), dtype=float32)


InvalidArgumentError: logits and labels must be same size: logits_size=[1664,5] labels_size=[32,5]
	 [[Node: loss/cross_entropy = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](loss/cross_entropy/Reshape, loss/cross_entropy/Reshape_1)]]

Caused by op 'loss/cross_entropy', defined at:
  File "/usr/local/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-9be49c76ebbd>", line 69, in <module>
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=y_batch,name="cross_entropy")
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/util/deprecation.py", line 136, in new_func
    return func(*args, **kwargs)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 1885, in softmax_cross_entropy_with_logits
    labels=labels, logits=logits, dim=dim, name=name)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 1804, in softmax_cross_entropy_with_logits_v2
    precise_logits, labels, name=name)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 4624, in _softmax_cross_entropy_with_logits
    name=name)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3160, in create_op
    op_def=op_def)
  File "/home/piyush/PycharmProjects/Kaggle_Competitions/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): logits and labels must be same size: logits_size=[1664,5] labels_size=[32,5]
	 [[Node: loss/cross_entropy = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](loss/cross_entropy/Reshape, loss/cross_entropy/Reshape_1)]]
