# Sentiment Analysis usando Deep Learning para español en textos cortos

# 1. Data Cleaning 

We work on a dataset of tweets labelled as positive or negative. More information about the dataset 
can be found at http://www.sepln.org/workshops/tass/. 

The datasets that we used can be downloaded here:

- http://www.sepln.org/workshops/tass/2017/#datasets
- http://www.sepln.org/workshops/tass/2018/#datasets

A brief description of these datasets:

1. ----
2. ----
3. ----

# 2. Vocabulary

## 2.2 Getting train, validation and test subsets

In [1]:
import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import gensim.models.word2vec



In [2]:
# Read the TASS 2017 task-1 XML files. Each returned document is later accessed
# through its `content` (tweet text) and `polarity` (label) attributes.
# The "Development" file is the one used as the test split in this notebook.
train_docs= xml.readXML("../database/TASS/TASS2017/task1-Training.xml")
test_docs= xml.readXML("../database/TASS/TASS2017/task1-Development.xml")

In [3]:
def _contents_and_polarities(docs):
    """Split parsed documents into parallel lists of raw text and polarity label.

    Args:
        docs: iterable of objects exposing ``content`` and ``polarity`` attributes.

    Returns:
        Tuple ``(contents, polarities)`` of two lists, in the order of ``docs``.
    """
    contents, polarities = [], []
    # Single pass, so a one-shot iterable is consumed only once (as in the
    # original explicit loops).
    for doc in docs:
        contents.append(doc.content)
        polarities.append(doc.polarity)
    return contents, polarities


# The raw tweet text is kept here; tokenization is applied downstream via
# `ut.tokenizer` when the vocabulary is built.
train_tweets, train_labels = _contents_and_polarities(train_docs)
test_tweets, test_labels = _contents_and_polarities(test_docs)

In [4]:
from sklearn.model_selection import train_test_split

# Wrap the parallel Python lists as pandas Series so that boolean masks built
# from the labels can be used to select the matching tweets.
x_train = pd.Series(train_tweets)
x_test = pd.Series(test_tweets)
y_train = pd.Series(train_labels)
y_test = pd.Series(test_labels)

In [5]:
def _print_polarity_balance(name, texts, labels):
    """Print the size of a split and its share of negative (0) / positive (1) labels.

    Args:
        name: split name, used as the first word of the printed line.
        texts: pandas Series holding the tweets of the split.
        labels: pandas Series of polarity labels, index-aligned with ``texts``.
    """
    total = len(texts)

    def share(label):
        # Guard the division so an empty split reports 0.00% instead of raising
        # ZeroDivisionError.
        return len(texts[labels == label]) / total * 100 if total else 0.0

    template = "{0} set has total {1} entries with {2:.2f}% negative, {3:.2f}% positive"
    print(template.format(name, total, share(0), share(1)))


_print_polarity_balance("Train", x_train, y_train)
_print_polarity_balance("Test", x_test, y_test)

Train set has total 736 entries with 56.79% negative, 43.21% positive
Test set has total 375 entries with 58.40% negative, 41.60% positive


# 3. Preparing Data

In [6]:
from gensim.models import KeyedVectors

def gensim_load_vec(path="../database/embeddings/SBW-vectors-300-min5.bin"):
    """Load pretrained word vectors stored in the binary word2vec format.

    Works with both the gensim 4 attribute names (``vectors``, ``index_to_key``)
    and the legacy ones (``syn0``, ``index2word``), which were deprecated and
    later removed.

    If the embeddings were saved as a full ``Word2Vec`` model instead, load them
    with ``gensim.models.word2vec.Word2Vec.load(path)`` and read the same
    attributes from its ``.wv`` member.

    Args:
        path: location of the binary word2vec file.

    Returns:
        Tuple ``(gensim_emb, vec, shape, vocab)``: the loaded keyed-vectors
        object, its embedding matrix, that matrix's shape, and the list of
        words ordered by row index.
    """
    gensim_emb = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

    # Prefer the current attribute names; fall back to the legacy ones only when
    # the installed gensim does not provide them. This also avoids triggering
    # the deprecation warning on versions that expose both.
    vocab = getattr(gensim_emb, "index_to_key", None)
    if vocab is None:
        vocab = gensim_emb.index2word

    vec = getattr(gensim_emb, "vectors", None)
    if vec is None:
        vec = gensim_emb.syn0

    return gensim_emb, vec, vec.shape, vocab

In [7]:
# Load the pretrained word vectors from the default path. `gensim_emb` is the
# object indexed by word below; `vec`, `shape` and `vocab` are its raw matrix,
# the matrix shape and the word list.
gensim_emb, vec, shape, vocab = gensim_load_vec()

  
  if __name__ == '__main__':


In [8]:
# Vectorizer used to build the token -> integer index vocabulary.
# Tokenization is delegated to the project's `ut.tokenizer`; min_df=1 keeps
# every token that occurs in at least one tweet.
# NOTE: with its default `lowercase=True`, CountVectorizer lowercases the text
# *before* calling the tokenizer, so vocabulary keys come from lowercased tweets.
counter = CountVectorizer(tokenizer=ut.tokenizer, min_df=1)

In [9]:
# Fit the vocabulary on the train and test tweets together, so that every token
# of either split receives an index. X is the document-term count matrix with
# one row per tweet and one column per vocabulary entry.
# NOTE(review): the vocabulary therefore also contains tokens that only occur
# in the test split — confirm this is intended.
X = counter.fit_transform(train_tweets + test_tweets)
print(X.shape)

(1111, 4227)


In [10]:
# One row per vocabulary entry, one column per embedding dimension. Rows of
# tokens that the pretrained vectors do not contain are left as all zeros.
VOCAB_SIZE = X.shape[1]
embedding_matrix = np.zeros((VOCAB_SIZE, 300))
for token, row in counter.vocabulary_.items():
    try:
        vector = gensim_emb[token]
    except KeyError:
        # Token unknown to the pretrained embeddings: keep the zero row.
        continue
    embedding_matrix[row] = vector
print(embedding_matrix.shape)

(4227, 300)


In [11]:
# Sanity check: the row stored for 'hola' must equal its pretrained vector.
# Raises KeyError if 'hola' is missing from either vocabulary.
np.array_equal(embedding_matrix[counter.vocabulary_['hola']], gensim_emb['hola'])

True

In [12]:
# Convert every tweet (train first, then test) into a list of vocabulary indices.
#
# The tweets are analysed with the vectorizer's own analyzer instead of calling
# `ut.tokenizer` on the raw text. CountVectorizer preprocesses the text (it
# lowercases by default) before tokenizing, so tokens taken from the raw tweet
# could differ from the vocabulary keys and would then be dropped silently.
analyzer = counter.build_analyzer()
vocabulary = counter.vocabulary_
sequences = [
    # The membership test keeps the original best-effort behaviour for tokens
    # that are not in the vocabulary.
    [vocabulary[token] for token in analyzer(tweet) if token in vocabulary]
    for tweet in train_tweets + test_tweets
]


In [13]:
from keras.preprocessing.sequence import pad_sequences
# `sequences` holds the training tweets first, followed by the test tweets.
# Both slices are taken from the train length: the previous
# `sequences[-len(x_test):]` returns *every* sequence when the test split is
# empty, because `-0` is `0`.
n_train = len(x_train)
# maxlen=45 must match the length declared by the model's Input layer.
# NOTE(review): pad_sequences pads with 0 by default, and index 0 is also a real
# vocabulary entry, so padding and that token share one embedding row. Fixing
# this needs a coordinated +1 shift of the indices and the embedding matrix.
x_train_seq = pad_sequences(sequences[:n_train], maxlen=45)
x_test_seq  = pad_sequences(sequences[n_train:], maxlen=45)
print('Shape of data train tensor:', x_train_seq.shape)
print('Shape of data val  tensor:', x_test_seq.shape)

Using TensorFlow backend.


Shape of data train tensor: (736, 45)
Shape of data val  tensor: (375, 45)


# Model: Convolutional Neural Network

In [14]:
seed = 7
# Actually apply the seed (it used to be defined but never used). This seeds
# NumPy's global RNG only.
# NOTE(review): the backend's own RNG (e.g. for weight initialisation) has to be
# seeded separately for fully reproducible runs — confirm for the installed version.
np.random.seed(seed)

from keras.layers import (Activation, Conv1D, Dense, Dropout, Embedding,
                          Flatten, GlobalMaxPooling1D, Input, concatenate)
from keras.models import Model, Sequential


def _ngram_branch(encoded, kernel_size, filters=100):
    """Build one convolutional branch over the embedded tweet.

    Args:
        encoded: output tensor of the embedding layer.
        kernel_size: number of consecutive tokens covered by each filter.
        filters: number of convolution filters in the branch.

    Returns:
        Tensor holding, for each filter, its maximum activation over the sequence.
    """
    features = Conv1D(filters=filters, kernel_size=kernel_size, padding='valid',
                      activation='relu', strides=1)(encoded)
    return GlobalMaxPooling1D()(features)


# Input: one tweet as a sequence of 45 vocabulary indices.
tweet_input = Input(shape=(45,), dtype='int32')

# Embedding initialised from the pretrained vectors and fine-tuned during training.
tweet_encoder = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=45, trainable=True)(tweet_input)

# Three parallel branches looking at windows of 2, 3 and 4 tokens. They are
# created in the same order as before, so the auto-generated layer names do not change.
bigram_branch = _ngram_branch(tweet_encoder, 2)
trigram_branch = _ngram_branch(tweet_encoder, 3)
fourgram_branch = _ngram_branch(tweet_encoder, 4)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

# Classifier head: dense layer with dropout, then a single sigmoid unit for the
# binary polarity.
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 45)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 45, 300)      1268100     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 44, 100)      60100       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 43, 100)      90100       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [15]:
import os

from keras.callbacks import ModelCheckpoint

filepath="model/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
# Create the checkpoint directory up front: nothing else in this notebook does,
# and saving a checkpoint fails on a fresh checkout if it is missing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Keep only the checkpoints that improve validation accuracy.
# NOTE(review): 'val_acc' matches the metric name reported by this Keras version
# ('acc'); other versions may report 'accuracy' / 'val_accuracy' — confirm.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# NOTE(review): the test split doubles as validation data here, so the "best"
# checkpoint is selected on the same data that is later used for evaluation.
model.fit(x_train_seq, y_train, batch_size=64, epochs=10,
                     validation_data=(x_test_seq, y_test), callbacks = [checkpoint])

Train on 736 samples, validate on 375 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.58400, saving model to model/CNN_best_weights.01-0.5840.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.58400 to 0.63733, saving model to model/CNN_best_weights.02-0.6373.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.63733 to 0.67733, saving model to model/CNN_best_weights.03-0.6773.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.67733 to 0.72533, saving model to model/CNN_best_weights.04-0.7253.hdf5
Epoch 5/10

Epoch 00005: val_acc improved from 0.72533 to 0.76533, saving model to model/CNN_best_weights.05-0.7653.hdf5
Epoch 6/10

Epoch 00006: val_acc improved from 0.76533 to 0.77600, saving model to model/CNN_best_weights.06-0.7760.hdf5
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.77600
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.77600
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.77600
Epoch 10/10

Epoch 00010: val_acc did not imp

<keras.callbacks.History at 0x2417e2baa20>

In [16]:
# Evaluate the in-memory model, i.e. the weights left after the last epoch,
# which are not necessarily those of the best checkpoint.
#
# To score a saved checkpoint instead, load it first:
#   from keras.models import load_model
#   best_model = load_model('model/CNN_best_weights.<epoch>-<val_acc>.hdf5')
#   best_model.evaluate(x=x_test_seq, y=y_test)
model.evaluate(x=x_test_seq, y=y_test)



[0.6379356830120086, 0.7573333315849304]

In [17]:
# Labels of the values returned by model.evaluate(), in the same order.
model.metrics_names

['loss', 'acc']