In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report
from gensim.models import word2vec
import re
import nltk
from nltk.tokenize import TweetTokenizer
from emoji.unicode_codes import UNICODE_EMOJI
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = pd.read_csv('../Data/dataset2018.tsv', header=0, delimiter='\t')

In [3]:
data = data[data['classe'] != 'Não sei']
data['classe'][data['classe'] == 'Rejeição'] = 2
data['classe'][data['classe'] == 'Neutro'] = 1
data['classe'][data['classe'] == 'Aprovação'] = 0
data = data.dropna()

In [4]:
data.head(10)

Unnamed: 0,id,tweet,candidato,marcador,classe
2,twe984802485360582656,Ao invés dos petistas estarem buscando livrar ...,alckmin,alysson,2
3,twe977572021361168389,@CshmKnCaioHen @Peaotrabalhador @MiguelAMSA61 ...,manuela,alysson,1
4,twe977558211447443457,"A pergunta é séria, @manudeputada: quer me pag...",manuela,alysson,2
5,twe977347407011897345,DISPARADO!!! 90%... https://t.co/f5G9063duC,bolsonaro,alysson,1
6,twe984494027956543488,O que acontece agora que o inquérito de Alckmi...,alckmin,alysson,1
7,twe981677295676084224,"- ""aécio é flagrado pedindo grana a empresário...",temer,alysson,2
8,twe978200570740822017,Se a bunda de algum ministro sentar sobre o pr...,lula,alysson,2
9,twe982682347484180482,"Hoje, não há lado certo ou lado errado. Lula é...",temer,alysson,1
10,twe984980191066849280,Decisões do STF e do STJ de encaminhar process...,alckmin,alysson,1
11,twe983555366133944320,PARA OS QUE AINDA NÃO SABEM! A pré-candidata à...,marina,alysson,1


In [5]:
def preProcessing(twitterText):
    #Remover \n
    twitterText = re.sub("\n+"," ",twitterText)

    #Remover multiplos espaços
    twitterText = re.sub(" +"," ",twitterText)
    
    #(@usuário) pelo termo ’AT_USER’ tal como sugerido em [Almatrafi et al., 2015].
    twitterText = re.sub("@\w+","atuser",twitterText)

    #Remove links
    twitterText = re.sub(r"http\S+", "",twitterText)

    #Remover caracteres especiais
    twitterText = re.sub("[@|#|“|”|’|‘|®|,|!|?||\[|\]|\.|\"|%|:|\-|_|/|ª|\(|\)|°|\*|🇧|🇷|\'|️|=]",'',twitterText)

    #Remover números
    twitterText = re.sub("[0-9]+",'',twitterText)

    #Tokenize
    twitterTokens = TweetTokenizer().tokenize(twitterText)

    #transforme emojis em textcode
    twitterTokensEmojisCode = []
    for token in twitterTokens:
        if(token in UNICODE_EMOJI):
            twitterTokensEmojisCode.append(UNICODE_EMOJI[token])
        else:
            twitterTokensEmojisCode.append(token)
    twitterTokens = twitterTokensEmojisCode

    #remove stopwords
    stopwords = nltk.corpus.stopwords.words('portuguese')
    stopwords.remove("não")
    stopwords.remove("num")
    twitterTokens = [token for token in twitterTokens if (token not in stopwords) ]
    
    #Lower case
    twitterText = "".join(twitterText)
    twitterText = twitterText.lower()

    return twitterText

In [6]:
data['tweet'] = data['tweet'].apply(lambda x : preProcessing(x))

In [7]:
data.head(5)

Unnamed: 0,id,tweet,candidato,marcador,classe
2,twe984802485360582656,ao invés dos petistas estarem buscando livrar ...,alckmin,alysson,2
3,twe977572021361168389,atuser atuser atuser atuser o problema caio é ...,manuela,alysson,1
4,twe977558211447443457,a pergunta é séria atuser quer me pagar logo o...,manuela,alysson,2
5,twe977347407011897345,disparado,bolsonaro,alysson,1
6,twe984494027956543488,o que acontece agora que o inquérito de alckmi...,alckmin,alysson,1


# Text feature extraction

In [8]:
X = data['tweet']
Y = data['classe'].values


Apply w2vec em X

In [9]:
class FeatureGeneratorMedia:
    def __init__(self, X, w2vmodel, num_features):
        self.X = X
        self.w2vmodel = w2vmodel
        self.num_features = num_features
        self.features_vec = None

    def gen_features_dataset(self):
        self.X = self.X.apply(lambda text: TweetTokenizer().tokenize(text) )
        X_array = []
        self.X.apply(lambda listText: X_array.append(self.make_features_vec(listText)) )
        
        return np.matrix(X_array)
        
    def make_features_vec(self, tweet):
        featureVec = np.zeros(self.num_features)
        nwords = 0.0
        index2word_set = set(self.w2vmodel.wv.index2word)
        for word in tweet:
            if word in index2word_set:
                featureVec = np.add(featureVec, self.w2vmodel[word])
                nwords += 1
        if nwords == 0.0:
            nwords = 1.0
        return np.divide(featureVec, nwords)

def featureextractionWord2VecMean(X):
    num_features=300
    model = word2vec.Word2Vec.load("word2vec/tweets_presidential_elections_300_min1_cont2_cbow")
    featureGeneratorMedia = FeatureGeneratorMedia(X,model,num_features)
    return featureGeneratorMedia.gen_features_dataset()
    

In [10]:
X = featureextractionWord2VecMean(X)



In [11]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3)

### RNN (LSTM)

Define the RNN structure

In [12]:
model = word2vec.Word2Vec.load("word2vec/tweets_presidential_elections_300_min1_cont2_cbow")
vocab_size = len(model.wv.vocab)


In [13]:
w2vec_dim=300
output_dim=3
lstm_dim=300
dropout=0.2


In [14]:
model = Sequential()
model.add(Embedding(vocab_size,w2vec_dim,trainable=True))
model.add(Dropout(dropout))
model.add(LSTM(lstm_dim))
model.add(Dropout(dropout))
model.add(Dense(output_dim, activation='softmax'))
    
model.compile('adam', 'categorical_crossentropy',metrics=['accuracy'])

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         42329100  
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 300)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 903       
Total params: 43,051,203
Trainable params: 43,051,203
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(X_train, Y_train, epochs=50, verbose=0)

ValueError: Error when checking target: expected dense_1 to have shape (3,) but got array with shape (1,)