In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report
from gensim.models import word2vec
import re
import nltk
from nltk.tokenize import TweetTokenizer
from emoji.unicode_codes import UNICODE_EMOJI
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = pd.read_csv('../Data/dataset2018.tsv', header=0, delimiter='\t')

In [3]:
data = data[data['classe'] != 'N√£o sei']
data['classe'][data['classe'] == 'Rejei√ß√£o'] = 'REJEICAO'
data['classe'][data['classe'] == 'Neutro'] = 'NEUTRO'
data['classe'][data['classe'] == 'Aprova√ß√£o'] = 'APROVACAO'
data = data.dropna()

In [4]:
data.head(10)

Unnamed: 0,id,tweet,candidato,marcador,classe
2,twe984802485360582656,Ao inv√©s dos petistas estarem buscando livrar ...,alckmin,alysson,REJEICAO
3,twe977572021361168389,@CshmKnCaioHen @Peaotrabalhador @MiguelAMSA61 ...,manuela,alysson,NEUTRO
4,twe977558211447443457,"A pergunta √© s√©ria, @manudeputada: quer me pag...",manuela,alysson,REJEICAO
5,twe977347407011897345,DISPARADO!!! 90%... https://t.co/f5G9063duC,bolsonaro,alysson,NEUTRO
6,twe984494027956543488,O que acontece agora que o inqu√©rito de Alckmi...,alckmin,alysson,NEUTRO
7,twe981677295676084224,"- ""a√©cio √© flagrado pedindo grana a empres√°rio...",temer,alysson,REJEICAO
8,twe978200570740822017,Se a bunda de algum ministro sentar sobre o pr...,lula,alysson,REJEICAO
9,twe982682347484180482,"Hoje, n√£o h√° lado certo ou lado errado. Lula √©...",temer,alysson,NEUTRO
10,twe984980191066849280,Decis√µes do STF e do STJ de encaminhar process...,alckmin,alysson,NEUTRO
11,twe983555366133944320,PARA OS QUE AINDA N√ÉO SABEM! A pr√©-candidata √†...,marina,alysson,NEUTRO


In [5]:
def preProcessing(twitterText):
    #Remover \n
    twitterText = re.sub("\n+"," ",twitterText)

    #Remover multiplos espa√ßos
    twitterText = re.sub(" +"," ",twitterText)
    
    #(@usu√°rio) pelo termo ‚ÄôAT_USER‚Äô tal como sugerido em [Almatrafi et al., 2015].
    twitterText = re.sub("@\w+","atuser",twitterText)

    #Remove links
    twitterText = re.sub(r"http\S+", "",twitterText)

    #Remover caracteres especiais
    twitterText = re.sub("[@|#|‚Äú|‚Äù|‚Äô|‚Äò|¬Æ|,|!|?||\[|\]|\.|\"|%|:|\-|_|/|¬™|\(|\)|¬∞|\*|üáß|üá∑|\'|Ô∏è|=]",'',twitterText)

    #Remover n√∫meros
    twitterText = re.sub("[0-9]+",'',twitterText)

    #Tokenize
    twitterTokens = TweetTokenizer().tokenize(twitterText)

    #transforme emojis em textcode
    twitterTokensEmojisCode = []
    for token in twitterTokens:
        if(token in UNICODE_EMOJI):
            twitterTokensEmojisCode.append(UNICODE_EMOJI[token])
        else:
            twitterTokensEmojisCode.append(token)
    twitterTokens = twitterTokensEmojisCode

    #remove stopwords
    stopwords = nltk.corpus.stopwords.words('portuguese')
    stopwords.remove("n√£o")
    stopwords.remove("num")
    twitterTokens = [token for token in twitterTokens if (token not in stopwords) ]
    
    #Lower case
    twitterText = "".join(twitterText)
    twitterText = twitterText.lower()

    return twitterText

In [6]:
data['tweet'] = data['tweet'].apply(lambda x : preProcessing(x))

In [7]:
data.head(5)

Unnamed: 0,id,tweet,candidato,marcador,classe
2,twe984802485360582656,ao inv√©s dos petistas estarem buscando livrar ...,alckmin,alysson,REJEICAO
3,twe977572021361168389,atuser atuser atuser atuser o problema caio √© ...,manuela,alysson,NEUTRO
4,twe977558211447443457,a pergunta √© s√©ria atuser quer me pagar logo o...,manuela,alysson,REJEICAO
5,twe977347407011897345,disparado,bolsonaro,alysson,NEUTRO
6,twe984494027956543488,o que acontece agora que o inqu√©rito de alckmi...,alckmin,alysson,NEUTRO


# Text feature extraction

In [8]:
X = data['tweet']
Y = data['classe']

Apply w2vec em X

In [11]:
class FeatureGeneratorMedia:
    def __init__(self, X, w2vmodel, num_features):
        self.X = X
        self.w2vmodel = w2vmodel
        self.num_features = num_features
        self.features_vec = None

    def gen_features_dataset(self):
        self.X = self.X.apply(lambda text: TweetTokenizer().tokenize(text) )
        X_array = []
        self.X.apply(lambda listText: X_array.append(self.make_features_vec(listText)) )
        
        return np.matrix(X_array)
        
    def make_features_vec(self, tweet):
        featureVec = np.zeros(self.num_features)
        nwords = 0.0
        index2word_set = set(self.w2vmodel.wv.index2word)
        for word in tweet:
            if word in index2word_set:
                featureVec = np.add(featureVec, self.w2vmodel[word])
                nwords += 1
        if nwords == 0.0:
            nwords = 1.0
        return np.divide(featureVec, nwords)

def featureextractionWord2VecMean(X):
    num_features=300
    model = word2vec.Word2Vec.load("tweets_presidential_elections_min1_cont2_cbow")
    featureGeneratorMedia = FeatureGeneratorMedia(X,model,num_features)
    return featureGeneratorMedia.gen_features_dataset()
    

In [12]:
X = featureextractionWord2VecMean(X)



In [13]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3)

### RNN (LSTM)

Define the RNN structure

In [14]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model