In [42]:
import pandas as pd
import string
import spacy
import random
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re

from spacy.lang.pt.stop_words import STOP_WORDS
from spacy.training import Example
from sklearn.metrics import confusion_matrix, accuracy_score

### Databases

##### Train base

In [None]:
train_base = pd.read_csv('data/twitter/Train50.csv', delimiter=';')
train_base

In [None]:
train_base.shape

In [None]:
train_base.head()

In [None]:
sns.countplot(x='sentiment', hue='sentiment', data=train_base, palette='viridis', legend=False);

In [47]:
train_base.drop(['id', 'tweet_date', 'query_used'], axis=1, inplace=True)

In [None]:
train_base.head()

In [None]:
sns.heatmap(pd.isnull(train_base));

##### Test base

In [None]:
test_base = pd.read_csv('data/twitter/Test.csv', delimiter=';')
test_base

In [None]:
test_base.shape

In [52]:
test_base.drop(['id', 'tweet_date', 'query_used'], axis=1, inplace=True)

In [None]:
test_base.head()

In [None]:
sns.countplot(x='sentiment', hue='sentiment', data=test_base, palette='viridis', legend=False);

In [None]:
sns.heatmap(pd.isnull(test_base));

### Text preprocessing function

In [None]:
nlp = spacy.load('pt_core_news_sm')
nlp

In [57]:
stop_words = spacy.lang.pt.stop_words.STOP_WORDS

In [58]:
def preprocessing(text):
    """Clean one tweet and return its lemmas as a space-separated string.

    Steps, in order: lowercase the text, strip @usernames and http(s) URLs,
    collapse runs of spaces, replace a few emoticons with marker words,
    lemmatize with the notebook-level spaCy pipeline ``nlp``, then drop stop
    words (notebook-level ``stop_words``), punctuation-only tokens and
    digit-only tokens.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text = text.lower()

    # Handles may contain underscores; leaving "_" out of the character class
    # would strip only "@some" from "@some_user" and keep "_user" in the text.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove usernames
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # remove urls
    text = re.sub(r" +", ' ', text)  # remove extra spaces

    # Emoticons become plain words so they survive the punctuation filter below.
    emoticon_markers = {':)': 'emocaopositiva',
                        ':-)': 'emocaopositiva',
                        ';)': 'emocaopositiva',
                        ':(': 'emocaonegativa',
                        ':-(': 'emocaonegativa'}
    for emoticon, marker in emoticon_markers.items():
        text = text.replace(emoticon, marker)

    # Lemmatization
    lemmas = [token.lemma_ for token in nlp(text)]

    # Stop words, punctuation and numbers.
    # `lemma in string.punctuation` would be a substring test, which lets
    # multi-character tokens such as "..." or "!!" through; checking every
    # character rejects any token made only of punctuation (and empty lemmas).
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in stop_words
        and not all(char in string.punctuation for char in lemma)
        and not lemma.isdigit()
    ]
    return ' '.join(kept)

### Preprocessing on database

##### Text cleaning

In [None]:
# Run every training tweet through `preprocessing` and store the cleaned text
# back in the same column.
cleaned_tweets = train_base['tweet_text'].apply(preprocessing)
train_base['tweet_text'] = cleaned_tweets

##### Class treatment

In [60]:
# Hand-written sample of the record layout used for training:
# [text, {label: bool}], with exactly one label set to True per record.
exemple_base = [
    [
        "não deixe seus sonhos serem apenas sonhos",
        {"POSITIVO": True, "NEGATIVO": False},
    ],
    [
        "que tistreza",
        {"POSITIVO": False, "NEGATIVO": True},
    ],
]

In [None]:
train_base_final = []

i = 0

for text, emotion in zip(train_base['tweet_text'], train_base['sentiment']):
    if emotion == 'alegria':
        dic = {"ALEGRIA": True, "MEDO": False}
    else:
        dic = {"ALEGRIA": False, "MEDO": True}
        
    train_base_final.append([text, dic.copy()])

In [None]:
len(train_base_final)

### Building the classifier

In [64]:
# Blank Portuguese pipeline holding a single text-categorizer component.
model = spacy.blank('pt')
categories = model.add_pipe('textcat')
for label in ("POSITIVO", "NEGATIVO"):
    categories.add_label(label)

# Loss dictionaries appended by the training loop.
history = list()

In [None]:
# Training settings, named so they can be tuned in one place.
N_EPOCHS = 1000
BATCH_SIZE = 512
LOG_EVERY = 5  # print and record the losses once every LOG_EVERY epochs

# `begin_training` is the deprecated spaCy v2 name; `initialize` is its v3
# replacement and returns the optimizer, which is passed to `update` explicitly.
optimizer = model.initialize()

# The pipeline was just (re)initialized, so loss snapshots from an earlier run
# of this cell must not be mixed with the new ones.
history = []

for epoch in range(N_EPOCHS):
    random.shuffle(train_base_final)  # shuffles the record list in place
    losses = {}  # fresh dict per epoch; `update` accumulates into it
    for batch in spacy.util.minibatch(train_base_final, size=BATCH_SIZE):
        examples = [
            Example.from_dict(model.make_doc(text), {'cats': cats})
            for text, cats in batch
        ]
        model.update(examples, sgd=optimizer, losses=losses)
    if epoch % LOG_EVERY == 0:
        print(losses)
        history.append(losses)

In [None]:
history_loss = []
for i in history:
    history_loss.append(i.get('textcat'))

In [None]:
history_loss = np.array(history_loss)
history_loss

In [None]:
# One loss value is recorded every 5 epochs (not per batch), so rebuild the
# epoch numbers and use them as the x axis.
logged_epochs = np.arange(len(history_loss)) * 5
plt.plot(logged_epochs, history_loss)
plt.title('Progressão do erro')
plt.xlabel('Épocas')
plt.ylabel('Erro');

In [None]:
# Write the trained pipeline to disk.
model_dir = 'data/twitter/model'
model.to_disk(model_dir)

### One phrase test

In [None]:
# Read the saved pipeline back from disk; the cells below use this copy.
saved_model_dir = 'data/twitter/model'
loaded_model = spacy.load(saved_model_dir)
loaded_model

##### Positive Text

In [None]:
positive_text = test_base['tweet_text'][21]

In [None]:
prediction = loaded_model(positive_text)
prediction

In [None]:
prediction.cats

In [None]:
positive_text = "Deixe que o medo te traga coragem!"
positive_text = preprocessing(positive_text)
positive_text

##### Negative text

In [None]:
# The classifier was trained on preprocessed tweets, so the sample goes
# through the same `preprocessing` before it is scored.
negative_text = preprocessing(test_base['tweet_text'][4000])
prediction = loaded_model(negative_text)
prediction.cats

### Model evaluation

##### Test base evaluation

In [None]:
# Score every test tweet. The model was trained on preprocessed text, so each
# tweet goes through the same `preprocessing` first; `pipe` lets spaCy process
# the cleaned texts in batches instead of one call per tweet.
cleaned_test_texts = (preprocessing(text) for text in test_base['tweet_text'])
predictions = [doc.cats for doc in loaded_model.pipe(cleaned_test_texts)]

In [None]:
# Turn each score dictionary into a class string: '1' when POSITIVO scores
# strictly higher than NEGATIVO, '0' otherwise (ties included).
final_predictions = np.array([
    '1' if scores['POSITIVO'] > scores['NEGATIVO'] else '0'
    for scores in predictions
])

In [None]:
# `final_predictions` holds the strings '1'/'0'. Cast the ground truth to str
# as well so both label arrays handed to the scikit-learn metrics share one
# type. NOTE(review): assumes the CSV encodes sentiment as 0/1 — confirm.
real_ans = test_base['sentiment'].astype(str).values
real_ans

In [None]:
accuracy_score(real_ans, final_predictions)

In [None]:
cm = confusion_matrix(real_ans, final_predictions)
cm

In [None]:
sns.heatmap(cm, annot=True, fmt='d', cmap='viridis')