In [81]:
import os
import pandas as pd
import nltk
import numpy as np

In [82]:
df = pd.read_csv(os.path.join('..', 'storage', 'base_sentiment.1.0.2.csv'))
df

Unnamed: 0,text,sentiment
0,"Mais uma vez, o Sr. Costner arrumou um filme p...",negative
1,Este é um exemplo do motivo pelo qual a maiori...,negative
2,"Primeiro de tudo eu odeio esses raps imbecis, ...",negative
3,Nem mesmo os Beatles puderam escrever músicas ...,negative
4,Filmes de fotos de latão não é uma palavra apr...,negative
...,...,...
49454,"Como a média de votos era muito baixa, e o fat...",positive
49455,O enredo teve algumas reviravoltas infelizes e...,positive
49456,Estou espantado com a forma como este filme e ...,positive
49457,A Christmas Together realmente veio antes do m...,positive


In [83]:
df['sentiment'].value_counts()

sentiment
negative    24765
positive    24694
Name: count, dtype: int64

In [84]:
df.shape

(49459, 2)

In [85]:
base_training = df.values[:int(df.shape[0] / 100 * 80)]
base_test = df.values[int(df.shape[0] / 100 * 20):]

In [86]:
list_stopwords = nltk.corpus.stopwords.words('portuguese')

In [87]:
def remove_stopwords(text):
    frases = []
    for (word, sentiment) in text:
        no_stopwords = [ w for w in word.split() if w not in list_stopwords]
        frases.append((no_stopwords, sentiment))
    return frases

In [88]:
def stemmer(text):
    stem = nltk.stem.RSLPStemmer()
    list_with_stem = []
    for (word, sentiment) in text:
        with_stem = [str(stem.stem(w)) for w in word.split() if w not in list_stopwords]
        list_with_stem.append((with_stem, sentiment))
    return list_with_stem

In [89]:
list_with_stemmer = stemmer(df.values)

In [90]:
def search_word(words):
    all_words = []
    for word, sentiment in words:
        all_words.append(word)
    return all_words

In [91]:
word_with_stemmer_training = stemmer(base_training)
word_with_stemmer_test = stemmer(base_test)

In [92]:
def get_freq(words):
    words = nltk.FreqDist(words)
    return words

In [93]:
word_training = search_word(word_with_stemmer_training)
word_test = search_word(word_with_stemmer_test)

In [94]:
freq_training = get_freq(word_training[0])
freq_test = get_freq(word_test[0])

In [95]:
# função para retornar somente palavras únicas
def search_word_unique(freq):
    freq = freq.keys()
    return freq 

word_unique_training = search_word_unique(freq_training)
word_unique_test = search_word_unique(freq_test)

In [96]:
# função para identificar quais palavras únicas estão no documento
def extractor_word(document):
    doc = set(document)
    feature = {}
    for word in word_unique_training:
        feature['%s' % word] = (word in doc)
    return feature

# função para identificar quais palavras únicas estão no documento
def extractor_word_test(document):
    doc = set(document)
    feature = {}
    for word in word_unique_test:
        feature['%s' % word] = (word in doc)
    return feature

In [97]:
base_complete_training = nltk.classify.apply_features(extractor_word, word_with_stemmer_training)
base_complete_test = nltk.classify.apply_features(extractor_word_test, word_with_stemmer_test)

In [98]:
classifier = nltk.NaiveBayesClassifier.train(base_complete_training)

In [99]:
classifier.labels()

['negative', 'positive']

In [100]:
classifier.show_most_informative_features(10)

Most Informative Features
                  deslig = True           negati : positi =      3.3 : 1.0
                kutcher. = True           positi : negati =      2.8 : 1.0
                costner. = True           negati : positi =      2.6 : 1.0
                  nenhum = True           negati : positi =      2.0 : 1.0
                obstácul = True           positi : negati =      1.8 : 1.0
                  armári = True           negati : positi =      1.8 : 1.0
              importava. = True           negati : positi =      1.8 : 1.0
                   mágic = True           positi : negati =      1.8 : 1.0
                    terr = True           negati : positi =      1.8 : 1.0
                    venc = True           positi : negati =      1.8 : 1.0


In [101]:
nltk.classify.accuracy(classifier, base_complete_test)

0.4195056611403154

In [102]:
errors = []
for (frase, classe) in base_complete_test:
    result = classifier.classify(frase)
    if result != classe:
        errors.append((classe, result, frase))

In [103]:
from nltk.metrics import ConfusionMatrix
hope = []
predict = []
for (frase, classe) in base_complete_test:
    result = classifier.classify(frase)
    predict.append(result)
    hope.append(classe)

matrix = ConfusionMatrix(hope, predict)
print(matrix)

         |     n     p |
         |     e     o |
         |     g     s |
         |     a     i |
         |     t     t |
         |     i     i |
         |     v     v |
         |     e     e |
---------+-------------+
negative |<13991>  883 |
positive | 22086 <2608>|
---------+-------------+
(row = reference; col = test)



In [104]:
text = "nossa, que notícia maravilhosa!"
test_stemming = []
stemmer = nltk.RSLPStemmer()
for word in text.split():
    with_stem = [w for w in word.split()]
    test_stemming.append(str(stemmer.stem(with_stem[0])))

novo = extractor_word(test_stemming)

dist = classifier.prob_classify(novo)
for classe in dist.samples():
    print('%s: %f' % (classe, dist.prob(classe)))

negative: 0.318475
positive: 0.681525


In [105]:
text = "Pqp, que trânsito chato da porra!"
test_stemming = []
stemmer = nltk.RSLPStemmer()
for word in text.split():
    with_stem = [w for w in word.split()]
    test_stemming.append(str(stemmer.stem(with_stem[0])))

novo = extractor_word(test_stemming)

dist = classifier.prob_classify(novo)
for classe in dist.samples():
    print('%s: %f' % (classe, dist.prob(classe)))

negative: 0.318475
positive: 0.681525
