In [1]:
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score
from tabulate import tabulate
import pandas as pd
import numpy as np
import gensim
import nltk
import re
import os

In [2]:

# English stop-word list used to filter review tokens.
# The corpus is resolved through `nltk.corpus` rather than through the imported
# name `stopwords`, because this statement rebinds `stopwords` to a plain list:
# referring to itself would make the cell fail with AttributeError on a re-run.
stopwords = nltk.corpus.stopwords.words('english')
# Tokenizer that keeps runs of word characters and drops everything else.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [11]:
class MeanEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of its word embeddings.

    Parameters
    ----------
    word2vec : mapping of token -> 1-D vector
        Lookup table of word embeddings. All vectors are expected to share
        the same length.

    Attributes
    ----------
    dim : int
        Length of a single embedding vector (0 when ``word2vec`` is empty).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # The dimensionality is the length of one embedding vector, not
            # the number of entries in the vocabulary.
            first_word = next(iter(word2vec))
            self.dim = len(word2vec[first_word])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op: nothing is learned. Returns ``self`` for Pipeline chaining."""
        return self

    def transform(self, X):
        """Return one averaged embedding per document in ``X``.

        ``X`` is an iterable of token sequences. Tokens missing from
        ``word2vec`` are ignored; a document with no known token maps to a
        zero vector of length ``dim`` so every row has the same shape.
        """
        fallback = [np.zeros(self.dim)]
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or fallback, axis=0)
            for words in X
        ])

class TfidfEmbeddingVectorizer(object):
    """Represent a tokenized document by the mean of IDF-weighted embeddings.

    Parameters
    ----------
    word2vec : mapping of token -> 1-D vector
        Lookup table of word embeddings. All vectors are expected to share
        the same length.

    Attributes
    ----------
    word2weight : mapping of token -> float, or None before ``fit``
        IDF weight per token; unseen tokens default to the largest known IDF.
    dim : int
        Length of a single embedding vector (0 when ``word2vec`` is empty).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec) > 0:
            # The dimensionality is the length of one embedding vector, not
            # the number of entries in the vocabulary.
            first_word = next(iter(word2vec))
            self.dim = len(word2vec[first_word])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn per-token IDF weights from the tokenized documents ``X``.

        Returns ``self`` so the object can be used as a Pipeline step.
        """
        # Documents are already token lists, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one IDF-weighted averaged embedding per document in ``X``.

        Tokens missing from ``word2vec`` are ignored; a document with no
        known token maps to a zero vector of length ``dim`` so every row has
        the same shape.
        """
        fallback = [np.zeros(self.dim)]
        return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec]
                    or fallback, axis=0)
            for words in X
        ])

In [12]:
def loadDataset_Review(file,clas):
    """Turn a DataFrame of raw reviews into token lists (and labels).

    Parameters
    ----------
    file : pandas.DataFrame
        Must have a ``review`` column; a ``sentiment`` column is also read
        when ``clas`` is truthy.
    clas : bool
        When truthy, collect the ``sentiment`` value of every row as label.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` is a 1-D object array with one list of tokens per review.
        ``y`` holds the labels, or is empty when ``clas`` is falsy.
    """
    # Set membership is O(1); the module-level stop-word collection is
    # consulted once per token.
    stop_set = set(stopwords)
    X, y = [], []
    for _, line in file.iterrows():
        # 1. Strip HTML markup.
        review_text = BeautifulSoup(line["review"], "html.parser").get_text()
        # 2. Replace every non-letter character with a space and lower-case.
        review_text = re.sub("[^a-zA-Z]", " ", review_text).lower()
        # 3. Tokenize and drop stop words.
        tokens = [w for w in tokenizer.tokenize(review_text) if w not in stop_set]
        if clas:
            y.append(line['sentiment'])
        X.append(tokens)

    # Token lists differ in length. Recent NumPy releases refuse to build an
    # array from ragged nested sequences implicitly, so fill a 1-D object
    # array element by element (also keeps the shape 1-D when all lengths
    # happen to match).
    docs = np.empty(len(X), dtype=object)
    for i, tokens in enumerate(X):
        docs[i] = tokens

    return docs, np.array(y)

In [5]:

# Read the labelled and unlabelled review files (tab-separated; quoting=3 is
# csv.QUOTE_NONE, so quote characters are treated as ordinary text).
train = pd.read_csv('labeledTrainData.tsv',delimiter="\t", quoting=3)
train_unl = pd.read_csv('unlabeledTrainData.tsv',delimiter="\t", quoting=3)

# Tokenize the full labelled set; these tokens feed the Word2Vec corpus.
X_w2v, y_w2v = loadDataset_Review(train,True)

# Tokenize the unlabelled set (no labels requested, so y_w2v_unl is empty).
X_w2v_unl, y_w2v_unl = loadDataset_Review(train_unl,False)

# Word2Vec corpus = labelled + unlabelled reviews.
# NOTE(review): this is built before the split below, so it also contains the
# text of rows that end up in the test split (labels are not involved).
X_w2v = np.concatenate((X_w2v, X_w2v_unl), axis=0)

# 70/30 split of the labelled data; `train` is rebound to the 70% part here.
train, test = train_test_split(train, test_size=0.3, train_size=0.7, random_state=42)

# Tokens and labels used to fit the classifiers.
X,y = loadDataset_Review(train, True)

# Tokens and labels used to evaluate the classifiers.
X_test,y_test = loadDataset_Review(test, True)


In [7]:
# Word2Vec hyper-parameters
num_features = 300  # Word vector dimensionality
min_word_count = 40  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
context = 10  # Context window size
downsampling = 1e-3  # Downsample setting for frequent words
model_name = "300features_40minwords_10context"
# Reuse a previously saved model when it can be loaded; otherwise train one on
# the tokenized corpus and save it under `model_name`.
try:
    model = gensim.models.Word2Vec.load(model_name)
except Exception:
    # `Exception` rather than a bare `except`: a failed load still falls back
    # to training, but KeyboardInterrupt / SystemExit are no longer swallowed
    # (interrupting the load used to silently start a full retrain).
    model = gensim.models.Word2Vec(X_w2v, workers=num_workers,
                size=num_features, min_count=min_word_count,
                window=context, sample=downsampling, seed=1)
    model.save(model_name)
    print("Gerou o modelo")
# Plain dict token -> embedding vector, consumed by the embedding vectorizers.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [13]:

# Every pipeline receives documents that are already lists of tokens, so the
# bag-of-words vectorizers use an identity analyzer.

# --- Naive Bayes on raw counts ---
mult_nb = Pipeline(
    [("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])

# --- Naive Bayes on TF-IDF features ---
mult_nb_tfidf = Pipeline(
    [("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])

bern_nb_tfidf = Pipeline(
    [("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])

# --- Linear SVM on counts / TF-IDF ---
svc = Pipeline(
    [("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("linear svc", LinearSVC(max_iter=10**4))])

svc_tfidf = Pipeline(
    [("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("linear svc", LinearSVC(max_iter=10**4))])

# --- Tree ensembles on averaged word embeddings ---
# random_state is fixed (same value as the train/test split) so repeated runs
# of the notebook give the same forests and therefore the same scores.
etree_w2v = Pipeline(
    [("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
     ("extra trees", ExtraTreesClassifier(n_estimators=200, random_state=42))])

etree_w2v_tfidf = Pipeline(
    [("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
     ("extra trees", ExtraTreesClassifier(n_estimators=200, random_state=42))])

# The classifier step is named after the estimator it actually holds.
random_w2v_tfidf = Pipeline(
    [("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
     ("random forest", RandomForestClassifier(n_estimators=200, random_state=42))])
random_w2v = Pipeline(
    [("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
     ("random forest", RandomForestClassifier(n_estimators=200, random_state=42))])

In [14]:
# (display name, pipeline) pairs, in the order they are trained and evaluated.
all_models = [
    # Naive Bayes
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    # Linear SVM
    ("Linearsvc", svc),
    ("Linearsvc_tfidf", svc_tfidf),
    # Extra-trees on word embeddings
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf),
    # Random forest on word embeddings
    ("random_w2v", random_w2v),
    ("random_w2v_tfidf", random_w2v_tfidf),
]

In [10]:

# Fit every pipeline on the training split and score it on the test split.
# Each entry is (name, accuracy, F1, precision, recall).
unsorted_scores = []
# The loop variable is `clf`, not `model`: `model` is the Word2Vec model built
# in the embedding cell, and rebinding it here would leave a classifier
# pipeline under that name for any cell run afterwards.
for name, clf in all_models:
    print("Training with ", name)
    predict = clf.fit(X,y).predict(X_test)
    unsorted_scores.append((
        name,
        accuracy_score(y_test,predict),
        f1_score(y_test,predict),
        precision_score(y_test,predict),
        recall_score(y_test,predict),
    ))


Training with  mult_nb
Training with  mult_nb_tfidf
Training with  bern_nb
Training with  bern_nb_tfidf
Training with  Linearsvc
Training with  Linearsvc_tfidf
Training with  w2v
Passou
Passou
Training with  w2v_tfidf
Training with  random_w2v
Passou
Passou
Training with  random_w2v_tfidf


In [15]:
# Rank the models by accuracy, best first. The sort is stable, so models with
# equal accuracy keep the order in which they were evaluated.
scores = sorted(unsorted_scores, key=lambda row: row[1], reverse=True)

# Render the ranking as a plain-text table with four decimals per metric.
column_titles = ("model", 'Accuracy','F1','Precision','Recall')
print(tabulate(scores, floatfmt=".4f", headers=column_titles))

model               Accuracy      F1    Precision    Recall
----------------  ----------  ------  -----------  --------
Linearsvc_tfidf       0.8912  0.8923       0.8860    0.8987
mult_nb_tfidf         0.8677  0.8671       0.8739    0.8604
Linearsvc             0.8641  0.8649       0.8627    0.8671
mult_nb               0.8627  0.8610       0.8745    0.8480
random_w2v            0.8567  0.8606       0.8404    0.8817
random_w2v_tfidf      0.8544  0.8582       0.8390    0.8783
w2v                   0.8539  0.8574       0.8397    0.8759
w2v_tfidf             0.8524  0.8562       0.8374    0.8759
bern_nb               0.8485  0.8426       0.8799    0.8083
bern_nb_tfidf         0.8485  0.8426       0.8799    0.8083
