## Analisis de Sentimientos

#### Por: Mauricio Santiago Valdovinos Morales

Clasificación de textos de opinión en español utilizando una red neuronal llamada Word2Vec.

In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities
from os import path,listdir
import pickle
import time

reviews_doc = "data/reviews_f.dat"
reviews_doc_stop = "data/reviews_with_stopwords.dat"

with open("data/stop_words_spanish.dat", "rb") as f:
            stop_wordsE = pickle.load(f)



In [4]:
def save_data(file,dic):
    with open(file, "wb") as f:
        pickle.dump(dic, f)

def load_data(file):
    with open(file, "rb") as f:
            return pickle.load(f)
        
def remove_stop_words(corpus):
    results = []
    for text in corpus:
        sentence = []
        for p in text:
            if p not in stop_wordsE:
                sentence.append(p)
        results.append(sentence)
    
    return results

def parser_review(path):
    file = open(path).read()
    dic = {'review':"",'review_clean':[],'sentiment': ""}
    
    #Rank
    div = file.split("rank=\"")
    rank = div[1][0]

    #Review
    div = file.split("<body>")
    div = div[1].split("</body>")
    dic['review'] = div[0]
    
    #Sentiment
    if rank == '1' or rank == '2':
        dic['sentiment'] = 'negative'
    elif rank == '3':
        dic['sentiment'] = 'neutral'
    else:
        dic['sentiment'] = 'positive'
    
    return dic

def list_reviews_features(dic):
    list_corp = []
    list_ranks = []
    list_corp_clean = []
    for r in range(len(dic)):
        list_corp.append(dic[r]['review'])
        list_ranks.append(dic[r]['sentiment'])
        list_corp_clean.append(dic[r]['review_clean'])
    return list_corp, list_ranks, list_corp_clean

def w2v_input(revs):
    w2v_int = []
    for r in revs:
        w2v_int.append(r['review_clean'])
    return w2v_int
    
def create_sentiment_dic(vocabulary):
    sentiments = {}
    for word in vocabulary:
        sentiments[word] = []
    return sentiments

def def_sentiments_words(vocabulary,reviews):
    dic = create_sentiment_dic(vocabulary)
    for doc in reviews:
        word_sentiment = []
        for rc in doc['review_clean']:
            for sentence in rc:
                for word in vocabulary:
                    if word in sentence:
                        if len(dic[word]) < 4 and doc['sentiment'] not in dic[word]:
                            dic[word].append(doc['sentiment'])
    return dic

## Generando archivo con diccionarios de la reviews

El diccionario tiene los siguientes campos:
 - Rank: Numero de estrellas de la review 1-5
 - Title: Titulo de la review
 - Summaty: pequeño resumen
 - Review: Critica
 - review_clean: review tokenizada y filtrada lista para el modelo Word2Vec

In [6]:
def generate_reviews_doc():
    docs = []
    for f in sorted(listdir('Corpora')):
        if f[len(f)-3:] == "xml":
            f = path.join('Corpora',f)
            docs.append(parser_review(f))
    for review in docs:
        review['review_clean'] = word_tokenize(review['review'])
        for i in range(len(review['review_clean'])):
            review['review_clean'][i]  = review['review_clean'][i].lower()
        review['review_clean'] = [word for word in review['review_clean'] if word not in stop_wordsE]
    save_data(reviews_doc,docs)

In [7]:
generate_reviews_doc()

In [8]:
reviews = load_data(reviews_doc)
len(reviews)

3878

## Definiendo lista de entrenamiento y prueba

In [9]:
point = int(len(reviews) * (4/5))
train = reviews[:point]
test = reviews[point:]
print("Entrenamiento: ",len(train),"Prueba: ",len(test))

Entrenamiento:  3102 Prueba:  776


In [10]:
#Uniendo todas las oraciones en un vector para la entrada al modelo w2v
w2v_in_train = w2v_input(train)
w2v_in_test = w2v_input(test)

## Entrenando el modelo word2vec

In [72]:
w2v_model = Word2Vec(w2v_in_train+w2v_in_test, size=200, min_count=5, window=5, workers=4, sg=1)
w2v_model.train(w2v_in_train,total_examples=len(w2v_in_train),epochs=100)

(66488151, 75837800)

Guardando el modelo Word2Vec

In [52]:
w2v_model.save('data/model_2.bin')

Cargando el modelo guardado

In [75]:
w2v_model = Word2Vec.load('data/model_2.bin')

Definiendo vocabulario entregado por w2v

In [76]:
vocabulary = list(w2v_model.wv.vocab)
print(len(vocabulary))

20386


In [77]:
vocabulary[15]

'encuentran'

In [78]:
print(w2v_model.wv.distance('película','cine'))

0.6535177230834961


  if np.issubdtype(vec.dtype, np.int):


#### Diccionario de las palabras

Diccionario con el indice de la palabra en el vocabulario

In [15]:
def vocab_dictionary(vocab):
    dic_words = {}
    i = 0
    for w in vocab:
        dic_words[w] = i
        i+=1
    return dic_words

In [16]:
dic_words = vocab_dictionary(vocabulary)
print(len(dic_words))

20386


#### Sentimientos a nivel de palabras

Diccionario con la palabra y una lista de los sentimientos que pueden ser positive, negative, neutral <br>

'word': ['positive', 'negative', 'neutral']

In [None]:
sentiments = def_sentiments_words(vocabulary,reviews)
print(len(sentiments))

Guardando el diccionario de sentimientos de las palabras

In [18]:
save_data('data/sentiments_dic.dat',sentiments)

Cargando el diccionario de sentimientos de las palabras

In [57]:
sentiments = load_data('data/sentiments_dic.dat')
print(len(sentiments))

20386


In [None]:
word = vocabulary[dic_words['peliculas']]
print(word)
print(w2v_model[word])

## Probando modelo

In [83]:
def test_w2v(test_list,w2v,vocab,sentiments_list):
    prediction_count = 0
    for t in test_list:
        sentiment = word_prediction(w2v,t['review_clean'],vocab,sentiments_list)
        #print("Sentiment prediction: ",sentiment," real sentiment: ",t['sentiment'])
        if sentiment == t['sentiment']:
            prediction_count+=1
    accuracy = prediction_count/len(test)*100
    print("Accuracy = ",accuracy,"% of ",len(test)," reviews")
    
def test_w2v_D(test_list,train_list,w2v,vocab,sentiments_list):
    prediction_count = 0
    for t in test_list:
        sentiment = KNN_documents(t,train_list,w2v)
        #print("Sentiment prediction: ",sentiment," real sentiment: ",t['sentiment'])
        if sentiment == t['sentiment']:
            prediction_count+=1
    accuracy = prediction_count/len(test)*100
    print("Accuracy = ",accuracy,"% of ",len(test)," reviews")

def prediction(critic,w2v,vocab,sentiments_list):
    sentiment = word_prediction(w2v,critic,vocab,sentiments_list)
    print(sentiment)

def word_prediction(w2v,sentence,vocab,sentiments_list):
    words_sentiments = []
    for word in sentence:
        if word in vocab:
            KNN_words(w2v,word,sentiments_list,words_sentiments)
    #print(words_sentiments)      
    return classify_words(words_sentiments)

def KNN_words(w2v,word,sentiments_list,words_sentiments):
    K = 5
    neighbors = w2v.wv.most_similar(word)
    for neighbor in neighbors[:K]:
        words_sentiments.append(sentiments_list[neighbor[0]])

def KNN_documents(rev,corpus,w2v):
    k = 10
    distances = []
    i=0
    for c in corpus:
        d = []
        d.append(c['sentiment'])
        d.append(w2v.wmdistance(rev,c['review_clean']))
        distances.append(d)
        print(i,end="\r")
        i+=1
    distances.sort(key=lambda x:x[1])
    neighbors_sentiments = [s[0] for s in distances[:k]]
    return classify_sentence(neighbors_sentiments)

    
def classify_words(sentiments_prediction):
    pos = 0
    neg = 0
    cont = 0
    for s in sentiments_prediction:
        if 'positive' in s:
            pos+=1
        if 'negative' in s:
            neg+=1
        cont+= len(s)
    neu = cont - (pos+neg)
    if pos > neg and pos > neu or pos == neu:
        return 'positive'
    if neg > pos and neg > neu or neg == neu:
        return 'negative'
    return 'neutral'

def classify_sentence(sentiments_prediction):
    pos = 0
    neg = 0
    cont = 0
    for s in sentiments_prediction:
        if 'positive' in s:
            pos+=1
        if 'negative' in s:
            neg+=1
    neu = len(sentiments_prediction) - (pos+neg)
    #print("Pos: ",pos, "Neg:", neg, "Neu: ",neu)
    if pos > neg and pos > neu or pos == neu:
        return 'positive'
    if neg > pos and neg > neu or neg == neu:
        return 'negative'
    return 'neutral'

In [81]:
start = time.time()
test_w2v(test,w2v_model,vocabulary,sentiments)
end = time.time()
print("time: %.4fs"%(end-start))

  if np.issubdtype(vec.dtype, np.int):


Accuracy =  48.840206185567006 % of  776  reviews
time: 445.1612s


In [84]:
start = time.time()
test_w2v_D(test,train,w2v_model,vocabulary,sentiments)
end = time.time()
print("time: %.4fs"%(end-start))



Accuracy =  37.75773195876289 % of  776  reviews
time: 754.7200s


In [49]:
critic = "Un spaguetti muy menor, mal rodado y peor interpretado. Dejando que un personaje mitico dentro de este subgenero como es \\\"Sabata\\\" sea visto de una manera tan espantosa."

In [51]:
def prepare_critic(critic):
    critic = word_tokenize(critic)
    critic = [word.lower() for word in critic if word not in stop_wordsE]
    return critic
    

In [52]:
critic = prepare_critic(critic)
print(critic)

['un', 'spaguetti', 'menor', 'mal', 'rodado', 'peor', 'interpretado', 'dejando', 'personaje', 'mitico', 'dentro', 'subgenero', '\\', 'sabata\\', 'visto', 'manera', 'tan', 'espantosa']


In [55]:
prediction(critic,w2v_model,vocabulary,sentiments)

neutral


  if np.issubdtype(vec.dtype, np.int):
