In [51]:
import os
import csv
import math
import numpy as np
import pandas as pd
from collections import Counter
import gensim
import nltk
import functools as ft
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score,precision_recall_fscore_support as score
# Base directory used later to locate the data files. '__file__' is a quoted string
# literal here (not the module attribute), so abspath() resolves it against the
# current working directory and dirname() yields that working directory.
script_path = os.path.dirname(os.path.abspath('__file__'))
# Lower-case English function words; tokens in this set are dropped when a
# stop-list is requested.
stopwords = set(
    ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
     'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
     'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
     'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
     'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
     'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
     'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
     'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
     'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
# Alias for the same set object; Sentence reads STOP when building tokens_without_stop.
STOP=stopwords
# File names of the pre-trained embedding files (relative paths, so they are
# resolved against the working directory when opened).
PATH_TO_WORD2VEC ="GoogleNews-vectors-negative300.bin"
PATH_TO_GLOVE = "glove.840B.300d.txt"

In [52]:
# Load the pre-trained word2vec vectors from the binary file at PATH_TO_WORD2VEC.
print("Loading word2vec model")
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)
# NOTE(review): `.vocab` is believed to exist only in gensim 3.x (gensim 4 exposes
# `key_to_index` instead) — confirm the installed gensim version before upgrading.
print("Done.",len(word2vec.vocab)," words loaded!")

Loading word2vec model
Done. 3000000  words loaded!


In [53]:
# Load the GloVe vectors into a plain dict mapping each word to its vector.
print("Loading glove model")
# The path comes from the PATH_TO_GLOVE configuration constant rather than a second
# hard-coded copy of the file name. quoting=3 is csv.QUOTE_NONE, so quote characters
# in the file are read as ordinary text; the first column becomes the index.
df = pd.read_csv(PATH_TO_GLOVE, sep=" ", quoting=3, header=None, index_col=0)
# Pair every index entry with its row of the value matrix. This avoids transposing
# the whole frame and building one Series per word; as before, a later duplicate
# index entry overwrites an earlier one.
glove = dict(zip(df.index, df.to_numpy()))
print("Done.",len(glove)," words loaded!")

Loading glove model
Done. 1176051  words loaded!


In [54]:
# Tab-separated files read with read_tsv below: first column is the key, second an
# integer count. Relative paths, resolved against the working directory.
PATH_TO_FREQUENCIES_FILE = "frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "doc_frequencies.tsv"
## Container pairing a raw sentence with its tokenized forms
class Sentence:
    """Keeps the original text plus its lower-cased tokens, with and without stopwords.

    Attributes set by the constructor:
      raw                 -- the sentence exactly as passed in
      tokens              -- lower-cased nltk word tokens
      tokens_without_stop -- `tokens` minus every token found in STOP
    """

    def __init__(self, sentence):
        self.raw = sentence

        # Map curly single quotes to ASCII apostrophes before tokenizing.
        text = sentence
        for curly_quote in ("‘", "’"):
            text = text.replace(curly_quote, "'")

        lowered = []
        content_words = []
        for word in nltk.word_tokenize(text):
            word = word.lower()
            lowered.append(word)
            if word not in STOP:
                content_words.append(word)

        self.tokens = lowered
        self.tokens_without_stop = content_words
        
## tsv file reader
def read_tsv(f):
    """Read a two-column tab-separated file into a dict.

    Args:
        f: path of the file. Each non-empty row is `key<TAB>integer`; any further
           columns are ignored.

    Returns:
        dict mapping the first column (str) to the second column parsed as int.
        If a key occurs more than once, the last row wins.

    Raises:
        IndexError: a non-empty row has fewer than two columns.
        ValueError: the second column is not an integer.
    """
    frequencies = {}
    # newline="" is what the csv module asks for so it can handle line endings
    # itself; the explicit encoding keeps non-ASCII keys independent of the locale.
    with open(f, encoding="utf-8", newline="") as tsv:
        for row in csv.reader(tsv, delimiter="\t"):
            # csv.reader yields [] for blank lines (e.g. a trailing newline at the
            # end of the file); skip them instead of failing on row[0].
            if not row:
                continue
            frequencies[row[0]] = int(row[1])

    return frequencies

## To compute weighted averages of word embeddings later, load the word-frequency
## tables. These word frequencies have been collected from Wikipedia and saved
## in tab-separated files.

# Both tables are plain dicts of word -> integer count, as returned by read_tsv.
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
# The corpus size is stored in the same dict under the key "NUM_DOCS";
# avg_embedding_sim reads it back as N in its log(N / (df + 1)) weight.
# NOTE(review): 1288431 is presumably the number of documents the counts were
# collected from — confirm against the source of doc_frequencies.tsv.
doc_frequencies["NUM_DOCS"] = 1288431       

In [55]:
## Simplest way of computing sentence embeddings: take the embeddings of the words
## in the sentence (optionally minus the stopwords) and average them. When document
## frequencies are supplied, each distinct word is weighted by tf * idf, i.e. its count
## in the sentence times log(N / (df + 1)). We then use the cosine similarity to
## calculate the similarity between two sentence embeddings.
def _known_tokens(sentence, model, use_stoplist):
    """Return the sentence's tokens (stop-filtered if requested) that `model` contains."""
    tokens = sentence.tokens_without_stop if use_stoplist else sentence.tokens
    return [token for token in tokens if token in model]


def _averaged_embedding(tokens, model, doc_freqs, num_docs):
    """Return the (1, dim) average of the embeddings of the distinct `tokens`."""
    counts = Counter(tokens)
    vectors = [model[token] for token in counts]

    if doc_freqs is None:
        return np.average(vectors, axis=0).reshape(1, -1)

    # tf * idf per distinct token; words missing from doc_freqs count as df = 0.
    weights = [counts[token] * math.log(num_docs / (doc_freqs.get(token, 0) + 1))
               for token in counts]
    try:
        averaged = np.average(vectors, axis=0, weights=weights)
    except ZeroDivisionError:
        # np.average refuses weights that sum to zero (e.g. a single token whose
        # df + 1 equals N). Fall back to the plain mean instead of crashing.
        averaged = np.average(vectors, axis=0)
    return averaged.reshape(1, -1)


def avg_embedding_sim(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None):
    """Cosine similarity of averaged word embeddings for paired sentences.

    Args:
        sentences1, sentences2: equally ordered iterables of objects exposing
            `tokens` and `tokens_without_stop`; they are paired with zip().
        model: word -> vector lookup supporting `in` and `[]`. Effectively
            required: the None default fails as soon as a token is looked up.
        use_stoplist: use `tokens_without_stop` instead of `tokens`.
        doc_freqs: optional dict of document frequencies that must also hold the
            corpus size under "NUM_DOCS". When None the average is unweighted.

    Returns:
        list with one similarity per sentence pair. A pair in which either
        sentence has no token known to `model` scores 0.

    Raises:
        KeyError: `doc_freqs` is given but has no "NUM_DOCS" entry.
    """
    num_docs = doc_freqs["NUM_DOCS"] if doc_freqs is not None else None

    sims = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = _known_tokens(sent1, model, use_stoplist)
        tokens2 = _known_tokens(sent2, model, use_stoplist)

        # Nothing to embed on at least one side: score the pair as dissimilar.
        if not tokens1 or not tokens2:
            sims.append(0)
            continue

        embedding1 = _averaged_embedding(tokens1, model, doc_freqs, num_docs)
        embedding2 = _averaged_embedding(tokens2, model, doc_freqs, num_docs)

        sims.append(cosine_similarity(embedding1, embedding2)[0][0])

    return sims


In [56]:
### Unit testing...
# Smoke test: score one sentence pair with each embedding-based measure and print
# the similarity. Both measures use the stop-list and the document-frequency weights;
# they differ only in the embedding model (word2vec vs. GloVe).
methods = [("AVG-W2V-TFIDF-STOP", ft.partial(avg_embedding_sim, model=word2vec, use_stoplist=True, doc_freqs=doc_frequencies)),
              ("AVG-GLOVE-TFIDF-STOP", ft.partial(avg_embedding_sim, model=glove, use_stoplist=True, doc_freqs=doc_frequencies))]

s1="He was right, of course, but his harsh words were like salt on a raw wound."
s2="In truth, the raw information funneled to us was transmitted as received after passing through our office."
# avg_embedding_sim works on parallel lists of Sentence objects, hence the
# one-element lists and the [0] when reading the result.
sentences1 = [Sentence(s1)]
sentences2 = [Sentence(s2)]
for _,method in methods:
    sim = method(sentences1, sentences2)[0]
    print(sim)

0.40206002214242403
0.8585334221718391


In [57]:
def jaccard(t1, t2, stopwords=()):
    """Jaccard similarity of the whitespace-separated word sets of two strings.

    Args:
        t1, t2: strings; they are split on whitespace and compared case-sensitively.
        stopwords: container of words to ignore (any object supporting `in`).

    Returns:
        |A & B| / |A | B| as a float, or 0.0 when both word sets are empty
        (previously that case raised ZeroDivisionError).
    """
    words1 = {w for w in t1.split() if w not in stopwords}
    words2 = {w for w in t2.split() if w not in stopwords}
    union_size = len(words1 | words2)
    if union_size == 0:
        # Empty strings, or strings made only of stopwords: no evidence of overlap.
        return 0.0
    return len(words1 & words2) / union_size


def dice(t1, t2, stopwords=()):
    """Dice coefficient of the whitespace-separated word sets of two strings.

    Args:
        t1, t2: strings; they are split on whitespace and compared case-sensitively.
        stopwords: container of words to ignore (any object supporting `in`).

    Returns:
        2 * |A & B| / (|A| + |B|) as a float, or 0.0 when both word sets are empty
        (previously that case raised ZeroDivisionError).
    """
    words1 = {w for w in t1.split() if w not in stopwords}
    words2 = {w for w in t2.split() if w not in stopwords}
    total_size = len(words1) + len(words2)
    if total_size == 0:
        # Empty strings, or strings made only of stopwords: no evidence of overlap.
        return 0.0
    return 2. * len(words1 & words2) / total_size

def fd(counts):
    '''Count how often each item occurs, e.g. [1,1,1,2] -> {1:3, 2:1}.

    Returns a plain dict whose keys appear in order of first occurrence.
    '''
    return dict(Counter(counts))


def freq_rank(d):
    '''Given a map, return its keys ranked by their values, highest value first.

    Keys with equal values keep the order in which they appear in the map.
    '''
    ordered_pairs = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    return [key for key, _ in ordered_pairs]


def fd2(counts):
    '''Build nested frequencies from 2-tuples.

    Given [(a,pos), (a,pos), (a,neg), ...] return {a:{pos:2, neg:1}, ...}: the outer
    key is the first element of each tuple, the inner key the second element.
    '''
    nested = {}
    for pair in counts:
        outer_key, inner_key = pair[0], pair[1]
        # Create the inner dict on first sight of the outer key, then bump the count.
        inner = nested.setdefault(outer_key, {})
        inner[inner_key] = inner.get(inner_key, 0) + 1
    return nested

def replacewith(input_str, pattern, replaceWith): 
    """Return `input_str` with every occurrence of `pattern` replaced by `replaceWith`.

    Thin wrapper around str.replace: `pattern` is a literal substring, not a regex.
    """
    return input_str.replace(pattern, replaceWith) 

In [58]:
def extract_data(data_file,polarity_map,max_len=80):
    """Read a three-lines-per-record data file into a list of dicts.

    Each record is three consecutive lines: the sentence, the aspect term and the
    polarity label.

    Args:
        data_file: path of the text file.
        polarity_map: dict translating the label line (e.g. "1") to a polarity name.
        max_len: despite its name this is a *minimum* length: only records whose
            sentence has at least `max_len` characters are kept. Pass 0 to keep all.

    Returns:
        list of {"sentence", "aspect_term", "polarity"} dicts in file order.

    Raises:
        KeyError: a kept record has a label that is not in `polarity_map`.
    """
    # The context manager closes the file, which the bare open().readlines() did not.
    with open(data_file,'r') as handle:
        lines = [line.rstrip("\n") for line in handle]

    corpus=[]
    # Stopping at len - 2 ignores a trailing partial record (for instance a blank
    # line at the end of the file) instead of raising IndexError on it.
    for start in range(0,len(lines)-2,3):
        sentence, aspect_term, label = lines[start:start+3]
        if len(sentence)>=max_len:
            corpus.append({"sentence": sentence,
                           "aspect_term": aspect_term,
                           "polarity": polarity_map[label]})
    return corpus 


In [59]:
# Translate the label strings found in the data files into polarity names.
polarity_map={"1":"positive","-1":"negative","0":"neutral"}
# The data files are looked up under script_path.
train_data=script_path+"/rest_2014_train.txt"
test_data=script_path+"/rest_2014_test.txt"
# max_len=0 switches off the length filter in extract_data: it keeps sentences whose
# length is >= max_len, which every sentence satisfies for 0.
train= extract_data(train_data,polarity_map,max_len=0)
test = extract_data(test_data,polarity_map,max_len=0)
print("Number of training sentences: ",len(train))
print("Number of testing sentences: ",len(test))

Number of training sentences:  3699
Number of testing sentences:  1134


In [60]:
# Aspect extraction by dictionary lookup: an aspect is a candidate for a test
# sentence when a training aspect term occurs in it as a substring.
correct=[i["aspect_term"].lower() for i in test]
predicted=[]
# Lower-cased training aspect terms; duplicates are kept.
train_aspect=[i["aspect_term"].lower() for i in train]
for te_sent in test:
    temp=[]
    # Put the aspect term back in place of the "$T$" placeholder.
    replaced_test_sent = replacewith(te_sent["sentence"],"$T$",te_sent["aspect_term"])
    # NOTE(review): the aspects are lower-cased but the sentence is not, so an aspect
    # written with capitals in the test sentence is not matched — confirm intended.
    for aspect in train_aspect:
        if aspect in replaced_test_sent:
            temp.append(aspect)
    if not temp:
        temp.append("$?$") ##takes care of unknown aspects which are not present in train aspect list
    predicted.append(temp)
# Collapse each candidate list to one label: the gold aspect if it is among the
# candidates.
# NOTE(review): this step reads the gold label (correct[x]), so the accuracy below
# measures how often the gold aspect is among the matched training aspects, not the
# accuracy of an independent prediction.
for x in range(len(predicted)):
    for aspect in predicted[x]:
        if aspect==correct[x]:
            predicted[x]=aspect
            break

# Entries that are still lists contained no gold aspect; mark them as unknown.
for x in range(len(predicted)):
    if type(predicted[x])==list:
        predicted[x] = "$?$"

print("ASPECT EXTRACTION .....")
print("Accuracy={:.6f}".format(accuracy_score(correct, predicted)))

ASPECT EXTRACTION .....
Accuracy=0.642857


In [61]:
# Per-aspect polarity counts from the training set: {aspect_term: {polarity: count}}.
# The aspect terms are used as written (not lower-cased).
train_aspect_polarity_freq=fd2([(i["aspect_term"],i["polarity"]) for i in train])
# Overall most common polarity in the training set.
most_frequent_polarity = freq_rank(fd([i["polarity"] for i in train]))[0]
#print(most_frequent_polarity)
#print(train_aspect_polarity_freq)
# Bundle passed to polarity_determination, which reads index 0 and index 1.
params = [train_aspect_polarity_freq,most_frequent_polarity]

In [62]:
def retrieved_topk(tr_data,sentence,aspect,k,label,sim_measure):
    """Rank the polarities of the k training sentences most similar to `sentence`.

    Only training records whose aspect term equals `aspect` (case-insensitively)
    are considered.

    Args:
        tr_data: list of {"sentence", "aspect_term", "polarity"} dicts.
        sentence: query sentence with the aspect term already substituted in.
        aspect: aspect term of the query.
        k: number of nearest neighbours to keep.
        label: the string "None" selects the string-based call
            `sim_measure(text1, text2, stopwords)`; any other value selects the
            Sentence-based call `sim_measure([Sentence], [Sentence])[0]`.
        sim_measure: similarity callable matching the convention chosen by `label`.

    Returns:
        polarities of the top-k neighbours, most frequent first; an empty list when
        no training record shares the aspect.
    """
    target = aspect.lower()
    # Select the candidates once, with "$T$" replaced by each record's aspect term.
    candidates = [(index, replacewith(tr_sent["sentence"],"$T$",tr_sent["aspect_term"]))
                  for index,tr_sent in enumerate(tr_data)
                  if target==tr_sent["aspect_term"].lower()]

    if label=="None":
        neighbors = {index: sim_measure(sentence,text,stopwords)
                     for index,text in candidates}
    elif candidates:
        # Tokenize the query once rather than once per candidate.
        query = [Sentence(sentence)]
        neighbors = {index: sim_measure(query,[Sentence(text)])[0]
                     for index,text in candidates}
    else:
        neighbors = {}

    ranked = freq_rank(neighbors)
    topk = [tr_data[i] for i in ranked[:k]]
    return freq_rank(fd([i["polarity"] for i in topk]))

def polarity_determination(tr_data,params,sentence,aspect,label,sim_measure,k=5):
    """Predict the polarity of `aspect` in `sentence` by k-nearest-neighbour voting.

    Args:
        tr_data: training records searched for neighbours.
        params: sequence whose item 0 is the {aspect_term: {polarity: count}} table
            and whose item 1 is the fallback (most frequent) polarity.
        sentence: query sentence with the aspect term substituted in.
        aspect: aspect term of the query; looked up in the table as written.
        label, sim_measure: forwarded to retrieved_topk to select and apply the
            similarity measure.
        k: number of neighbours that vote.

    Returns:
        the most frequent polarity among the top-k neighbours, or the fallback
        polarity when the aspect is unknown or no neighbour is found.
    """
    # Use the statistics handed in by the caller. They were previously recomputed
    # from the global `train` on every call, ignoring both `params` and `tr_data`.
    train_aspect_polarity_freq, most_frequent_polarity = params[0], params[1]

    if aspect not in train_aspect_polarity_freq:
        return most_frequent_polarity

    polarities = retrieved_topk(tr_data,sentence,aspect,k,label,sim_measure)
    return polarities[0] if polarities else most_frequent_polarity

In [63]:
# Grid search over similarity measures and neighbourhood sizes; keeps the
# predictions of the most accurate (measure, k) combination on the test set.
correct_polarity=[i["polarity"] for i in test]
# Every k from 5 to 20. Generated with range() because the hand-typed list had 26
# in the position of 16.
ks=list(range(5,21))
max_acc=0
optimal_k=0
optimal_results=None
# (label, callable) pairs. The label "None" tells retrieved_topk to call the measure
# on raw strings; any other label makes it wrap the texts in Sentence objects.
sim_measures=[("None",jaccard),
              ("None",dice),
              ("AVG-W2V-TFIDF-STOP", ft.partial(avg_embedding_sim, model=word2vec, use_stoplist=True, doc_freqs=doc_frequencies)),
              ("AVG-GLOVE-TFIDF-STOP", ft.partial(avg_embedding_sim, model=glove, use_stoplist=True, doc_freqs=doc_frequencies))]
for label,sim_measure in sim_measures:    
    for k in ks:    
        predicted_polarity=[]
        for te_sent in test:
            sent = replacewith(te_sent["sentence"],"$T$",te_sent["aspect_term"])
            aspect = te_sent["aspect_term"]
            predicted_polarity.append(polarity_determination(train,params,sent,aspect,label,sim_measure,k))
        acc = accuracy_score(correct_polarity, predicted_polarity)
        # Strict '>' keeps the first combination that reaches the best accuracy.
        if acc>max_acc:
            max_acc=acc
            optimal_k=k
            optimal_results=predicted_polarity
predicted_polarity=optimal_results        

In [64]:
# Report per-class precision / recall / F-score and overall accuracy for the best
# predictions kept by the grid search (predicted_polarity, optimal_k).
print("ASPECT POLARITY DETECTION .....")
print("Optimal K: ",optimal_k)
# `score` is sklearn's precision_recall_fscore_support; with average=None the
# per-class arrays are read below in the order given by `labels`.
p, r, f, _ = score(correct_polarity,predicted_polarity,average=None,labels=["positive","negative","neutral"])
print("P_positive = {:.6f} ,P_negative = {:.6f} ,P_neutral = {:.6f}".format(p[0],p[1],p[2]))
print("R_positive = {:.6f} ,R_negative = {:.6f} ,R_neutral = {:.6f}".format(r[0],r[1],r[2]))
print("F_positive = {:.6f} ,F_negative = {:.6f} ,F_neutral = {:.6f}".format(f[0],f[1],f[2]))
#print("Acc_positive = {:.6f} ,Acc_negative = {:.6f} ,Acc_neutral = {:.6f}".format(acc[0],acc[1],acc[2]))
print("Accuracy={:.6f}".format(accuracy_score(correct_polarity, predicted_polarity)))

ASPECT POLARITY DETECTION .....
Optimal K:  12
P_positive = 0.696162 ,P_negative = 0.481818 ,P_neutral = 0.430233
R_positive = 0.896978 ,R_negative = 0.252381 ,R_neutral = 0.188776
F_positive = 0.783914 ,F_negative = 0.331250 ,F_neutral = 0.262411
Accuracy=0.655203
