In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2022-12-08 14:04:22--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-12-08 14:04:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-12-08 14:04:23--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import numpy as np
from typing import List
import math
import pandas as pd
import pickle

In [None]:
# word -> log2(count), keyed by lower-cased word. Rows of 'word-frequency-list.txt'
# that do not split into exactly two space-separated fields are ignored.
word_frequency = dict()
# Fallback value returned for words without an entry; keeps this default of 1.0
# when the file contains no usable rows.
avg_frequency = 1.0
with open('word-frequency-list.txt', 'rt') as reader:
    total = 0.0   # running sum of log2 values (kept separate so the 1.0 default is not averaged in)
    counter = 0   # number of usable rows (repeated words count every time they appear)
    for line in reader:
        line = line.strip().split(' ')
        if len(line) == 2:
            # math.log2 raises ValueError for counts <= 0
            value = math.log2(float(line[1]))
            total += value
            counter += 1
            word_frequency[line[0].lower()] = value
    # only compute the mean when there is data; avoids ZeroDivisionError on an empty file
    if counter > 0:
        avg_frequency = total / counter


# one token together with the embedding vector assigned to it
class Word:
    """Pairs a token's text with its embedding vector."""

    def __init__(self, text, vector):
        self.vector = vector
        self.text = text

    def __repr__(self):
        # repr shares the str rendering
        return str(self)

    def __str__(self):
        # rendered as "<text> : <vector>"
        return ' : '.join([self.text, str(self.vector)])


# an ordered list of words that together form one sentence
class Sentence:
    """Holds the words of a sentence in ``word_list``."""

    def __init__(self, word_list):
        self.word_list = word_list

    def __repr__(self):
        # repr shares the str rendering
        return str(self)

    def __str__(self):
        # the text of every word, in order, separated by single spaces
        return ' '.join(token.text for token in self.word_list)

    # number of words held by the sentence
    def len(self) -> int:
        words = self.word_list
        return len(words)


# frequency value for a word (per the original note, derived from Google's n-grams);
# words without an entry get the average instead
def get_word_frequency(word_text):
    """Return the ``word_frequency`` entry for ``word_text``, matched case-insensitively,
    or ``avg_frequency`` when the word has no entry."""
    key = word_text.lower()
    return word_frequency.get(key, avg_frequency)


# collapse each sentence's word vectors into one frequency-weighted average vector
def sentence_to_vec(sentence_list: List[Sentence], embedding_size: int):
    """Return one vector per sentence, in the same order as ``sentence_list``.

    Each word vector is scaled by ``delta / (delta + get_word_frequency(text))``
    (smooth inverse frequency, SIF), the scaled vectors are summed, and the sum is
    divided by the total weight unless that total is exactly 0. An empty
    ``sentence_list`` yields an empty list.
    """
    smoothing = 0.001  # small value to avoid division by 0
    vectors = []
    for sentence in sentence_list:
        weighted_sum = np.zeros(embedding_size)  # accumulates weight * word_vector
        weight_total = 0.0
        for word in sentence.word_list:
            # the more frequent a word is, the less it contributes
            weight = smoothing / (smoothing + get_word_frequency(word.text))
            weighted_sum = weighted_sum + weight * np.asarray(word.vector)
            weight_total = weight_total + weight

        if weight_total != 0.0:
            weighted_sum = weighted_sum / weight_total  # weighted average
        vectors.append(weighted_sum)

    return vectors

In [None]:
# normalised inner product (cosine similarity) of two vectors, rounded to 4 decimals
def inner_product(v1, v2):
    """Return the dot product of ``v1`` and ``v2`` divided by the product of their
    Euclidean norms, rounded to 4 decimal places.

    Returns 0.0 when the lengths differ or when either norm is 0.
    """
    if len(v1) != len(v2):
        return 0.0

    dot = 0.0
    norm_sq_1 = 0.0
    norm_sq_2 = 0.0
    # plain sequential accumulation, element by element
    for a, b in zip(v1, v2):
        norm_sq_1 += a * a
        norm_sq_2 += b * b
        dot += a * b

    denominator = math.sqrt(norm_sq_1) * math.sqrt(norm_sq_2)
    if denominator == 0.0:
        return 0.0
    return round(dot / denominator, 4)


glove_file = 'glove.6B.300d.txt'
embedding_size = 300   # dimension of glove, must match glove_file size (50 = 50d, 100 = 100d, etc)

# load the glove set from file: token -> list of embedding_size floats
glove_300_dict = dict()
# Explicit encoding: without it open() uses the platform default, which is not UTF-8
# everywhere. NOTE(review): the GloVe text files are expected to be UTF-8 - confirm.
with open(glove_file, 'rt', encoding='utf-8') as reader:
    for line in reader:
        line = line.strip().split(' ')
        # rows that do not hold exactly one token plus embedding_size values are skipped
        if len(line) == (embedding_size + 1):
            word = line[0]
            vector = [float(item) for item in line[1:]]
            glove_300_dict[word] = vector

In [None]:
# Load both SemEval task 3 files ('homo' -> d1, 'hetero' -> d2). d1 must be loaded
# because the d1 cleaning cell that follows reads it; with this line commented out a
# fresh Restart & Run All stops there with NameError.
d1 = pd.read_json('/content/semeval-task3-homo.json', orient='records')
d2 = pd.read_json('/content/semeval-task3-hetero.json', orient='records')

In [None]:
# Keep only the 'sentence' column, then re-attach punctuation that is surrounded by
# spaces (e.g. "a - b" -> "a-b", "a , b" -> "a, b"). Every pattern needs a space on
# both sides, so punctuation at the very end of a sentence is left untouched.
# Patterns are raw strings: '\-', '\.', '\?', '\!' and '\,' are invalid escape
# sequences in normal string literals (SyntaxWarning on Python 3.12+).
d1 = d1[['sentence']]
d1 = d1.replace(r' \- ', '-', regex=True)
d1 = d1.replace(" ' ", "'", regex=True)
d1 = d1.replace(r' \. ', '. ', regex=True)
d1 = d1.replace(r' \? ', '? ', regex=True)
d1 = d1.replace(r' \! ', '! ', regex=True)
d1 = d1.replace(r' \, ', ', ', regex=True)
d1.head()

Unnamed: 0,sentence
0,Wal-Mart isn't the only saving place !
1,Can honeybee abuse lead to a sting operation ?
2,A ditch digger was entrenched in his career .
3,"She was only a Blacksmith's daughter, but she ..."
4,Did you hear about the new pinata? It's a huge...


In [None]:
# Keep only the 'sentence' column, then re-attach punctuation that is surrounded by
# spaces (e.g. "a - b" -> "a-b", "a , b" -> "a, b"). Every pattern needs a space on
# both sides, so punctuation at the very end of a sentence is left untouched.
# Patterns are raw strings: '\-', '\.', '\?', '\!' and '\,' are invalid escape
# sequences in normal string literals (SyntaxWarning on Python 3.12+).
d2 = d2[['sentence']]
d2 = d2.replace(r' \- ', '-', regex=True)
d2 = d2.replace(" ' ", "'", regex=True)
d2 = d2.replace(r' \. ', '. ', regex=True)
d2 = d2.replace(r' \? ', '? ', regex=True)
d2 = d2.replace(r' \! ', '! ', regex=True)
d2 = d2.replace(r' \, ', ', ', regex=True)
d2.head()

Unnamed: 0,sentence
0,"''I'm halfway up a mountain ,'' Tom alleged ."
1,"I'd like to be a Chinese laborer, said Tom coo..."
2,Dentists don't like a hard day at the orifice .
3,Are evil wildebeests bad gnus ?
4,A busy barber is quite harried .


In [None]:
# Build the working frame from the cleaned data; only d2 is included in the list,
# so d1 does not contribute to df.
df = pd.concat([d2])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098 entries, 0 to 1097
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1098 non-null   object
dtypes: object(1)
memory usage: 8.7+ KB


In [None]:
# Save the cleaned sentences to 'semeval-clean.json' in records orientation
df.to_json('semeval-clean.json', orient='records')

In [None]:
# Reload the cleaned sentences from 'semeval-clean.json' (replaces the in-memory df)
df = pd.read_json('semeval-clean.json', orient='records')

In [None]:
# Pull the 'sentence' column out as a plain Python list and show how many entries it has
data = df['sentence'].tolist()
len(data)

1098

In [None]:
# Alias only (no copy): 'sentences' is the name the vectorisation cell iterates over
sentences = data

In [None]:
# convert the above sentences to vectors using the GloVe vectors in glove_300_dict
sentence_list = []
for sentence in sentences:
    word_list = []
    for word in sentence.split(' '):
        glove_key = word.lower()  # glove_300_dict is queried with lower-cased tokens
        if glove_key in glove_300_dict:  # ignore OOVs
            word_list.append(Word(word, glove_300_dict[glove_key]))
    if len(word_list) > 0:  # did we find any words (not an empty set)
        sentence_list.append(Sentence(word_list))

# apply single sentence word embedding
sentence_vectors = sentence_to_vec(sentence_list, embedding_size)  # all vectors converted together

# map: text of the sentence -> vector
# The key is str(Sentence), i.e. the space-joined in-vocabulary words, rather than the
# Sentence object itself: string keys keep the per-word vectors out of anything that
# stores this dict and do not depend on object identity. Sentences whose kept words are
# identical share one entry (the later vector wins).
sentence_vector_lookup = dict()
for sentence_obj, sentence_vector in zip(sentence_list, sentence_vectors):
    sentence_vector_lookup[str(sentence_obj)] = sentence_vector

In [None]:
# show the first entry of the lookup: its key on one line, its vector on the next
first_key, first_vector = list(sentence_vector_lookup.items())[0]
print(first_key)
print(first_vector)

In [None]:
# Write the lookup to disk. Mode 'wb' replaces the file on every run; with 'ab' each
# run appended another pickle, and a single pickle.load only ever returns the first
# (oldest) one. The with-block closes the file even if dump raises.
with open('semeval-sent-vectors-hetero.pkl', 'wb') as dbfile:
    pickle.dump(sentence_vector_lookup, dbfile)

In [None]:
# Read the stored lookup back into 'puns'. The with-block closes the file even if
# load raises.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
with open('semeval-sent-vectors-hetero.pkl', 'rb') as dbfile_read:
    puns = pickle.load(dbfile_read)

In [None]:
# Query word; the matching cells look up its GloVe vector via word.lower()
word = 'poison'

In [None]:
# Find the stored entry whose vector scores highest against the query word's vector.
# The query vector is looked up once instead of on every iteration; a word missing
# from glove_300_dict raises KeyError here.
query_vector = glove_300_dict[word.lower()]
best_match = ''
best_score = 0.0  # only strictly positive scores can win; otherwise best_match stays ''
for text, vector in puns.items():
    match = inner_product(vector, query_vector)
    if match > best_score:  # strict '>' keeps the earliest entry on ties
        best_score = match
        best_match = text
print("best match \"{}\" => \"{}\" (score {})".format(word, best_match, str(best_score)))

best match "poison" => "The Chinese chef maliciously dumped a hot broth with dumplings on an obnoxious It was a wanton soup attack ." (score 0.3155)


In [None]:
# Rank every stored entry against the query word and print the highest-scoring ones,
# best first.
top_k = 5  # number of entries to report; fewer are returned when puns is smaller
query_vector = glove_300_dict[word.lower()]  # looked up once; KeyError if the word is unknown
score_list = []
pun_list = []
for text, vector in puns.items():
    score_list.append(inner_product(vector, query_vector))
    pun_list.append(text)
score_np = np.array(score_list)
# Stable sort of the negated scores = descending order with ties kept in insertion
# order. Unlike np.argpartition(score_np, -5) this does not raise when there are
# fewer than top_k scores.
idxs = np.argsort(-score_np, kind='stable')[:top_k]
pun_return_list = [pun_list[int(idx)] for idx in idxs]
print(pun_return_list)

[The Chinese chef maliciously dumped a hot broth with dumplings on an obnoxious It was a wanton soup attack ., Exposure to the Son may prevent burning, Upon discovering the deadly virus carried by Surinamese the herpetologist contemplated how best to protect the pipal ., Drinking too much of a certain potent potable may require a leave of absinthe ., Exposure to the Son prevents burning]
