In [1]:
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from scipy.stats import pearsonr


In [2]:
#initialise dictionary that will contain the word embeddings
#key=word
#values=array of numerical representation of the corresponding word (vector)
word_embeddings = {}
#open pre-trained word embeddings stored in file
with open("/Users/deeksha/Desktop/Improving-vector-space-representations-using-semantic-resources/data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean", 'r') as file:
    #for each line in the file
    for line in file:
        #split the line into word and vector
        values = line.split()
        #get the first element which is the word
        word = values[0]
        #get the remainder of the values which will form the vector represenation of the word in a matrix
        vector = np.array(values[1:], dtype='float32')
        #assign key=word and values=vector
        word_embeddings[word] = vector

#Access word vectors
#print(word_embeddings['talk'])

In [3]:
#list of word pairs from the lexical similarity file
word_pairs = []
#list of human scores for each word pair
human_scores = []
#open lexical_similarity file
with open ("/Users/deeksha/Desktop/Improving-vector-space-representations-using-semantic-resources/data/English/evaluations/lexical similarity/ws353_lexical_similarity.txt", 'r') as file:
    #for every line in the file
    for line in file:
        #split the line by space
        #one line contains the word pair and its score
        w1, w2, score = line.split()
        #add the words to word_pairs
        word_pairs.append((w1, w2))
        #add the ratings to human_score
        human_scores.append(score)

In [4]:
print(word_pairs[0])
print(human_scores[0])

('love', 'sex')
6.77


In [12]:
#list of cosine similarities
scores = []

#for every word pair
for w1, w1 in word_pairs:
    #check if the words exist in the word_embeddings
    if w1 in word_embeddings and w2 in word_embeddings:
        #retrieve word1's vector 
        vec1 = word_embeddings[w1]
        #retreive word2's vector
        vec2 = word_embeddings[w2]
        #calculate cosine between the two vectors
        similarity = 1 - cosine(vec1, vec2)
        #add the result to scores
        scores.append(round(similarity, 2))
    #otherwise if words don't exist in the word embeddings
    else:
        #assign default value of 0.0
        scores.append(0.00)        

In [13]:
print(human_scores)
print(scores)

['6.77', '7.35', '10.00', '7.46', '7.62', '7.58', '5.77', '6.31', '7.50', '6.77', '7.42', '6.85', '6.19', '5.92', '7.00', '6.62', '6.81', '4.62', '5.81', '7.08', '8.08', '1.62', '1.31', '0.92', '1.81', '6.69', '3.73', '0.92', '7.46', '8.12', '7.73', '9.15', '0.31', '0.23', '8.58', '5.92', '6.69', '8.46', '7.65', '1.62', '9.44', '8.62', '9.03', '6.81', '6.63', '7.56', '6.73', '7.65', '2.50', '8.38', '7.38', '6.19', '6.73', '7.92', '8.12', '7.35', '4.88', '5.54', '8.46', '8.13', '3.04', '1.31', '5.96', '6.87', '7.85', '2.65', '8.94', '8.96', '9.29', '8.83', '9.10', '8.87', '9.02', '9.29', '8.79', '7.52', '7.10', '7.38', '6.46', '6.27', '2.69', '4.46', '5.85', '5.00', '2.08', '4.42', '4.38', '1.85', '3.08', '0.92', '3.15', '0.92', '0.54', '2.08', '0.54', '0.62', '8.42', '9.08', '9.04', '8.27', '7.57', '7.29', '8.50', '7.73', '6.88', '5.65', '3.31', '8.00', '8.00', '7.08', '6.85', '7.00', '4.77', '5.62', '5.87', '8.08', '7.00', '6.85', '7.42', '6.58', '6.42', '8.21', '7.69', '7.23', '6.71'

# Spearman
- a statistical measure to see how 2 things are related to each other (used when the relation between to sets of data is not a straight line, e.g. relatio between height and weight). 

- range from [-1, 1]:
    - if close to 1, then as one var goes up the other var tends to go up too
    - if close to -1, then as one var goes up the other var tends to go down
    - if close to 0, then there's not much of a relation between the two vars


- affected by outliers

In [7]:
#calculate the spearman rank correlation
spearman_correlation, _ = spearmanr(human_scores, scores)
print("Correlation coefficient: ", round(spearman_correlation,3))

Correlation coefficient:  -0.078


# Pearson

- a statistical measure that quantifies the strength and direction of the linear relationshipo between two continuous variables.

- range from [-1, 1]. Same interpretation as Spearman

- p_value (point of reference, usually, O,05) determines how genuine the relation between the two vars are (if they are random or not)
    - high p_value -> relation due to random chance
    - low p_value -> evidence of a significant or real relation

In [16]:
#convert values from string to float
human_scores = [float(value) for value in (human_scores)]
#compute the pearson correlation coefficient and the p_value
pearsonr_correlation, p_value = pearsonr(human_scores, scores)

#print results
print("Correlation coefficient: ", round(pearsonr_correlation, 4))
print("p_value: ", round(p_value, 4))

Correlation coefficient:  -0.0843
p_value:  0.1139
