_S25: Computational Psycholinguistics_

Team Project

Swarang Joshi, 2022114010

Pranav Agarwal, 2021113018

# Compound Analysis: GloVe

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import torch

In [None]:
# !pip install torch torchvision torchaudio

In [None]:
def glove_vectors(list_of_words, file_name="./data/glove.6B.300d.txt"):
    """Load the embedding vectors of the requested words from a text file.

    Every line of the file is split on single spaces: the first token is
    taken as the word and the remaining tokens are parsed as floats.

    Args:
        list_of_words: Iterable of words whose vectors should be loaded.
            Duplicates are allowed and have no effect on the result.
        file_name: Path of the embedding file. Defaults to the path that was
            previously hard-coded in this function.

    Returns:
        A dict mapping each requested word that occurs in the file to a 1-D
        numpy array of floats. Requested words that do not occur in the file
        are simply absent from the dict.
    """
    # Build the lookup set once: testing membership against a list costs
    # O(len(list_of_words)) for every single line of the embedding file.
    wanted_words = set(list_of_words)

    words_to_vectors = {}
    with open(file_name, "r", encoding="utf8") as f:
        # Iterate lazily; readlines() would hold the entire file in memory.
        for line in f:
            word, *vector = line.split(" ")

            if word in wanted_words:
                # float() tolerates the trailing newline on the last token.
                words_to_vectors[word] = np.array([float(v) for v in vector])

    return words_to_vectors


## Loading Compound Words Dataset

In [None]:
# Read the compound ratings table, then discard its first column before
# displaying the frame.
compounds_df = pd.read_csv("./data/compounds.csv")
compounds_df = compounds_df.drop(columns=compounds_df.columns[0])
compounds_df

Unnamed: 0,Compound,left,right,TRAN,LMD
0,aboveground,above,ground,6.200000,4.000000
1,airbase,air,base,4.266667,5.866667
2,airborne,air,borne,3.857143,3.133333
3,aircraft,air,craft,4.533333,4.266667
4,airfield,air,field,4.333333,5.714286
...,...,...,...,...,...
623,wordplay,word,play,4.928571,3.533333
624,worldwide,world,wide,4.928571,3.066667
625,wristwatch,wrist,watch,6.266667,6.333333
626,yardstick,yards,tick,5.866667,5.333333


## Getting Word Embeddings

In [None]:
# Look up every compound together with its left and right constituents in a
# single pass over the embedding file, then report how many words were found.
words_to_vectors = glove_vectors([
    word
    for column in ("Compound", "left", "right")
    for word in compounds_df[column].tolist()
])
len(words_to_vectors)

1247

### Remove compounds with words missing from GloVe

In [None]:
# Keep only the rows whose compound, left constituent and right constituent
# all have a loaded vector, then renumber the rows from zero.
compounds_df = compounds_df[
    compounds_df["Compound"].isin(words_to_vectors.keys())
    & compounds_df["left"].isin(words_to_vectors.keys())
    & compounds_df["right"].isin(words_to_vectors.keys())
].reset_index(drop=True)
compounds_df

Unnamed: 0,Compound,left,right,TRAN,LMD
0,aboveground,above,ground,6.200000,4.000000
1,airbase,air,base,4.266667,5.866667
2,airborne,air,borne,3.857143,3.133333
3,aircraft,air,craft,4.533333,4.266667
4,airfield,air,field,4.333333,5.714286
...,...,...,...,...,...
618,wordplay,word,play,4.928571,3.533333
619,worldwide,world,wide,4.928571,3.066667
620,wristwatch,wrist,watch,6.266667,6.333333
621,yardstick,yards,tick,5.866667,5.333333


## Getting Cosine Similarity of L and R lexemes

In [None]:
# Stack the vectors in dataframe row order, so that row i of each matrix
# belongs to the same compound entry.
compounds_glove_embeddings = np.array(
    [words_to_vectors[whole_word] for whole_word in compounds_df["Compound"]]
)
left_glove_embeddings = np.array(
    [words_to_vectors[left_word] for left_word in compounds_df["left"]]
)
right_glove_embeddings = np.array(
    [words_to_vectors[right_word] for right_word in compounds_df["right"]]
)

# One shape per line: compounds, left constituents, right constituents.
print(
    compounds_glove_embeddings.shape,
    left_glove_embeddings.shape,
    right_glove_embeddings.shape,
    sep="\n",
)

(623, 300)
(623, 300)
(623, 300)


### Computing Cosine Similarities

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between corresponding vectors of a and b.

    The comparison is element-aligned, not pairwise: for 2-D inputs, row i
    of ``a`` is compared with row i of ``b`` and one value per row is
    returned. Single 1-D vectors are accepted too and give a scalar.

    NOTE: this definition shadows the ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` at the top of the file.

    Args:
        a: Array-like whose last axis holds the vector components.
        b: Array-like broadcastable against ``a``.

    Returns:
        A numpy array (or numpy scalar for 1-D inputs) of cosine
        similarities, with the last axis reduced away. A vector of zero norm
        yields nan, because the quotient is 0 / 0.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    # Reduce over the last axis so both (n, d) batches and plain (d,) vectors
    # work; for 2-D inputs this is the same as axis=1.
    dot_product = np.sum(a * b, axis=-1)
    norm_a = np.linalg.norm(a, axis=-1)
    norm_b = np.linalg.norm(b, axis=-1)

    return dot_product / (norm_a * norm_b)


# Similarity of each constituent (left, then right) to its whole compound,
# computed row by row against the compound embeddings.
L_cosine, R_cosine = (
    cosine_similarity(constituent_embeddings, compounds_glove_embeddings)
    for constituent_embeddings in (left_glove_embeddings, right_glove_embeddings)
)

# One shape per line: left similarities, then right similarities.
print(L_cosine.shape, R_cosine.shape, sep="\n")

# Disabled alternatives for mapping the similarities into the range 0..1:
# L_cosine = (L_cosine + 1) / 2
# R_cosine = (R_cosine + 1) / 2
# L_cosine = np.abs(L_cosine)
# R_cosine = np.abs(R_cosine)

(623,)
(623,)


## Obtaining LMD and ST

In [None]:
def calc_LMD(L, R):
    """Turn left/right similarities into an LMD score via 5 * (R - L) + 5.

    Equal similarities give 5; the score rises when ``R`` exceeds ``L`` and
    falls when ``L`` exceeds ``R``. Works on scalars and, element-wise, on
    numpy arrays.
    """
    right_minus_left = R - L
    return 5 * right_minus_left + 5

# Apply the LMD formula to the left and right cosine similarities.
glove_LMD = calc_LMD(L_cosine, R_cosine)

In [None]:
def calc_ST(L, R):
    """Turn left/right similarities into an ST score via 3 * (R + L) + 1.

    Two similarities of 0 give 1 and two similarities of 1 give 7. Works on
    scalars and, element-wise, on numpy arrays.
    """
    combined_similarity = R + L
    return 3 * combined_similarity + 1

# Apply the ST formula to the left and right cosine similarities.
glove_ST = calc_ST(L_cosine, R_cosine)

## Evaluating

In [None]:
# Compare the GloVe-derived scores with the dataset's rating columns:
# glove_LMD against "LMD" and glove_ST against "TRAN".

# Mean absolute error of each prediction.
mea_LMD = mean_absolute_error(y_true=compounds_df["LMD"], y_pred=glove_LMD)
mea_ST = mean_absolute_error(y_true=compounds_df["TRAN"], y_pred=glove_ST)

# Spearman rank correlation; element 0 of the result is the coefficient.
spearman_LMD = spearmanr(compounds_df["LMD"], glove_LMD)[0]
spearman_ST = spearmanr(compounds_df["TRAN"], glove_ST)[0]

print("LMD MAE:", mea_LMD)
print("LMD Spearman:", spearman_LMD)
print("ST MAE:", mea_ST)
print("ST Spearman:", spearman_ST)

LMD MAE: 0.9850749358895161
LMD Spearman: 0.47257485753044226
ST MAE: 2.4924816662663165
ST Spearman: 0.2685460636195837


## Saving the Computed Values

In [None]:
# Save the computed values to a CSV file
# The GloVe-derived scores are added as new columns of the filtered dataframe;
# note that glove_ST is stored under the column name "glove_TRAN".
compounds_df["glove_LMD"] = glove_LMD
compounds_df["glove_TRAN"] = glove_ST

# index=False keeps the dataframe's row index out of the written file.
compounds_df.to_csv("./data/compounds_glove.csv", index=False)
