# GloVe (Gensim)

To explore word vectors, we'll use **Gensim**. **Gensim** isn't really a deep learning package. It's a package for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But it's efficient and scalable, and quite widely used. We are going to use **GloVe** embeddings, which can be downloaded from [the GloVe page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip).

In [None]:
import numpy as np
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


In [None]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive so
# that the relative paths used later (data/..., app/models/...) resolve.
import os

os.chdir('/content/drive/MyDrive/_NLP/NLP-A1-That-s-What-I-LIKE-st125553')

In [None]:
# Download glove.6B.100d.txt from Kaggle via kagglehub
import kagglehub

# Download latest version; `path` is the local directory holding the
# downloaded files (printed below)
path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/danielwillgeorge/glove6b100dtxt?dataset_version_number=1...


100%|██████████| 131M/131M [00:01<00:00, 117MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/danielwillgeorge/glove6b100dtxt/versions/1


In [None]:
# Load the GloVe vectors from the directory returned by kagglehub above rather
# than a hard-coded cache path, so this cell keeps working if the cache
# location or dataset version changes. GloVe text files have no leading
# "<vocab_size> <dim>" header line, hence no_header=True.
glove_file = os.path.join(path, 'glove.6B.100d.txt')
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [None]:
# Sanity check: look up one word and inspect the shape of its embedding
model['coffee'].shape

(100,)

## Testing

### Semantic Test

In [None]:
semantic_file = "data/word-test-semantic.txt"

# Read every raw line of the semantic analogy test set
with open(semantic_file, "r") as file:
    sem_file = file.readlines()

# One whitespace-stripped test line per entry
semantic = [raw_line.strip() for raw_line in sem_file]

In [None]:
# Evaluate the semantic analogies. Each line is read as four words
# "a b c d"; the model is asked for the word closest to (b + c - a) and the
# answer counts as correct when it equals d.
sem_count = len(semantic)
sem_correct = 0

for sent in semantic:
    sent = sent.lower()
    words = sent.split()

    try:
        # Top-1 neighbour of the analogy vector
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except KeyError:
        # A word is missing from the vocabulary: count the analogy as wrong.
        # Only KeyError is caught so that genuine bugs are not silently hidden.
        result = "<UNK>"

    if result == words[3]:
        sem_correct += 1

In [None]:
# Fraction of semantic analogies whose top-1 prediction matched the expected word
sem_accuracy = sem_correct / sem_count
print(f"Semantic accuracy: {sem_accuracy:2.2f}")
print(f"Semantic correct: {sem_correct}")
print(f"Semantic count: {sem_count}")

Semantic accuracy: 0.53
Semantic correct: 269
Semantic count: 506


### Syntactic Test

In [None]:
syntatic_file = "data/word-test-syntatic.txt"

# Read every raw line of the syntactic analogy test set
with open(syntatic_file, "r") as file:
    syn_file = file.readlines()

# One whitespace-stripped test line per entry
syntatic = [raw_line.strip() for raw_line in syn_file]

In [None]:
# Evaluate the syntactic analogies. Each line is read as four words
# "a b c d"; the model is asked for the word closest to (b + c - a) and the
# answer counts as correct when it equals d.
syn_count = len(syntatic)
syn_correct = 0

for sent in syntatic:
    sent = sent.lower()
    words = sent.split()

    try:
        # Top-1 neighbour of the analogy vector
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except KeyError:
        # A word is missing from the vocabulary: count the analogy as wrong.
        # Only KeyError is caught so that genuine bugs are not silently hidden.
        result = "<UNK>"

    if result == words[3]:
        syn_correct += 1

In [None]:
# Fraction of syntactic analogies whose top-1 prediction matched the expected word
syn_accuracy = syn_correct / syn_count
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")
print(f"Syntatic correct: {syn_correct}")
print(f"Syntatic count: {syn_count}")

Syntatic accuracy: 0.55
Syntatic correct: 865
Syntatic count: 1560


### Similarity Test


In [None]:
similarity_file = "data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt"

# Read every raw line of the word-similarity gold standard
with open(similarity_file, "r") as file:
    sim_file = file.readlines()

# One whitespace-stripped test line per entry
similarity = [raw_line.strip() for raw_line in sim_file]

In [None]:
# default_vector = np.zeros(model.vector_size)
# len(default_vector)

100

In [None]:
# def similarity_test(model, test_data):
#     words = test_data.split("\t")

#     embed0 = np.array(model.get_vector(words[0].strip()))
#     embed1 = np.array(model.get_vector(words[1].strip()))

#     model_result = embed1 @ embed0.T
#     sim_result = float(words[2].strip())

#     return sim_result, model_result

In [None]:
default_vector = np.zeros(model.vector_size)

def similarity_test(model, test_data):
    """Score one line of the similarity test set with the model.

    Parameters
    ----------
    model : object exposing ``get_vector(word)`` and ``vector_size``
        The embedding model used to look up word vectors.
    test_data : str
        One tab-separated line of the form ``word1<TAB>word2<TAB>score``.
        The line is lowercased before the words are looked up.

    Returns
    -------
    tuple
        ``(similarity_provided, similarity_model)``: the score parsed from
        the line and the dot product of the two word vectors. If either word
        is missing from the model, both vectors fall back to zeros, so the
        model similarity is 0.
    """
    words = test_data.lower().split("\t")

    try:
        embed0 = model.get_vector(words[0].strip())
        embed1 = model.get_vector(words[1].strip())
    except KeyError:
        # Out-of-vocabulary word: use zero vectors for the whole pair. Only
        # KeyError is caught so unrelated failures are not silently masked.
        embed0 = embed1 = np.zeros(model.vector_size)

    # Plain (unnormalised) dot product of the two 1-D embeddings
    similarity_model = embed1 @ embed0
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [None]:
# Score every test line, collecting the provided scores and the model scores
# in two parallel lists (same order) for the rank correlation computed later.
sim_scores = []
model_scores = []
for sent in similarity:
    sim_result, model_result = similarity_test(model, sent)

    sim_scores.append(sim_result)
    model_scores.append(model_result)

In [None]:
# Peek at the last (provided score, model score) pair left over from the loop
print(sim_result)
print(model_result)

0.23
1.4002881


In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between the provided scores and the model scores;
# index 0 of the result is the correlation coefficient.
corr = spearmanr(sim_scores, model_scores)[0]

print(f"The correlation result is {corr:2.4f}.")

The correlation result is 0.5431.


## Test P Value

In [None]:
import numpy as np
from scipy.stats import spearmanr

def compute_cosine_similarity(model, word1, word2):
    """Compute the cosine similarity between the embeddings of two words.

    Each word is looked up exactly as given first. If that lookup fails and
    the word is a string with uppercase characters, its lowercase form is
    tried, which keeps this function consistent with the other tests in this
    notebook that lowercase their inputs before querying the model.

    Parameters
    ----------
    model : object exposing ``get_vector(word)`` that raises ``KeyError``
        for unknown words.
    word1, word2 : str
        The two words to compare.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either word is missing from the
        model or either vector has zero norm (cosine is undefined there).
    """
    def _lookup(word):
        # Exact match first; fall back to lowercase only when it can differ.
        try:
            return model.get_vector(word)
        except KeyError:
            if not isinstance(word, str) or word == word.lower():
                raise
            return model.get_vector(word.lower())

    try:
        vec1 = _lookup(word1)
        vec2 = _lookup(word2)
    except KeyError:
        # Handle missing words in the model
        return 0.0

    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # Avoid a NaN from 0/0 when a vector is all zeros
        return 0.0
    return np.dot(vec1, vec2) / denom

def compute_model_similarities(model, data):
    """Return the model's similarity score for every word pair in ``data``.

    ``data`` is walked row by row and must provide ``word1`` and ``word2``
    columns; the scores are returned as a list in row order.
    """
    return [
        compute_cosine_similarity(model, pair['word1'], pair['word2'])
        for _, pair in data.iterrows()
    ]

In [None]:
import pandas as pd

# Load the tab-separated gold standard into a DataFrame; column names are
# supplied explicitly via `names`.
file_path = "data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt"
similarity_data = pd.read_csv(file_path, sep='\t', names=['word1', 'word2', 'similarity'])

# Display sample data
print(similarity_data.head())


        word1  word2  similarity
0       tiger    cat        7.35
1       tiger  tiger       10.00
2       plane    car        5.77
3       train    car        6.31
4  television  radio        6.77


In [None]:
# Gold-standard scores and the model's similarities, in the same row order
gold_standard_similarities = similarity_data['similarity'].values
model_similarities = compute_model_similarities(model, similarity_data)

# Compute Spearman correlation (coefficient and its p-value)
correlation, p_value = spearmanr(model_similarities, gold_standard_similarities)

print(f"Spearman correlation (GloVe Gensim): {correlation:.4f}")
print(f"P-value: {p_value:.4e}")

Spearman correlation (GloVe Gensim): 0.5800
P-value: 1.2167e-19


## Save the result

In [None]:
import os
import pickle

model_path = 'app/models/gensim.model'

# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the model; the context manager guarantees the file is flushed and
# closed before it is read back below
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

# Reload as a round-trip check.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_path, 'rb') as model_file:
    load_model = pickle.load(model_file)

load_model.most_similar('james')

[('james', 0.8570922017097473),
 ('george', 0.8181617259979248),
 ('thomas', 0.8109301328659058),
 ('william', 0.8084547519683838),
 ('paul', 0.8058123588562012),
 ('henry', 0.7886716723442078),
 ('edward', 0.7804422378540039),
 ('peter', 0.7743206024169922),
 ('richard', 0.7710520625114441),
 ('robert', 0.767145037651062)]

## Calculate MSE

In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd
from scipy.stats import spearmanr
import numpy as np

# Load the dataset
file_path = "data/wordsim353_sim_rel/wordsim_similarity_goldstandard.txt"
similarity_data = pd.read_csv(file_path, sep='\t', names=['word1', 'word2', 'similarity'])

def compute_dot_product(model, word1, word2):
    """Compute the cosine similarity between the embeddings of two words.

    Despite its name, this function returns the dot product *normalised by
    the two vector norms* (i.e. cosine similarity), not the raw dot product.

    Each word is looked up exactly as given first. If that lookup fails and
    the word is a string with uppercase characters, its lowercase form is
    tried, which keeps this function consistent with the other tests in this
    notebook that lowercase their inputs before querying the model.

    Parameters
    ----------
    model : object exposing ``get_vector(word)`` that raises ``KeyError``
        for unknown words.
    word1, word2 : str
        The two words to compare.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either word is missing from the
        model or either vector has zero norm (cosine is undefined there).
    """
    def _lookup(word):
        # Exact match first; fall back to lowercase only when it can differ.
        try:
            return model.get_vector(word)
        except KeyError:
            if not isinstance(word, str) or word == word.lower():
                raise
            return model.get_vector(word.lower())

    try:
        vec1 = _lookup(word1)
        vec2 = _lookup(word2)
    except KeyError:
        # Handle missing words in the model
        return 0.0

    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # Avoid a NaN from 0/0 when a vector is all zeros
        return 0.0
    return np.dot(vec1, vec2) / denom

# function to calculate the similarities
def compute_model_similarities(model, data):
    """Return the model's similarity score for every word pair in ``data``.

    ``data`` is walked row by row and must provide ``word1`` and ``word2``
    columns; the scores are returned as a list in row order.
    """
    return [
        compute_dot_product(model, pair['word1'], pair['word2'])
        for _, pair in data.iterrows()
    ]

# Prepare data for word embedding models
gold_standard_similarities = similarity_data['similarity'].values  # Y-true
model_similarities = compute_model_similarities(model, similarity_data)

# Calculate Spearman's rank correlation
correlation, p_value = spearmanr(model_similarities, gold_standard_similarities)

# Calculate Mean Squared Error (MSE)
# NOTE(review): the gold scores use the dataset's own rating scale (e.g. 10.00
# for tiger/tiger) while the model scores are norm-normalised similarities, so
# this MSE looks dominated by the scale difference - confirm it is only meant
# for relative comparison between models.
mse = mean_squared_error(gold_standard_similarities, model_similarities)
# Baseline: gold scores against themselves
ytrue_mse = mean_squared_error(gold_standard_similarities, gold_standard_similarities)

# Output results (p-values in scientific notation so tiny values are not
# rounded to 0.0000)
print(f"Spearman correlation: {correlation:.4f}, p-value: {p_value:.4e}")
print(f"Mean Squared Error (MSE): {mse:.4f}, Yture MSE: {ytrue_mse:.4f}")

# Calculate correlation for Y-True
ytrue_correlation, ytrue_p_value = spearmanr(gold_standard_similarities, gold_standard_similarities)
print(f"Correlation for Y-True: {ytrue_correlation:.4f}, p-value: {ytrue_p_value:.4e}")


Spearman correlation: 0.5800, p-value: 0.0000
Mean Squared Error (MSE): 27.8081, Yture MSE: 0.0000
Correlation for Y-True: 1.0000, p-value: 0.0000
