**Downloads**

In [None]:
!pip install transformers



**Importing libraries**

In [None]:
import numpy as np
import gensim
import matplotlib.pyplot as plt
import torch
import json
import spacy
import re
import nltk
from nltk.corpus import stopwords
import csv
import os
from transformers import BertModel, BertTokenizerFast, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle

**Loading dataset**

The **MOSAICo** corpus is a large dataset in which words are annotated with their WordNet senses.

In [None]:
class TextDataset(torch.utils.data.Dataset):
  """In-memory view of a JSON Lines corpus file.

  Every non-blank line of the file must be a JSON object with a "text" and an
  "annotations" key; the two values are stored in parallel lists.
  """

  def __init__(self, file):
    """Read `file` line by line and collect texts and annotations.

    Args:
      file: Path of the JSON Lines file to load.

    Raises:
      json.JSONDecodeError: If a non-blank line is not valid JSON.
      KeyError: If a record has no "text" or no "annotations" key.
    """
    self.text, self.annotations = [], []
    # Kept for backward compatibility: holds the last record that was parsed.
    self.data = {}
    # Explicit encoding so that loading does not depend on the platform default.
    with open(file, encoding='utf-8') as f:
      for line in f:
        # Tolerate blank lines (e.g. an extra newline at the end of the file).
        if not line.strip():
          continue
        self.data = json.loads(line)
        self.text.append(self.data["text"])
        self.annotations.append(self.data["annotations"])
    # Number of records actually loaded.
    self.length = len(self.text)

  def __getitem__(self, idx):
    """Return record `idx` as a dict with the keys 'text' and 'annotations'."""
    return {'text': self.text[idx], 'annotations': self.annotations[idx]}

  def __len__(self):
    """Return the number of loaded records."""
    return self.length

In [None]:
!gdown https://drive.google.com/uc?id=122i8byMXrGFvPS6YhzAEy-3QRK00ut9v

Downloading...
From: https://drive.google.com/uc?id=122i8byMXrGFvPS6YhzAEy-3QRK00ut9v
To: /content/500000.jsonl
100% 214M/214M [00:02<00:00, 96.4MB/s]


In [None]:
# Name of the JSON Lines corpus file fetched by the gdown cell above.
datapath = '500000.jsonl'
# Parses the whole file into memory (see TextDataset.__init__).
dataset = TextDataset(datapath)

In [None]:
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook.
nlp = spacy.load("en_core_web_sm")
# Fetch the NLTK stop-word lists, then keep the English ones as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# We remove the token 'do' from the stop words because it appears in the simlex
# file and therefore needs an embedding.
# The result stays a set (not a list) so that the per-token membership test done
# in preprocess_text is a constant-time lookup instead of a linear scan.
word_to_remove = 'do'
stop_words = {w for w in stop_words if w != word_to_remove}

In [None]:
def preprocess_text(tokens, stopwords):
  """Lowercase the tokens of a sentence and drop stop words and punctuation tokens.

  Args:
    tokens: Iterable of string tokens of one sentence.
    stopwords: Collection of lowercase stop words to discard.

  Returns:
    List of the remaining tokens, lowercased, in their original order.
  """
  punctuation = {'.', ',', ':', ';'}
  sentence_tokens = []
  for token in tokens:
    lowered = token.lower()
    # Test the lowercased form so that capitalised stop words ("The") are
    # filtered too, and use the `stopwords` argument rather than a global.
    if lowered in stopwords or token in punctuation:
      continue
    sentence_tokens.append(lowered)
  return sentence_tokens

# **Non-semantic similarity scores**

**Implicit representations : Word embeddings**

The word embeddings are computed with both Word2Vec and BERT.

In [None]:
# One cleaned token list per corpus sentence: split on whitespace, then run the
# tokens through preprocess_text with the notebook's stop words.
texts = [
  preprocess_text(dataset[i]["text"].split(), stop_words)
  for i in range(len(dataset))
]

*Word2Vec*

In [None]:
# Train Word2Vec on the cleaned token lists: 100-dimensional vectors, window of 2.
# min_count=1 presumably keeps even tokens seen only once in the vocabulary
# (confirm against the gensim documentation).
model_w2v = gensim.models.Word2Vec(texts, vector_size=100, window=2, min_count=1)
model_w2v

<gensim.models.word2vec.Word2Vec at 0x7c856f8c8550>

In [None]:
model_w2v.wv["man"]

array([ 0.51433545,  1.9825836 , -1.2367504 , -0.03155358,  1.8882501 ,
       -0.99185675,  0.88348275,  1.6463926 , -0.5598825 , -1.6959121 ,
       -0.45444486,  0.8603689 ,  0.85573804, -0.23581806, -0.75187796,
       -1.4608091 ,  0.01139964, -0.06495843, -0.11543689, -0.95571685,
       -0.82969195, -0.6091588 ,  1.7407378 , -0.40332824,  0.51848763,
       -0.17906132,  0.6346073 ,  0.53366315, -0.6141488 ,  1.8230557 ,
        0.85490584, -0.72517747,  0.09773695, -1.6389989 ,  0.7779602 ,
       -0.1866965 ,  1.3741964 , -2.3905058 , -0.46823305,  0.5103933 ,
        0.30532318,  0.7232862 , -0.9767976 , -0.5477164 ,  0.86644804,
       -2.7456799 , -0.34640875,  0.04078872, -0.02895803,  0.96099657,
       -2.265705  , -2.2831826 , -0.7314991 , -0.82157314, -0.29613551,
       -0.53812385,  1.6499604 , -0.09747396,  0.1229334 ,  0.29496455,
        0.21585219,  0.1312177 , -0.7560584 ,  0.5852642 ,  0.06541489,
        0.71863824, -0.27959454,  1.2567848 , -2.2667518 ,  2.44

**Computing the non-semantic similarity scores for the SimLex pairs in the TSV file**

In [None]:
!gdown https://drive.google.com/uc?id=1MLo9lgav9a2WfgCQ1X_rwVbiqvQ58Wso

Downloading...
From: https://drive.google.com/uc?id=1MLo9lgav9a2WfgCQ1X_rwVbiqvQ58Wso
To: /content/semantic_simlex_v0.1.tsv
  0% 0.00/81.3k [00:00<?, ?B/s]100% 81.3k/81.3k [00:00<00:00, 77.1MB/s]


In [None]:
file_path = 'semantic_simlex_v0.1.tsv'

In [None]:
# Score every SimLex pair with the word-level Word2Vec model.
# Each entry is [word1, word2, similarity].
data_non_semantic_w2v = []
with open(file_path, mode='r', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')

    # Skip first row (titles)
    next(tsv_reader)

    for row in tsv_reader:
        # Columns 0 and 1 hold the two words of the pair.
        # NOTE(review): presumably fails if a word never occurred in the training
        # corpus - confirm against the gensim documentation.
        data_non_semantic_w2v.append([row[0], row[1], model_w2v.wv.similarity(row[0], row[1])])

# Prints the complete list of scored pairs.
print(data_non_semantic_w2v)

[['glass', 'crystal', 0.61567956], ['task', 'woman', 0.326347], ['house', 'barn', 0.7074768], ['vision', 'perception', 0.7605883], ['box', 'hat', 0.5874753], ['metal', 'aluminum', 0.6597517], ['doctor', 'orthodontist', 0.5861967], ['letter', 'paragraph', 0.60268587], ['man', 'child', 0.61523503], ['home', 'state', 0.37452623], ['mink', 'fur', 0.7003918], ['mother', 'wife', 0.7784088], ['child', 'adult', 0.59009075], ['fee', 'payment', 0.77950954], ['milk', 'juice', 0.7992452], ['cup', 'cone', 0.3101427], ['book', 'article', 0.62433004], ['money', 'capital', 0.39406157], ['body', 'shoulder', 0.48803112], ['cheek', 'tongue', 0.89145154], ['father', 'brother', 0.77707857], ['book', 'information', 0.30688074], ['appointment', 'engagement', 0.6895381], ['gun', 'knife', 0.6919738], ['meat', 'sandwich', 0.7007344], ['birthday', 'year', 0.44065294], ['formula', 'equation', 0.5760287], ['wire', 'cord', 0.7562522], ['car', 'hose', 0.5974884], ['room', 'bath', 0.51611435], ['rice', 'bean', 0.7516

*BERT*

In [None]:
# Load pre-trained BERT model and tokenizer
# The fast tokenizer class is used; word_2_indices calls `word_ids()` on the
# encoding it produces.
model_name = 'bert-base-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Here, we define two functions that, for each word of a pair, return a sentence of the MOSAICo dataset in which that word occurs, so that the sentence can then be tokenized.

In [None]:
def sentence_of_interest1(word1, records=None):
  """Return the first sentence that carries an annotation for `word1`.

  An annotation matches when the part of its "label" before "%" equals `word1`.
  In the returned sentence, the token at the annotation's first span index is
  replaced by that label prefix: e.g. when looking for "crystal" and the
  sentence contains "crystallized", that token becomes "crystal".

  Args:
    word1: Word to look for among the annotation labels.
    records: Optional iterable of dicts with the keys "text" and "annotations".
      Defaults to the notebook-level `dataset`.

  Returns:
    The sentence as one space-joined string, or None when nothing matches.
  """
  if records is None:
    records = dataset
  for record in records:
    for annotation in record["annotations"]:
      label = annotation["label"].split("%")[0]
      if label != word1:
        continue
      sentence_tokens = record["text"].split()
      # token_span[0] is used as an index into the whitespace-split sentence.
      sentence_tokens[annotation["token_span"][0]] = label
      return ' '.join(sentence_tokens)
  return None

In [None]:
def sentence_of_interest2(word2, records=None):
  """Return the first sentence that carries an annotation for `word2`.

  Same lookup as sentence_of_interest1, applied to the second word of a pair:
  an annotation matches when the part of its "label" before "%" equals `word2`,
  and the token at the annotation's first span index is replaced by that label
  prefix in the returned sentence.

  Args:
    word2: Word to look for among the annotation labels.
    records: Optional iterable of dicts with the keys "text" and "annotations".
      Defaults to the notebook-level `dataset`.

  Returns:
    The sentence as one space-joined string, or None when nothing matches.
  """
  if records is None:
    records = dataset
  for record in records:
    for annotation in record["annotations"]:
      label = annotation["label"].split("%")[0]
      if label != word2:
        continue
      sentence_tokens = record["text"].split()
      # token_span[0] is used as an index into the whitespace-split sentence.
      sentence_tokens[annotation["token_span"][0]] = label
      return ' '.join(sentence_tokens)
  return None

In [None]:
# Map every decoded word of an encoded sentence to its token positions
def word_2_indices(encoded, tokenizer):
    """Group the encoded tokens by word id and key the positions by decoded word.

    Args:
        encoded: Tokenizer output exposing `word_ids()` and `input_ids`; only
            the first row of `input_ids` is read.
        tokenizer: Object whose `decode` turns a list of input ids into a string.

    Returns:
        Dict mapping each decoded word to the list of positions of its tokens.
        If two word ids decode to the same string, the later one wins.
    """
    # word id -> (input ids of the word, positions of those ids in the sequence)
    grouped = {}
    pairs = zip(encoded.word_ids(), encoded.input_ids[0])
    for position, (word_id, piece_id) in enumerate(pairs):
        piece_ids, positions = grouped.setdefault(word_id, ([], []))
        piece_ids.append(piece_id)
        positions.append(position)

    return {
        tokenizer.decode(piece_ids): positions
        for piece_ids, positions in grouped.values()
    }

# Embed one word in the context of its sentence
def emb(sent, word, tokenizer, model):
    """Return the embedding of `word` inside `sent`.

    The sentence is tokenized and passed through `model` without gradient
    tracking; the vectors at the positions that word_2_indices maps to `word`
    are averaged along the first dimension.

    Args:
        sent: Sentence passed to the tokenizer.
        word: Word to embed; must be a key of the word_2_indices mapping.
        tokenizer: Callable tokenizer, invoked with return_tensors="pt".
        model: Model called with the tokenizer output as keyword arguments.

    Returns:
        The mean of the selected token vectors.

    Raises:
        KeyError: If `word` is not among the decoded words of the sentence.
    """
    encoding = tokenizer(sent, return_tensors="pt")
    positions_by_word = word_2_indices(encoding, tokenizer)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = model(**encoding)

    # First output, first (and only) sentence.
    token_vectors = model_output[0][0]
    return token_vectors[positions_by_word[word]].mean(0)

# Compute the cosine similarity score of one pair of words
def transformers_similarity(tokenizer, model, word1, word2):
    """Cosine similarity between the contextual embeddings of two words.

    Each word is embedded with `emb` in the sentence returned for it by
    sentence_of_interest1 / sentence_of_interest2.

    Args:
        tokenizer: Tokenizer forwarded to `emb`.
        model: Model forwarded to `emb`.
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The result of torch.cosine_similarity on the two embeddings, each given
        a leading dimension of size 1.
    """
    sentence_a, sentence_b = sentence_of_interest1(word1), sentence_of_interest2(word2)

    vector_a = emb(sentence_a, word1, tokenizer, model)
    vector_b = emb(sentence_b, word2, tokenizer, model)

    return torch.cosine_similarity(vector_a.unsqueeze(0), vector_b.unsqueeze(0))

In [None]:
# Score every SimLex pair with BERT embeddings.
# Each entry is [word1, word2, similarity].
data_non_semantic_bert = []
with open(file_path, mode='r', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')

    # Skip first row (titles)
    next(tsv_reader)

    for row in tsv_reader:

          # Columns 0 and 1 hold the two words of the pair.
          word1 = row[0]
          word2 = row[1]

          similarity = transformers_similarity(tokenizer, model_bert, word1, word2)
          # The similarity has one element; store it as a plain Python number.
          data_non_semantic_bert.append([word1, word2, similarity[0].item()])

In [None]:
print(*data_non_semantic_bert, sep='\n')

['glass', 'crystal', 0.6473640203475952]
['task', 'woman', 0.5004521608352661]
['house', 'barn', 0.5596182346343994]
['vision', 'perception', 0.6156847476959229]
['box', 'hat', 0.2594585716724396]
['metal', 'aluminum', 0.6829593777656555]
['doctor', 'orthodontist', 0.6218967437744141]
['letter', 'paragraph', 0.5000917315483093]
['man', 'child', 0.6398710608482361]
['home', 'state', 0.37017297744750977]
['mink', 'fur', 0.5925835967063904]
['mother', 'wife', 0.6575551629066467]
['child', 'adult', 0.6409657001495361]
['fee', 'payment', 0.7264203429222107]
['milk', 'juice', 0.6720603108406067]
['cup', 'cone', 0.4611589014530182]
['book', 'article', 0.3595791161060333]
['money', 'capital', 0.4838293492794037]
['body', 'shoulder', 0.45378583669662476]
['cheek', 'tongue', 0.473917156457901]
['father', 'brother', 0.6990193724632263]
['book', 'information', 0.5110917687416077]
['appointment', 'engagement', 0.5843856930732727]
['gun', 'knife', 0.6501515507698059]
['meat', 'sandwich', 0.512066006

# **Semantic similarity scores**

**Replacing the instances of the words by their senses**

In [None]:
# Build the sense-level corpus: the same sentences as before, but every annotated
# token is replaced by its sense label before cleaning.
texts_s = []
for record in dataset:
  text = record["text"].split()
  for annotation in record["annotations"]:
    # Only the token at the first span index is overwritten.
    # NOTE(review): if a span covers several tokens, the following ones are left
    # unchanged - confirm that this is intended.
    text[annotation["token_span"][0]] = annotation["label"]
  texts_s.append(preprocess_text(text, stop_words))

**Implicit representations : Word embeddings**

*Word2Vec*

In [None]:
# Train Word2Vec on the sense-annotated token lists, with the same settings as
# the word-level model (100-dimensional vectors, window of 2, min_count=1).
model_w2v_s = gensim.models.Word2Vec(texts_s, vector_size=100, window=2, min_count=1)
model_w2v_s

<gensim.models.word2vec.Word2Vec at 0x7c853e499f00>

In [None]:
model_w2v_s.wv["man%1:18:00::"]

array([ 1.0447305 ,  0.3882707 ,  1.1181595 , -1.0547829 ,  0.1728546 ,
       -1.2809653 ,  1.0901692 ,  1.5795207 , -1.2183943 , -1.1325037 ,
       -1.8029039 , -0.84373313, -1.8844091 , -1.437805  , -0.12404358,
       -0.9791507 ,  0.40984732, -0.7654517 , -0.50455993, -2.726561  ,
        0.971814  ,  0.5493244 ,  0.8700531 , -2.262702  , -0.862935  ,
        2.0853646 ,  1.2078762 , -0.55333847,  1.1838397 , -0.22956842,
       -2.5225337 ,  1.7876759 ,  0.8009026 , -0.62129956, -1.623608  ,
        1.015753  ,  1.3345269 , -0.6156885 , -1.3344742 , -0.35637736,
        0.37224853, -2.0839093 , -0.75435776, -0.47556296,  0.87446624,
       -0.76050705, -0.30934533, -2.182277  ,  0.6019528 ,  1.9043142 ,
        0.12041044, -1.5027583 , -0.13103299, -1.0200696 , -0.3076843 ,
       -0.51936305,  1.658925  ,  1.6060387 ,  0.03833958,  0.56016713,
        2.0482373 ,  0.324805  , -1.897655  , -0.09479728, -0.29158178,
        0.79312456,  0.97482216, -1.0153754 ,  1.1948526 , -0.10

**Computing the semantic similarity scores for the SimLex pairs in the TSV file**

In [None]:
# Score every SimLex pair with the sense-level Word2Vec model.
# Each entry is [word1, word2, best similarity over the candidate sense pairs].
data_semantic_w2v = []
with open(file_path, mode='r', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')

    # Skip first row (titles)
    next(tsv_reader)

    for row in tsv_reader:
        # Columns 10 and 11 hold the candidate senses of word1 and word2,
        # separated by commas when there are several. split(",") returns a
        # one-element list for a single sense, so this one double loop covers
        # all four "single / multiple senses" combinations.
        poss_senses_1 = row[10].split(",")
        poss_senses_2 = row[11].split(",")

        # Keep the score of the most similar pair of senses.
        max_score = max(
            model_w2v_s.wv.similarity(s1, s2)
            for s1 in poss_senses_1
            for s2 in poss_senses_2
        )

        data_semantic_w2v.append([row[0], row[1], max_score])
print(data_semantic_w2v)

[['glass', 'crystal', 0.6640031], ['task', 'woman', 0.41734973], ['house', 'barn', 0.7992415], ['vision', 'perception', 0.908933], ['box', 'hat', 0.62152255], ['metal', 'aluminum', 0.899394], ['doctor', 'orthodontist', 0.5955776], ['letter', 'paragraph', 0.53922975], ['man', 'child', 0.6226195], ['home', 'state', 0.45719573], ['mink', 'fur', 0.43363044], ['mother', 'wife', 0.77564], ['child', 'adult', 0.81381625], ['fee', 'payment', 0.8595953], ['milk', 'juice', 0.8227307], ['cup', 'cone', 0.89995635], ['book', 'article', 0.7686964], ['money', 'capital', 0.44258174], ['body', 'shoulder', 0.6722199], ['cheek', 'tongue', 0.89984655], ['father', 'brother', 0.75153315], ['book', 'information', 0.30121112], ['appointment', 'engagement', 0.75066316], ['gun', 'knife', 0.8146709], ['meat', 'sandwich', 0.7672636], ['birthday', 'year', 0.44097367], ['formula', 'equation', 0.87017095], ['wire', 'cord', 0.8660561], ['car', 'hose', 0.5683614], ['room', 'bath', 0.7180411], ['rice', 'bean', 0.9263315

# **Generating the tsv files**

In [None]:
file_path_non_semantic = 'non_semantic.tsv'
with open(file_path_non_semantic, mode='w', newline='', encoding='utf-8') as tsv_file:
    # Create a TSV writer object. lineterminator='\n' keeps plain "\n" line
    # endings (csv.writer would otherwise end every row with "\r\n").
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')

    # Write data to the TSV file: one row per pair (word1, word2, score)
    tsv_writer.writerows(data_non_semantic_w2v)

print(f"TSV file '{file_path_non_semantic}' has been generated.")

TSV file 'non_semantic.tsv' has been generated.


In [None]:
file_path_semantic = 'semantic.tsv'
with open(file_path_semantic, mode='w', newline='', encoding='utf-8') as tsv_file:
    # Create a TSV writer object. lineterminator='\n' keeps plain "\n" line
    # endings (csv.writer would otherwise end every row with "\r\n").
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')

    # Write data to the TSV file: one row per pair (word1, word2, score)
    tsv_writer.writerows(data_semantic_w2v)

print(f"TSV file '{file_path_semantic}' has been generated.")

TSV file 'semantic.tsv' has been generated.


# **Explicit representation**

In [None]:
corpus = []
for i in range(len(dataset)):

  corpus.append(dataset[i]["text"])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances


def compute_ppmi(word1, word2, documents=None, max_docs=10000):
  """Sentence-level positive pointwise mutual information (PPMI) of two words.

  PMI = log2( P(w1, w2) / (P(w1) * P(w2)) ), where every probability is the
  fraction of the considered sentences that contain the word (or both words).
  PPMI = max(PMI, 0).

  The sentences are scanned one at a time, so no document-term matrix is built
  and memory use does not grow with the number of sentences.

  Args:
    word1: First word (matched case-insensitively).
    word2: Second word (matched case-insensitively).
    documents: Optional iterable of sentences (strings). Defaults to the
      notebook-level `corpus`.
    max_docs: Only the first `max_docs` sentences are used; None means all.

  Returns:
    The PPMI as a float; 0.0 when either word is absent from the considered
    sentences or the two words never occur in the same sentence.
  """
  if documents is None:
    documents = corpus

  target1, target2 = word1.lower(), word2.lower()
  # Same tokenisation as CountVectorizer's defaults: lowercased text, tokens
  # made of two or more word characters.
  token_pattern = re.compile(r"(?u)\b\w\w+\b")

  n_docs = n_word1 = n_word2 = n_both = 0
  for document in documents:
    if max_docs is not None and n_docs >= max_docs:
      break
    n_docs += 1
    tokens = set(token_pattern.findall(document.lower()))
    has_word1 = target1 in tokens
    has_word2 = target2 in tokens
    n_word1 += has_word1
    n_word2 += has_word2
    n_both += has_word1 and has_word2

  # No co-occurrence (this also covers an unseen word or an empty corpus):
  # the PMI would be -inf or undefined, so the PPMI is 0.
  if n_both == 0:
    return 0.0

  # (n_both / N) / ((n_word1 / N) * (n_word2 / N)) simplified to one division.
  ratio = (n_both * n_docs) / (n_word1 * n_word2)
  return max(float(np.log2(ratio)), 0.0)

In [None]:
print(compute_ppmi("birthday", "man"))

12.178988456393446


The only problem here is that using the whole MOSAICo corpus, or even a large number of sentences, makes my code crash every single time (I think there is not enough memory for it to run). If I use a small number of sentences instead, some words from the pairs are never seen in the corpus and I get an error. For that reason, I only show my reasoning here.

In [None]:
"""data_explicit = []
with open(file_path, mode='r', encoding='utf-8') as tsv_file:
    tsv_reader = csv.reader(tsv_file, delimiter='\t')

    # Skip first row (titles)
    next(tsv_reader)

    for row in tsv_reader:
        data_explicit.append([row[0], row[1], compute_ppmi(row[0], row[1])])

print(data_explicit)"""