In [26]:
import itertools
import math
import re

import numpy as np
import pandas as pd
import torch
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
# Run the encoder on the GPU when one is available, otherwise on the CPU.
# A device *string* is used because that is the form SentenceTransformer
# documents for its `device` argument; "cuda" refers to the current CUDA
# device, which is what torch.cuda.current_device() identified by index.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained sentence-embedding model; a later cell uses it to embed the
# dictionary definitions.
retriever = SentenceTransformer("paraphrase-MiniLM-L6-v2", device=device)

In [28]:
# Load the dictionary CSV into a DataFrame and preview it.
dictionary_path = "data/dictionary.csv"
dict_df = pd.read_csv(dictionary_path)

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [29]:
# The 16 target words (listed four per line). Later cells keep only the
# dictionary rows for these words and score every 4-word combination of them.
words = [
    "CRAB", "RAY", "SPONGE", "SQUID", 
    "CIRCLE", "DIAMOND", "SQUARE", "TRIANGLE",
    "BOB", "CROSS", "HOOK", "WEAVE",
    "FEAST", "FREE", "PANTS", "THAT"
]

In [30]:
# Upper-case the headwords so they can be matched against the all-caps
# entries of `words`.
dict_df = dict_df.assign(Word=dict_df["Word"].str.upper())

In [31]:
# Keep only the rows whose headword is one of the target words.
# reset_index() (without drop) renumbers the rows from 0 and keeps each row's
# previous index label in a new "index" column.
is_target_word = dict_df["Word"].isin(words)
dict_df = dict_df.loc[is_target_word].reset_index()

dict_df

Unnamed: 0,index,Word,POS,Definition
0,17498,BOB,n.,Anything that hangs so as to play loosely or w...
1,17499,BOB,n.,A knot of worms or of rags on a string used in...
2,17500,BOB,n.,A small piece of cork or light wood attached t...
3,17501,BOB,n.,The ball or heavy part of a pendulum; also the...
4,17502,BOB,n.,A small wheel made of leather with rounded edg...
...,...,...,...,...
202,171769,WEAVE,v. t.,To unite as threads of any kind in such a mann...
203,171770,WEAVE,v. t.,To form as cloth by interlacing threads; to co...
204,171771,WEAVE,v. i.,To practice weaving; to work with a loom.
205,171772,WEAVE,v. i.,To become woven or interwoven.


In [32]:
# Embed every definition with the sentence encoder.
# encode() is documented to take a string or a list of strings, so pass a
# plain Python list instead of a pandas Series: integer lookups on a Series
# go by index label rather than position, which presumably only worked here
# because the index had just been reset to 0..n-1.
definitions = dict_df["Definition"].tolist()
embeddings = retriever.encode(definitions)

embeddings.shape

(207, 384)

In [33]:
# Alias: the pairwise-similarity cell below reads the definition embeddings
# through the name `matrix`.
matrix = embeddings

In [34]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product of
    their norms.

    Note: this definition replaces the ``cosine_similarity`` name imported
    from ``sklearn.metrics.pairwise`` at the top of the notebook.
    """
    dot_product = np.dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return dot_product / magnitude_product

In [45]:
# Score every pair of definitions that belong to two *different* words.
# The score is the cosine similarity of the two embeddings divided by their
# Euclidean distance (note: the ratio is undefined when two embeddings are
# identical, because the distance is then zero).
definition_words = dict_df["Word"].tolist()  # looked up once, not once per pair

similarities = []

# combinations(range(n), 2) visits each unordered row pair (i < j) exactly
# once, in the same order as two nested index loops would.
for i, j in itertools.combinations(range(len(matrix)), 2):
    if definition_words[i] == definition_words[j]:
        continue  # two senses of the same word are never compared
    a, b = matrix[i], matrix[j]
    # Store the pair in alphabetical order so (X, Y) and (Y, X) can never end
    # up as two separate groups when the rows are aggregated per word pair.
    word_1, word_2 = sorted((definition_words[i], definition_words[j]))
    similarities.append([word_1, word_2, cosine_similarity(a, b) / math.dist(a, b)])

df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,BOB,CIRCLE,0.022871
1,BOB,CIRCLE,0.052210
2,BOB,CIRCLE,0.051098
3,BOB,CIRCLE,0.046535
4,BOB,CIRCLE,0.026191
...,...,...,...
19235,TRIANGLE,WEAVE,-0.014275
19236,TRIANGLE,WEAVE,-0.007639
19237,TRIANGLE,WEAVE,-0.003509
19238,TRIANGLE,WEAVE,-0.004482


In [46]:
# Collapse the definition-level rows to one row per (word_1, word_2) pair,
# keeping the highest similarity found for that pair.
best_per_pair = df.groupby(["word_1", "word_2"])["similarity"].max()
df = best_per_pair.reset_index()

df

Unnamed: 0,word_1,word_2,similarity
0,BOB,CIRCLE,0.076438
1,BOB,CRAB,0.088341
2,BOB,CROSS,0.096764
3,BOB,DIAMOND,0.065631
4,BOB,FEAST,0.061804
...,...,...,...
86,SQUARE,TRIANGLE,0.164388
87,SQUARE,WEAVE,0.072682
88,SQUID,TRIANGLE,0.046427
89,SQUID,WEAVE,0.019464


In [48]:
# Remove duplicate rows and rows with missing values, then rank the word
# pairs from most to least similar.
df = (
    df.drop_duplicates()
    .dropna()
    .sort_values(by="similarity", ascending=False)
)

df

Unnamed: 0,word_1,word_2,similarity
53,DIAMOND,TRIANGLE,0.165109
86,SQUARE,TRIANGLE,0.164388
51,DIAMOND,SQUARE,0.143582
42,CROSS,SQUARE,0.130750
38,CROSS,FREE,0.120162
...,...,...,...
46,DIAMOND,FEAST,0.032106
67,FREE,SQUID,0.023993
61,FEAST,TRIANGLE,0.021840
60,FEAST,SQUID,0.021186


In [49]:
# Symmetric lookup table: both (w1, w2) and (w2, w1) map to the pair's
# similarity, so callers do not need to know the stored word order.
relation_dict = {}

for first, second, pair_score in zip(df["word_1"], df["word_2"], df["similarity"]):
    relation_dict[(first, second)] = pair_score
    relation_dict[(second, first)] = pair_score

relation_dict

{('DIAMOND', 'TRIANGLE'): 0.16510921904059328,
 ('TRIANGLE', 'DIAMOND'): 0.16510921904059328,
 ('SQUARE', 'TRIANGLE'): 0.16438810196110556,
 ('TRIANGLE', 'SQUARE'): 0.16438810196110556,
 ('DIAMOND', 'SQUARE'): 0.14358190650554173,
 ('SQUARE', 'DIAMOND'): 0.14358190650554173,
 ('CROSS', 'SQUARE'): 0.13075001456121205,
 ('SQUARE', 'CROSS'): 0.13075001456121205,
 ('CROSS', 'FREE'): 0.12016224599830588,
 ('FREE', 'CROSS'): 0.12016224599830588,
 ('SPONGE', 'SQUID'): 0.11652280733461742,
 ('SQUID', 'SPONGE'): 0.11652280733461742,
 ('BOB', 'SQUID'): 0.11273099537974587,
 ('SQUID', 'BOB'): 0.11273099537974587,
 ('BOB', 'HOOK'): 0.11110173989673934,
 ('HOOK', 'BOB'): 0.11110173989673934,
 ('BOB', 'SQUARE'): 0.10812088611978303,
 ('SQUARE', 'BOB'): 0.10812088611978303,
 ('CROSS', 'RAY'): 0.10643764492423918,
 ('RAY', 'CROSS'): 0.10643764492423918,
 ('CIRCLE', 'DIAMOND'): 0.10366939973689306,
 ('DIAMOND', 'CIRCLE'): 0.10366939973689306,
 ('CIRCLE', 'TRIANGLE'): 0.10337803940102513,
 ('TRIANGLE', 

In [50]:
def similarity_4(a, b, c, d):
    """Return the summed pairwise similarity of four words.

    Adds the `relation_dict` entries for the six unordered pairs that can be
    drawn from (a, b, c, d).

    Raises:
        KeyError: if any of the six pairs has no entry in `relation_dict`.
    """
    return (
        relation_dict[(a, b)]
        + relation_dict[(a, c)]
        + relation_dict[(a, d)]
        + relation_dict[(b, c)]
        + relation_dict[(b, d)]
        + relation_dict[(c, d)]
    )


# Score every 4-word combination of `words`. combinations() yields the groups
# in the same order as four nested index loops (i < j < k < l) would.
scored_groups = []
for a, b, c, d in itertools.combinations(words, 4):
    try:
        scored_groups.append(([a, b, c, d], similarity_4(a, b, c, d)))
    except KeyError:
        # At least one pair has no entry in relation_dict, so this group
        # cannot be scored. Only KeyError is skipped; any other error is a
        # real bug and is allowed to surface.
        continue

# Rank from highest to lowest score with a single sort. The sort is stable, so
# feeding it the groups in reverse generation order makes a later-generated
# group come before an earlier one with an equal score -- the same tie order
# the previous insert-one-at-a-time ranking produced.
sim_4 = sorted(reversed(scored_groups), key=lambda group: group[1], reverse=True)

sim_4

[(['DIAMOND', 'SQUARE', 'TRIANGLE', 'CROSS'], 0.7862252689556544),
 (['CIRCLE', 'DIAMOND', 'SQUARE', 'TRIANGLE'], 0.7671103411211279),
 (['RAY', 'DIAMOND', 'SQUARE', 'TRIANGLE'], 0.7321133218919736),
 (['DIAMOND', 'SQUARE', 'TRIANGLE', 'BOB'], 0.7201395668107331),
 (['DIAMOND', 'SQUARE', 'TRIANGLE', 'HOOK'], 0.7148728541485285),
 (['CRAB', 'DIAMOND', 'SQUARE', 'TRIANGLE'], 0.6896188541940225),
 (['RAY', 'SQUARE', 'TRIANGLE', 'CROSS'], 0.6643984334974614),
 (['SQUARE', 'TRIANGLE', 'BOB', 'CROSS'], 0.6635482694198903),
 (['CIRCLE', 'SQUARE', 'TRIANGLE', 'CROSS'], 0.6581443758250044),
 (['RAY', 'CIRCLE', 'DIAMOND', 'TRIANGLE'], 0.6418287504794546),
 (['SPONGE', 'DIAMOND', 'SQUARE', 'TRIANGLE'], 0.6403764356540684),
 (['RAY', 'DIAMOND', 'SQUARE', 'CROSS'], 0.6398753625693053),
 (['CIRCLE', 'DIAMOND', 'SQUARE', 'CROSS'], 0.6395913948288702),
 (['DIAMOND', 'SQUARE', 'BOB', 'CROSS'], 0.6370260752956106),
 (['CIRCLE', 'DIAMOND', 'TRIANGLE', 'CROSS'], 0.6369801441095857),
 (['SQUARE', 'TRIANGLE