In [13]:
import numpy as np
import pandas as pd
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Select the device by name. "cuda" targets the current CUDA device, and
# `device` is now a str on both branches instead of an int index or 'cpu'.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model; a later cell calls `retriever.encode` on the
# dictionary definitions.
retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device=device,
)

In [15]:
# Load the dictionary table from disk (path is relative to the notebook's
# working directory). The displayed frame has Word, POS and Definition columns.
dict_df = pd.read_csv("data/dictionary.csv")

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [16]:
# The 16 upper-case headwords whose dictionary senses are kept for grouping.
# NOTE(review): the recorded outputs further down show other headwords
# (BOB, CIRCLE, WEAVE, ...) — presumably this list was edited after those
# cells last ran; confirm with a Restart & Run All.
words = [
    "PACK", "RAT", "DRAGON", "MOUNTAIN", 
    "CAT", "BUTTER", "HORSE", "JAM",
    "CANARY", "COW", "STUFF", "TRIANGLE",
    "SQUEEZE", "SNITCH", "FIRE", "FINK"
]

In [17]:
# Upper-case the headwords so they can be compared with the upper-case
# entries in `words`.
dict_df["Word"] = dict_df["Word"].str.upper()

In [18]:
# Keep only the rows whose headword is in `words`, then renumber the rows
# from 0; reset_index() stores the previous row labels in an "index" column.
keep_mask = dict_df["Word"].isin(words)
dict_df = dict_df.loc[keep_mask].reset_index()

dict_df

Unnamed: 0,index,Word,POS,Definition
0,17498,BOB,n.,Anything that hangs so as to play loosely or w...
1,17499,BOB,n.,A knot of worms or of rags on a string used in...
2,17500,BOB,n.,A small piece of cork or light wood attached t...
3,17501,BOB,n.,The ball or heavy part of a pendulum; also the...
4,17502,BOB,n.,A small wheel made of leather with rounded edg...
...,...,...,...,...
202,171769,WEAVE,v. t.,To unite as threads of any kind in such a mann...
203,171770,WEAVE,v. t.,To form as cloth by interlacing threads; to co...
204,171771,WEAVE,v. i.,To practice weaving; to work with a loom.
205,171772,WEAVE,v. i.,To become woven or interwoven.


In [19]:
# Number the senses of each headword 1, 2, 3, ... in row order.
dict_df['word_number'] = dict_df.groupby('Word').cumcount() + 1

# Make every label unique by appending its sense number ("BOB" -> "BOB_1").
# Vectorised string concatenation replaces the row-wise apply(axis=1).
# NOTE: this overwrites "Word", so running the cell a second time would
# append another suffix to the already-suffixed labels.
dict_df['Word'] = dict_df['Word'] + "_" + dict_df['word_number'].astype(str)

dict_df

Unnamed: 0,index,Word,POS,Definition,word_number
0,17498,BOB_1,n.,Anything that hangs so as to play loosely or w...,1
1,17499,BOB_2,n.,A knot of worms or of rags on a string used in...,2
2,17500,BOB_3,n.,A small piece of cork or light wood attached t...,3
3,17501,BOB_4,n.,The ball or heavy part of a pendulum; also the...,4
4,17502,BOB_5,n.,A small wheel made of leather with rounded edg...,5
...,...,...,...,...,...
202,171769,WEAVE_1,v. t.,To unite as threads of any kind in such a mann...,1
203,171770,WEAVE_2,v. t.,To form as cloth by interlacing threads; to co...,2
204,171771,WEAVE_3,v. i.,To practice weaving; to work with a loom.,3
205,171772,WEAVE_4,v. i.,To become woven or interwoven.,4


In [20]:
# Sanity check: (rows, columns) of the filtered, labelled table.
dict_df.shape

(207, 5)

In [21]:
# Embed every definition. A plain list of strings is passed (the input type
# `encode` is documented for) rather than a pandas Series.
embeddings = retriever.encode(dict_df['Definition'].tolist())

embeddings.shape

(207, 384)

In [22]:
# Second name for the embedding array; the pairwise-similarity loop reads `matrix`.
matrix = embeddings

In [27]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The value is ``dot(a, b)`` divided by the product of the two norms as
    computed by ``numpy.linalg.norm`` with its default settings.

    NOTE: this definition replaces the ``cosine_similarity`` name imported
    from ``sklearn.metrics.pairwise`` in the notebook's import cell.
    """
    dot_product = np.dot(a, b)
    norm_product = norm(a) * norm(b)
    return dot_product / norm_product

In [28]:
similarities = []

# Read the labels and parse their headwords (text before the first "_") once,
# instead of calling dict_df.iloc[...] several times per inner iteration.
labels = dict_df["Word"].tolist()
label_stems = [label[: label.index("_")] for label in labels]

for i in range(len(matrix)):
    a = matrix[i]
    # Start at i + 1: the (i, i) pair shares a headword and was always skipped.
    for j in range(i + 1, len(matrix)):
        # Only senses of *different* headwords are compared.
        if label_stems[i] == label_stems[j]:
            continue
        b = matrix[j]
        # Score = cosine similarity divided by the Euclidean distance.
        similarities.append([labels[i], labels[j], cosine_similarity(a, b) / math.dist(a, b)])

df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,BOB_1,CIRCLE_1,0.022871
1,BOB_1,CIRCLE_2,0.052210
2,BOB_1,CIRCLE_3,0.051098
3,BOB_1,CIRCLE_4,0.046535
4,BOB_1,CIRCLE_5,0.026191
...,...,...,...
19235,TRIANGLE_6,WEAVE_1,-0.014275
19236,TRIANGLE_6,WEAVE_2,-0.007639
19237,TRIANGLE_6,WEAVE_3,-0.003509
19238,TRIANGLE_6,WEAVE_4,-0.004482


In [29]:
# Remove repeated rows and rows with missing values, then order the pairs
# from the highest score to the lowest.
df = (
    df.drop_duplicates()
    .dropna()
    .sort_values(by="similarity", ascending=False)
)

# How many pairs have a strictly positive score.
positive_pairs = df[df["similarity"] > 0]
positive_pairs.shape

(17271, 3)

In [30]:
# Symmetric lookup table: both (x, y) and (y, x) map to the pair's score,
# inserted in the row order of `df`.
relation_dict = {}

for first, second, score in zip(df["word_1"], df["word_2"], df["similarity"]):
    relation_dict[(first, second)] = score
    relation_dict[(second, first)] = score

relation_dict

{('DIAMOND_2', 'TRIANGLE_1'): 0.16510921904059328,
 ('TRIANGLE_1', 'DIAMOND_2'): 0.16510921904059328,
 ('SQUARE_18', 'TRIANGLE_1'): 0.16438810196110556,
 ('TRIANGLE_1', 'SQUARE_18'): 0.16438810196110556,
 ('DIAMOND_2', 'SQUARE_18'): 0.14358190650554173,
 ('SQUARE_18', 'DIAMOND_2'): 0.14358190650554173,
 ('CROSS_23', 'SQUARE_35'): 0.13075001456121205,
 ('SQUARE_35', 'CROSS_23'): 0.13075001456121205,
 ('CROSS_15', 'FREE_7'): 0.12016224599830588,
 ('FREE_7', 'CROSS_15'): 0.12016224599830588,
 ('SQUARE_9', 'TRIANGLE_2'): 0.11835176904471804,
 ('TRIANGLE_2', 'SQUARE_9'): 0.11835176904471804,
 ('DIAMOND_2', 'SQUARE_2'): 0.1169317520781502,
 ('SQUARE_2', 'DIAMOND_2'): 0.1169317520781502,
 ('SPONGE_2', 'SQUID_1'): 0.11652280733461742,
 ('SQUID_1', 'SPONGE_2'): 0.11652280733461742,
 ('CROSS_13', 'SQUARE_26'): 0.1164798725242085,
 ('SQUARE_26', 'CROSS_13'): 0.1164798725242085,
 ('BOB_3', 'SQUID_2'): 0.11273099537974587,
 ('SQUID_2', 'BOB_3'): 0.11273099537974587,
 ('CROSS_15', 'FREE_5'): 0.11229

In [31]:
def similarity_4(a, b, c, d, relations=None):
    """Return the sum of the six pairwise scores among ``a``, ``b``, ``c``, ``d``.

    Parameters
    ----------
    a, b, c, d : hashable
        Labels used to build the ``(x, y)`` lookup keys.
    relations : mapping, optional
        Maps ``(x, y)`` tuples to scores. Defaults to the notebook-level
        ``relation_dict``, which is what the function always used before.

    Raises
    ------
    KeyError
        If any of the six pairs is missing from the mapping.
    """
    if relations is None:
        relations = relation_dict

    # Same pair order as the original expression, summed left to right.
    pairs = ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d))
    total = relations[pairs[0]]
    for pair in pairs[1:]:
        total = total + relations[pair]
    return total

In [36]:
from itertools import combinations

def check_overlap(combo):
    """Return True when the first four labels in ``combo`` have distinct headwords.

    A label's headword is the text before its first "_" with surrounding
    whitespace stripped. Returns False as soon as any two of the four
    headwords are equal.

    NOTE(review): no cell visible in this notebook excerpt calls this helper.
    """
    stems = [combo[pos][: combo[pos].index("_")].strip() for pos in range(4)]
    return len(set(stems)) == len(stems)

# NOTE(review): `sim_4` is not read by any cell visible in this excerpt —
# confirm it is unused before removing it.
sim_4 = []

# Candidate labels (e.g. "BOB_1") for the group search. Later cells replace
# this Series with a filtered plain list.
specified_words = dict_df["Word"]

specified_words

0        BOB_1
1        BOB_2
2        BOB_3
3        BOB_4
4        BOB_5
        ...   
202    WEAVE_1
203    WEAVE_2
204    WEAVE_3
205    WEAVE_4
206    WEAVE_5
Name: Word, Length: 207, dtype: object

In [44]:
import heapq

def similarity_4(a, b, c, d, relations=None):
    """Return the sum of the six pairwise scores among ``a``, ``b``, ``c``, ``d``.

    Parameters
    ----------
    a, b, c, d : hashable
        Labels used to build the ``(x, y)`` lookup keys.
    relations : mapping, optional
        Maps ``(x, y)`` tuples to scores. Defaults to the notebook-level
        ``relation_dict``, which is what the function always used before.

    Raises
    ------
    KeyError
        If any of the six pairs is missing from the mapping.
    """
    if relations is None:
        relations = relation_dict

    # Same pair order as the original expression, summed left to right.
    pairs = ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d))
    total = relations[pairs[0]]
    for pair in pairs[1:]:
        total = total + relations[pair]
    return total

# Materialise the candidates once so positional indexing works whether
# `specified_words` is a pandas Series or a plain list.
quad_words = list(specified_words)
# Headword (text before the first "_") of each label, parsed once instead of
# being re-sliced inside every loop iteration.
quad_stems = [label[: label.index("_")] for label in quad_words]
quad_count = len(quad_words)

# Score every group of four labels (in list order) whose headwords are all
# different. Each level skips as soon as a headword repeats.
sim_scores = {}
for i in range(quad_count):
    for j in range(i + 1, quad_count):
        if quad_stems[j] == quad_stems[i]:
            continue
        for k in range(j + 1, quad_count):
            if quad_stems[k] == quad_stems[i] or quad_stems[k] == quad_stems[j]:
                continue
            for l in range(k + 1, quad_count):
                if (
                    quad_stems[l] == quad_stems[i]
                    or quad_stems[l] == quad_stems[j]
                    or quad_stems[l] == quad_stems[k]
                ):
                    continue

                a, b, c, d = quad_words[i], quad_words[j], quad_words[k], quad_words[l]
                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

# `sim_heap` is kept as a notebook-level name because the next cell reads it.
sim_heap = []
for (a, b, c, d), score in sim_scores.items():
    heapq.heappush(sim_heap, (score, [a, b, c, d]))

result = heapq.nlargest(10, sim_heap)  # Adjust the number of results as needed

In [48]:
# Same top-10 selection as at the end of the previous cell, repeated here so
# the result is displayed.
result = heapq.nlargest(10, sim_heap) 
result

[(0.7592679175944462, ['CIRCLE_1', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.7383892416459706, ['CIRCLE_2', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.7212253698219598, ['CROSS_13', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.6790961833611706, ['DIAMOND_2', 'RAY_11', 'SQUARE_18', 'TRIANGLE_1']),
 (0.674911094352688, ['CROSS_12', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.670637384587169, ['CROSS_1', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.6699289613527875, ['BOB_20', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.6692517441413264, ['CROSS_14', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1']),
 (0.6646958869004159, ['DIAMOND_2', 'RAY_4', 'SQUARE_18', 'TRIANGLE_1']),
 (0.6529503932154969, ['CROSS_7', 'DIAMOND_2', 'SQUARE_18', 'TRIANGLE_1'])]

In [49]:
# Labels of the best-scoring group from the previous search.
removals = result[0][1]

# Drop every sense of the chosen headwords in a single pass, instead of
# rebuilding the list once per removed word.
removed_stems = {word[: word.index("_")] for word in removals}
specified_words = [
    candidate
    for candidate in specified_words
    if candidate[: candidate.index("_")] not in removed_stems
]

specified_words

['BOB_1',
 'BOB_2',
 'BOB_3',
 'BOB_4',
 'BOB_5',
 'BOB_6',
 'BOB_7',
 'BOB_8',
 'BOB_9',
 'BOB_10',
 'BOB_11',
 'BOB_12',
 'BOB_13',
 'BOB_14',
 'BOB_15',
 'BOB_16',
 'BOB_17',
 'BOB_18',
 'BOB_19',
 'BOB_20',
 'CRAB_1',
 'CRAB_2',
 'CRAB_3',
 'CRAB_4',
 'CRAB_5',
 'CRAB_6',
 'CRAB_7',
 'CRAB_8',
 'CRAB_9',
 'CRAB_10',
 'CRAB_11',
 'CRAB_12',
 'CROSS_1',
 'CROSS_2',
 'CROSS_3',
 'CROSS_4',
 'CROSS_5',
 'CROSS_6',
 'CROSS_7',
 'CROSS_8',
 'CROSS_9',
 'CROSS_10',
 'CROSS_11',
 'CROSS_12',
 'CROSS_13',
 'CROSS_14',
 'CROSS_15',
 'CROSS_16',
 'CROSS_17',
 'CROSS_18',
 'CROSS_19',
 'CROSS_20',
 'CROSS_21',
 'CROSS_22',
 'CROSS_23',
 'CROSS_24',
 'CROSS_25',
 'CROSS_26',
 'CROSS_27',
 'CROSS_28',
 'CROSS_29',
 'CROSS_30',
 'CROSS_31',
 'FEAST_1',
 'FEAST_2',
 'FEAST_3',
 'FEAST_4',
 'FEAST_5',
 'FEAST_6',
 'FEAST_7',
 'FREE_1',
 'FREE_2',
 'FREE_3',
 'FREE_4',
 'FREE_5',
 'FREE_6',
 'FREE_7',
 'FREE_8',
 'FREE_9',
 'FREE_10',
 'FREE_11',
 'FREE_12',
 'FREE_13',
 'FREE_14',
 'FREE_15',
 'FRE

In [50]:
# Materialise the remaining candidates once and parse each label's headword
# (text before the first "_") up front rather than inside the loops.
quad_words = list(specified_words)
quad_stems = [label[: label.index("_")] for label in quad_words]
quad_count = len(quad_words)

# Score every group of four labels (in list order) whose headwords are all
# different. Each level skips as soon as a headword repeats.
sim_scores = {}
for i in range(quad_count):
    for j in range(i + 1, quad_count):
        if quad_stems[j] == quad_stems[i]:
            continue
        for k in range(j + 1, quad_count):
            if quad_stems[k] == quad_stems[i] or quad_stems[k] == quad_stems[j]:
                continue
            for l in range(k + 1, quad_count):
                if (
                    quad_stems[l] == quad_stems[i]
                    or quad_stems[l] == quad_stems[j]
                    or quad_stems[l] == quad_stems[k]
                ):
                    continue

                a, b, c, d = quad_words[i], quad_words[j], quad_words[k], quad_words[l]
                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

sim_heap = []
for (a, b, c, d), score in sim_scores.items():
    heapq.heappush(sim_heap, (score, [a, b, c, d]))

# Ten highest-scoring groups among the remaining words.
result = heapq.nlargest(10, sim_heap)

In [51]:
# Display the top-scoring groups computed in the previous cell.
result

[(0.5388981362169898, ['BOB_3', 'CRAB_4', 'HOOK_1', 'SQUID_2']),
 (0.5144509027094982, ['BOB_3', 'HOOK_1', 'RAY_6', 'SQUID_2']),
 (0.5084620456483094, ['CRAB_1', 'RAY_6', 'SPONGE_2', 'SQUID_1']),
 (0.5010078188612404, ['CRAB_9', 'CROSS_15', 'FEAST_7', 'FREE_8']),
 (0.5007222843959873, ['BOB_3', 'CRAB_1', 'HOOK_1', 'SQUID_2']),
 (0.49725465955361736, ['BOB_3', 'CRAB_4', 'RAY_6', 'SQUID_2']),
 (0.49538423425309475, ['CRAB_1', 'RAY_7', 'SPONGE_2', 'SQUID_1']),
 (0.4881089573468257, ['BOB_3', 'CRAB_10', 'HOOK_1', 'SQUID_2']),
 (0.4878959134107238, ['BOB_3', 'CRAB_1', 'RAY_6', 'SQUID_2']),
 (0.48698877706323607, ['CRAB_11', 'CROSS_21', 'HOOK_11', 'RAY_11'])]

In [52]:
# Labels of the third-ranked group (index 2) from the previous search.
removals = result[2][1]

# Drop every sense of the chosen headwords in a single pass, instead of
# rebuilding the list once per removed word.
removed_stems = {word[: word.index("_")] for word in removals}
specified_words = [
    candidate
    for candidate in specified_words
    if candidate[: candidate.index("_")] not in removed_stems
]

specified_words

['BOB_1',
 'BOB_2',
 'BOB_3',
 'BOB_4',
 'BOB_5',
 'BOB_6',
 'BOB_7',
 'BOB_8',
 'BOB_9',
 'BOB_10',
 'BOB_11',
 'BOB_12',
 'BOB_13',
 'BOB_14',
 'BOB_15',
 'BOB_16',
 'BOB_17',
 'BOB_18',
 'BOB_19',
 'BOB_20',
 'CROSS_1',
 'CROSS_2',
 'CROSS_3',
 'CROSS_4',
 'CROSS_5',
 'CROSS_6',
 'CROSS_7',
 'CROSS_8',
 'CROSS_9',
 'CROSS_10',
 'CROSS_11',
 'CROSS_12',
 'CROSS_13',
 'CROSS_14',
 'CROSS_15',
 'CROSS_16',
 'CROSS_17',
 'CROSS_18',
 'CROSS_19',
 'CROSS_20',
 'CROSS_21',
 'CROSS_22',
 'CROSS_23',
 'CROSS_24',
 'CROSS_25',
 'CROSS_26',
 'CROSS_27',
 'CROSS_28',
 'CROSS_29',
 'CROSS_30',
 'CROSS_31',
 'FEAST_1',
 'FEAST_2',
 'FEAST_3',
 'FEAST_4',
 'FEAST_5',
 'FEAST_6',
 'FEAST_7',
 'FREE_1',
 'FREE_2',
 'FREE_3',
 'FREE_4',
 'FREE_5',
 'FREE_6',
 'FREE_7',
 'FREE_8',
 'FREE_9',
 'FREE_10',
 'FREE_11',
 'FREE_12',
 'FREE_13',
 'FREE_14',
 'FREE_15',
 'FREE_16',
 'FREE_17',
 'FREE_18',
 'FREE_19',
 'FREE_20',
 'FREE_21',
 'FREE_22',
 'FREE_23',
 'FREE_24',
 'HOOK_1',
 'HOOK_2',
 'HOOK_3',

In [56]:
# Materialise the remaining candidates once and parse each label's headword
# (text before the first "_") up front rather than inside the loops.
quad_words = list(specified_words)
quad_stems = [label[: label.index("_")] for label in quad_words]
quad_count = len(quad_words)

# Score every group of four labels (in list order) whose headwords are all
# different. Each level skips as soon as a headword repeats.
sim_scores = {}
for i in range(quad_count):
    for j in range(i + 1, quad_count):
        if quad_stems[j] == quad_stems[i]:
            continue
        for k in range(j + 1, quad_count):
            if quad_stems[k] == quad_stems[i] or quad_stems[k] == quad_stems[j]:
                continue
            for l in range(k + 1, quad_count):
                if (
                    quad_stems[l] == quad_stems[i]
                    or quad_stems[l] == quad_stems[j]
                    or quad_stems[l] == quad_stems[k]
                ):
                    continue

                a, b, c, d = quad_words[i], quad_words[j], quad_words[k], quad_words[l]
                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

sim_heap = []
for (a, b, c, d), score in sim_scores.items():
    heapq.heappush(sim_heap, (score, [a, b, c, d]))

# Twenty highest-scoring groups among the remaining words.
result = heapq.nlargest(20, sim_heap)

In [57]:
# Display the top-scoring groups computed in the previous cell.
result

[(0.4598253956623111, ['BOB_19', 'CROSS_23', 'FREE_12', 'HOOK_8']),
 (0.4572954417799653, ['BOB_19', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.43147372950276913, ['CROSS_15', 'FEAST_7', 'FREE_12', 'HOOK_8']),
 (0.4180698045298762, ['BOB_19', 'CROSS_23', 'FREE_14', 'HOOK_8']),
 (0.41534531607761693, ['CROSS_15', 'FEAST_7', 'FREE_22', 'HOOK_8']),
 (0.414734836875873, ['BOB_19', 'CROSS_23', 'FREE_11', 'HOOK_8']),
 (0.41267333053821575, ['BOB_19', 'CROSS_23', 'FREE_23', 'HOOK_8']),
 (0.41166386153397816, ['BOB_19', 'CROSS_15', 'FREE_12', 'HOOK_8']),
 (0.4103419735036203, ['BOB_19', 'CROSS_15', 'FEAST_7', 'FREE_12']),
 (0.4093626864335424, ['BOB_11', 'CROSS_15', 'FEAST_7', 'FREE_8']),
 (0.4091278715676069, ['BOB_11', 'CROSS_15', 'FEAST_7', 'FREE_12']),
 (0.40812870079943103, ['BOB_16', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.4077836256443871, ['BOB_17', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.4071188206273677, ['BOB_17', 'CROSS_23', 'FREE_12', 'HOOK_8']),
 (0.4066377584391227, ['BOB_19', 'CROSS_