In [1]:
import numpy as np
import pandas as pd
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device as a device *string*. torch.cuda.current_device()
# returns an integer index, whereas "cuda" names the currently selected GPU and
# matches the string form the CPU fallback already uses.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used to embed the dictionary definitions.
retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device=device
)

In [3]:
# Load the dictionary table (path is relative to the working directory).
# The preview shows Word / POS / Definition columns, with several rows per
# headword (one per listed definition).
dict_df = pd.read_csv("data/dictionary.csv")

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [4]:
# The 16 upper-case target words; only dictionary rows whose headword is in
# this list are kept by the isin() filter further down.
words = [
    "PACK", "RAT", "DRAGON", "MOUNTAIN", 
    "CAT", "BUTTER", "HORSE", "JAM",
    "CANARY", "COW", "STUFF", "TRIANGLE",
    "SQUEEZE", "SNITCH", "FIRE", "FINK"
]

In [5]:
# Upper-case the headwords so they compare equal to the upper-case entries in `words`.
dict_df["Word"] = dict_df["Word"].str.upper()

In [6]:
# Keep only the rows whose headword is one of the target words.
# Target words with no dictionary row simply vanish here without any warning.
# NOTE(review): the later sense listings in this notebook contain no FINK_* or
# SNITCH_* labels, so those two words appear to have no rows — confirm.
is_target = dict_df["Word"].isin(words)

# reset_index() without drop=True renumbers the rows from 0 and keeps the old
# dictionary row numbers in a new "index" column.
dict_df = dict_df.loc[is_target].reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,21249,BUTTER,n.,An oily unctuous substance obtained from cream...
1,21250,BUTTER,n.,Any substance resembling butter in degree of c...
2,21253,BUTTER,v. t.,To cover or spread with butter.
3,21254,BUTTER,v. t.,To increase as stakes at every throw or every ...
4,21255,BUTTER,n.,One who or that which butts.
...,...,...,...,...
140,161759,TRIANGLE,n.,An instrument of percussion usually made of a ...
141,161760,TRIANGLE,n.,A draughtsman's square in the form of a right-...
142,161761,TRIANGLE,n.,A kind of frame formed of three poles stuck in...
143,161762,TRIANGLE,n.,A small constellation situated between Aries a...


In [7]:
# Number the definitions of each headword 1, 2, 3, ... in their current row order.
dict_df['word_number'] = dict_df.groupby('Word').cumcount() + 1

# Turn every headword into a unique sense label such as "BUTTER_3".
# Vectorised string concatenation replaces the row-wise apply() with an f-string.
# NOTE: this cell is not idempotent — running it a second time appends another
# suffix (e.g. "BUTTER_3_1"); re-run from the filtering cell instead.
dict_df['Word'] = dict_df['Word'] + "_" + dict_df['word_number'].astype(str)

dict_df

Unnamed: 0,index,Word,POS,Definition,word_number
0,21249,BUTTER_1,n.,An oily unctuous substance obtained from cream...,1
1,21250,BUTTER_2,n.,Any substance resembling butter in degree of c...,2
2,21253,BUTTER_3,v. t.,To cover or spread with butter.,3
3,21254,BUTTER_4,v. t.,To increase as stakes at every throw or every ...,4
4,21255,BUTTER_5,n.,One who or that which butts.,5
...,...,...,...,...,...
140,161759,TRIANGLE_2,n.,An instrument of percussion usually made of a ...,2
141,161760,TRIANGLE_3,n.,A draughtsman's square in the form of a right-...,3
142,161761,TRIANGLE_4,n.,A kind of frame formed of three poles stuck in...,4
143,161762,TRIANGLE_5,n.,A small constellation situated between Aries a...,5


In [8]:
# Sanity check: (rows, columns) of the filtered, sense-labelled table.
dict_df.shape

(145, 5)

In [9]:
# Embed every definition. A plain list is passed rather than the pandas Series:
# positional access into a Series only works while its index is exactly
# 0..n-1, which here depends on the earlier reset_index().
embeddings = retriever.encode(dict_df['Definition'].tolist())

# One row per definition.
embeddings.shape

(145, 384)

In [10]:
# Alias only: the pairwise-scoring loop reads the embeddings through the name `matrix`.
matrix = embeddings

In [11]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between two 1-D vectors.

    Computed as ``dot(a, b) / (|a| * |b|)``. No guard is applied for
    zero-length vectors, so a zero norm leads to a division by zero.

    Note: this definition replaces the ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` at the top of the notebook.
    """
    dot_product = np.dot(a, b)
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

In [12]:
# Score every pair of senses that belong to two different headwords.
# The score is cosine similarity divided by Euclidean distance.
# NOTE(review): two different words with identical embeddings would give a zero
# distance and hence a division by zero — confirm this cannot happen in the data.

# Read the labels once. The original looked up dict_df.iloc[...]["Word"] several
# times per pair, which builds a whole row object inside an O(n^2) loop.
sense_names = dict_df["Word"].tolist()
# Headword of each sense: the text before the first underscore ("PACK" for "PACK_14").
sense_bases = [name[0: name.index("_")] for name in sense_names]

similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    # j starts at i; the (i, i) pair is rejected by the same-headword test below.
    for j in range(i, len(matrix)):
        if sense_bases[i] == sense_bases[j]:
            continue
        b = matrix[j]
        similarities.append([sense_names[i], sense_names[j], cosine_similarity(a, b)/math.dist(a, b)])

df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,BUTTER_1,CANARY_1,0.008107
1,BUTTER_1,CANARY_2,0.032113
2,BUTTER_1,CANARY_3,0.025658
3,BUTTER_1,CANARY_4,0.014139
4,BUTTER_1,CANARY_5,0.018282
...,...,...,...
9417,STUFF_18,TRIANGLE_2,-0.000936
9418,STUFF_18,TRIANGLE_3,0.010348
9419,STUFF_18,TRIANGLE_4,-0.008761
9420,STUFF_18,TRIANGLE_5,-0.011309


In [13]:
# Tidy the pair table: drop exact duplicate rows and rows containing NaN,
# then put the highest-scoring pairs first.
df = (
    df.drop_duplicates()
    .dropna()
    .sort_values(by="similarity", ascending=False)
)

# Size of the subset whose score is strictly positive.
df.loc[df["similarity"] > 0].shape

(8500, 3)

In [15]:
# Symmetric lookup table: both (x, y) and (y, x) map to the pair's score, so
# callers do not need to know in which order a pair was stored.
relation_dict = {}

for first, second, score in zip(df["word_1"], df["word_2"], df["similarity"]):
    relation_dict[(first, second)] = score
    relation_dict[(second, first)] = score

relation_dict

{('JAM_3', 'STUFF_10'): 0.17304220730495526,
 ('STUFF_10', 'JAM_3'): 0.17304220730495526,
 ('SQUEEZE_4', 'STUFF_10'): 0.16775744834835582,
 ('STUFF_10', 'SQUEEZE_4'): 0.16775744834835582,
 ('JAM_3', 'SQUEEZE_4'): 0.16253552323301995,
 ('SQUEEZE_4', 'JAM_3'): 0.16253552323301995,
 ('PACK_14', 'STUFF_11'): 0.14532474077744387,
 ('STUFF_11', 'PACK_14'): 0.14532474077744387,
 ('PACK_14', 'STUFF_10'): 0.138524232817691,
 ('STUFF_10', 'PACK_14'): 0.138524232817691,
 ('PACK_14', 'SQUEEZE_4'): 0.12477023733524296,
 ('SQUEEZE_4', 'PACK_14'): 0.12477023733524296,
 ('SQUEEZE_4', 'STUFF_16'): 0.1228953363071404,
 ('STUFF_16', 'SQUEEZE_4'): 0.1228953363071404,
 ('HORSE_5', 'TRIANGLE_4'): 0.12117177055547135,
 ('TRIANGLE_4', 'HORSE_5'): 0.12117177055547135,
 ('PACK_14', 'STUFF_15'): 0.1107306338783885,
 ('STUFF_15', 'PACK_14'): 0.1107306338783885,
 ('JAM_3', 'PACK_14'): 0.11039654374741797,
 ('PACK_14', 'JAM_3'): 0.11039654374741797,
 ('HORSE_12', 'PACK_20'): 0.10972028397176875,
 ('PACK_20', 'HORSE

In [None]:
def similarity_4(a, b, c, d):
    """Return the sum of the six pairwise scores among four sense labels.

    Scores are read from the module-level ``relation_dict``, keyed by
    ``(label, label)`` tuples. A KeyError is raised if any of the six pairs
    is missing from that table.
    """
    pair_keys = ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d))

    # Accumulate left to right, starting from the first pair's score.
    total = relation_dict[pair_keys[0]]
    for key in pair_keys[1:]:
        total = total + relation_dict[key]
    return total

In [16]:
from itertools import combinations

def check_overlap(combo):
    """Report whether the first four labels in ``combo`` come from four different words.

    Each label is expected to look like ``"WORD_3"``; the part before the first
    underscore, with surrounding whitespace trimmed, is treated as the word.
    Despite the name, the result is ``True`` when there is NO overlap and
    ``False`` as soon as two of the labels share a word.

    Raises IndexError if ``combo`` has fewer than four items and ValueError if
    a label contains no underscore.
    """
    stems = []
    for position in range(4):
        label = combo[position]
        stems.append(label[0: label.index("_")].strip())

    # Four labels with four distinct stems means no two labels share a word.
    return len(set(stems)) == 4

# Empty list; nothing in the cells visible here reads or appends to it afterwards.
sim_4 = []

# Pool of sense labels ("BUTTER_1", ...) that the group search draws from.
# At this point it is still a pandas Series; the removal cells later replace it with a plain list.
specified_words = dict_df["Word"]

specified_words

0        BUTTER_1
1        BUTTER_2
2        BUTTER_3
3        BUTTER_4
4        BUTTER_5
          ...    
140    TRIANGLE_2
141    TRIANGLE_3
142    TRIANGLE_4
143    TRIANGLE_5
144    TRIANGLE_6
Name: Word, Length: 145, dtype: object

In [17]:
import heapq

def similarity_4(a, b, c, d):
    """Score a group of four sense labels by adding up its six pairwise scores.

    Every pair is looked up in the module-level ``relation_dict`` (keys are
    ``(label, label)`` tuples); a missing pair raises KeyError.
    """
    ab = relation_dict[(a, b)]
    ac = relation_dict[(a, c)]
    ad = relation_dict[(a, d)]
    bc = relation_dict[(b, c)]
    bd = relation_dict[(b, d)]
    cd = relation_dict[(c, d)]
    return ab + ac + ad + bc + bd + cd

# Exhaustive search over all groups of four senses (positions i < j < k < l)
# whose senses belong to four different headwords; each group is scored with
# similarity_4.

# Work on a plain list so that indexing is positional. At this point
# specified_words is a pandas Series, where [j] is a label lookup.
sense_names = list(specified_words)
# Parse each headword ("PACK" from "PACK_14") once, not repeatedly inside the innermost loop.
sense_bases = [name[0:name.index("_")] for name in sense_names]
n_senses = len(sense_names)

sim_scores = {}
for i in range(n_senses):
    a = sense_names[i]
    for j in range(i + 1, n_senses):
        if sense_bases[j] == sense_bases[i]:
            continue
        b = sense_names[j]
        for k in range(j + 1, n_senses):
            if sense_bases[k] == sense_bases[i] or sense_bases[k] == sense_bases[j]:
                continue
            c = sense_names[k]
            for l in range(k + 1, n_senses):
                if sense_bases[l] == sense_bases[i] or sense_bases[l] == sense_bases[j] or sense_bases[l] == sense_bases[k]:
                    continue
                d = sense_names[l]

                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

# (score, [labels]) entries; heapify builds the heap in one linear pass instead
# of pushing the entries one at a time.
sim_heap = [(score, list(group)) for group, score in sim_scores.items()]
heapq.heapify(sim_heap)

result = heapq.nlargest(10, sim_heap)  # Adjust the number of results as needed

In [18]:
# Recompute and display the ten highest-scoring groups; this repeats the
# nlargest call that ends the previous cell.
result = heapq.nlargest(10, sim_heap) 
result

[(0.8770261927866829, ['JAM_3', 'PACK_14', 'SQUEEZE_4', 'STUFF_10']),
 (0.7459515444889033, ['JAM_3', 'PACK_14', 'SQUEEZE_4', 'STUFF_11']),
 (0.727514664568595, ['JAM_3', 'PACK_13', 'SQUEEZE_4', 'STUFF_10']),
 (0.7214545595104885, ['JAM_3', 'PACK_24', 'SQUEEZE_4', 'STUFF_10']),
 (0.7197870614606461, ['JAM_3', 'PACK_20', 'SQUEEZE_4', 'STUFF_10']),
 (0.7146951907947726, ['COW_4', 'JAM_3', 'SQUEEZE_4', 'STUFF_10']),
 (0.7122791788349104, ['FIRE_10', 'JAM_3', 'SQUEEZE_4', 'STUFF_10']),
 (0.7033516714023981, ['JAM_3', 'PACK_26', 'SQUEEZE_4', 'STUFF_10']),
 (0.6992037538128218, ['JAM_3', 'PACK_14', 'SQUEEZE_1', 'STUFF_10']),
 (0.6969825861376036, ['CANARY_7', 'JAM_3', 'SQUEEZE_4', 'STUFF_10'])]

In [19]:
# Take the best-scoring group as solved.
removals = result[0][1]

# Headwords of the solved group; every sense of those words leaves the pool.
solved_bases = {sense[0:sense.index("_")] for sense in removals}
specified_words = [
    sense for sense in specified_words
    if sense[0:sense.index("_")] not in solved_bases
]

specified_words

['BUTTER_1',
 'BUTTER_2',
 'BUTTER_3',
 'BUTTER_4',
 'BUTTER_5',
 'CANARY_1',
 'CANARY_2',
 'CANARY_3',
 'CANARY_4',
 'CANARY_5',
 'CANARY_6',
 'CANARY_7',
 'CAT_1',
 'CAT_2',
 'CAT_3',
 'CAT_4',
 'CAT_5',
 'CAT_6',
 'CAT_7',
 'COW_1',
 'COW_2',
 'COW_3',
 'COW_4',
 'COW_5',
 'DRAGON_1',
 'DRAGON_2',
 'DRAGON_3',
 'DRAGON_4',
 'DRAGON_5',
 'DRAGON_6',
 'DRAGON_7',
 'DRAGON_8',
 'FIRE_1',
 'FIRE_2',
 'FIRE_3',
 'FIRE_4',
 'FIRE_5',
 'FIRE_6',
 'FIRE_7',
 'FIRE_8',
 'FIRE_9',
 'FIRE_10',
 'FIRE_11',
 'FIRE_12',
 'FIRE_13',
 'FIRE_14',
 'FIRE_15',
 'FIRE_16',
 'FIRE_17',
 'FIRE_18',
 'FIRE_19',
 'FIRE_20',
 'FIRE_21',
 'HORSE_1',
 'HORSE_2',
 'HORSE_3',
 'HORSE_4',
 'HORSE_5',
 'HORSE_6',
 'HORSE_7',
 'HORSE_8',
 'HORSE_9',
 'HORSE_10',
 'HORSE_11',
 'HORSE_12',
 'HORSE_13',
 'HORSE_14',
 'HORSE_15',
 'HORSE_16',
 'HORSE_17',
 'MOUNTAIN_1',
 'MOUNTAIN_2',
 'MOUNTAIN_3',
 'MOUNTAIN_4',
 'MOUNTAIN_5',
 'RAT_1',
 'RAT_2',
 'RAT_3',
 'RAT_4',
 'RAT_5',
 'TRIANGLE_1',
 'TRIANGLE_2',
 'TRIANGLE

In [20]:
# Second search pass: same exhaustive scoring of four-sense groups
# (positions i < j < k < l, four different headwords), run on the reduced pool.

sense_names = list(specified_words)
# Parse each headword ("CAT" from "CAT_3") once, not repeatedly inside the innermost loop.
sense_bases = [name[0:name.index("_")] for name in sense_names]
n_senses = len(sense_names)

sim_scores = {}
for i in range(n_senses):
    a = sense_names[i]
    for j in range(i + 1, n_senses):
        if sense_bases[j] == sense_bases[i]:
            continue
        b = sense_names[j]
        for k in range(j + 1, n_senses):
            if sense_bases[k] == sense_bases[i] or sense_bases[k] == sense_bases[j]:
                continue
            c = sense_names[k]
            for l in range(k + 1, n_senses):
                if sense_bases[l] == sense_bases[i] or sense_bases[l] == sense_bases[j] or sense_bases[l] == sense_bases[k]:
                    continue
                d = sense_names[l]

                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

# (score, [labels]) entries; heapify builds the heap in one linear pass instead
# of pushing the entries one at a time.
sim_heap = [(score, list(group)) for group, score in sim_scores.items()]
heapq.heapify(sim_heap)

result = heapq.nlargest(10, sim_heap)

In [21]:
# Ten highest-scoring groups from the second pass (first solved group already removed from the pool).
result

[(0.5382584714322605, ['CAT_3', 'DRAGON_5', 'HORSE_5', 'TRIANGLE_4']),
 (0.52669100716111, ['DRAGON_5', 'FIRE_9', 'HORSE_5', 'TRIANGLE_4']),
 (0.5012709389461204, ['DRAGON_5', 'FIRE_21', 'HORSE_5', 'TRIANGLE_4']),
 (0.48412101705727084, ['CAT_2', 'DRAGON_5', 'HORSE_5', 'TRIANGLE_4']),
 (0.4522310570838063, ['CAT_3', 'DRAGON_5', 'HORSE_4', 'TRIANGLE_4']),
 (0.44625334526634053, ['COW_5', 'DRAGON_5', 'HORSE_5', 'TRIANGLE_4']),
 (0.4414750575987826, ['CAT_5', 'DRAGON_5', 'HORSE_5', 'TRIANGLE_4']),
 (0.4412561133566408, ['DRAGON_5', 'FIRE_16', 'HORSE_5', 'TRIANGLE_4']),
 (0.44125063266436143, ['DRAGON_5', 'FIRE_9', 'HORSE_16', 'TRIANGLE_4']),
 (0.4374213182301791, ['CAT_7', 'DRAGON_5', 'HORSE_5', 'TRIANGLE_4'])]

In [22]:
# NOTE(review): this takes the THIRD-ranked group (index 2), not the top one as
# in the first pass — presumably a manual pick; confirm it is intentional.
removals = result[2][1]

# Headwords of the chosen group; every sense of those words leaves the pool.
solved_bases = {sense[0:sense.index("_")] for sense in removals}
specified_words = [
    sense for sense in specified_words
    if sense[0:sense.index("_")] not in solved_bases
]

specified_words

['BUTTER_1',
 'BUTTER_2',
 'BUTTER_3',
 'BUTTER_4',
 'BUTTER_5',
 'CANARY_1',
 'CANARY_2',
 'CANARY_3',
 'CANARY_4',
 'CANARY_5',
 'CANARY_6',
 'CANARY_7',
 'CAT_1',
 'CAT_2',
 'CAT_3',
 'CAT_4',
 'CAT_5',
 'CAT_6',
 'CAT_7',
 'COW_1',
 'COW_2',
 'COW_3',
 'COW_4',
 'COW_5',
 'MOUNTAIN_1',
 'MOUNTAIN_2',
 'MOUNTAIN_3',
 'MOUNTAIN_4',
 'MOUNTAIN_5',
 'RAT_1',
 'RAT_2',
 'RAT_3',
 'RAT_4',
 'RAT_5']

In [None]:
# Third search pass: same exhaustive scoring of four-sense groups
# (positions i < j < k < l, four different headwords), run on what is left of the pool.

sense_names = list(specified_words)
# Parse each headword ("RAT" from "RAT_2") once, not repeatedly inside the innermost loop.
sense_bases = [name[0:name.index("_")] for name in sense_names]
n_senses = len(sense_names)

sim_scores = {}
for i in range(n_senses):
    a = sense_names[i]
    for j in range(i + 1, n_senses):
        if sense_bases[j] == sense_bases[i]:
            continue
        b = sense_names[j]
        for k in range(j + 1, n_senses):
            if sense_bases[k] == sense_bases[i] or sense_bases[k] == sense_bases[j]:
                continue
            c = sense_names[k]
            for l in range(k + 1, n_senses):
                if sense_bases[l] == sense_bases[i] or sense_bases[l] == sense_bases[j] or sense_bases[l] == sense_bases[k]:
                    continue
                d = sense_names[l]

                sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

# (score, [labels]) entries; heapify builds the heap in one linear pass instead
# of pushing the entries one at a time.
sim_heap = [(score, list(group)) for group, score in sim_scores.items()]
heapq.heapify(sim_heap)

# This pass keeps the top 20 rather than the top 10.
result = heapq.nlargest(20, sim_heap)

In [None]:
# Top-20 groups from the third pass.
# NOTE(review): the output saved under this cell lists BOB/CROSS/FEAST/FREE/HOOK
# senses, none of which are in `words`, and the cell has no execution count.
# It looks like stale output from a different run; re-execute before relying on it.
result

[(0.4598253956623111, ['BOB_19', 'CROSS_23', 'FREE_12', 'HOOK_8']),
 (0.4572954417799653, ['BOB_19', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.43147372950276913, ['CROSS_15', 'FEAST_7', 'FREE_12', 'HOOK_8']),
 (0.4180698045298762, ['BOB_19', 'CROSS_23', 'FREE_14', 'HOOK_8']),
 (0.41534531607761693, ['CROSS_15', 'FEAST_7', 'FREE_22', 'HOOK_8']),
 (0.414734836875873, ['BOB_19', 'CROSS_23', 'FREE_11', 'HOOK_8']),
 (0.41267333053821575, ['BOB_19', 'CROSS_23', 'FREE_23', 'HOOK_8']),
 (0.41166386153397816, ['BOB_19', 'CROSS_15', 'FREE_12', 'HOOK_8']),
 (0.4103419735036203, ['BOB_19', 'CROSS_15', 'FEAST_7', 'FREE_12']),
 (0.4093626864335424, ['BOB_11', 'CROSS_15', 'FEAST_7', 'FREE_8']),
 (0.4091278715676069, ['BOB_11', 'CROSS_15', 'FEAST_7', 'FREE_12']),
 (0.40812870079943103, ['BOB_16', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.4077836256443871, ['BOB_17', 'CROSS_23', 'FREE_22', 'HOOK_8']),
 (0.4071188206273677, ['BOB_17', 'CROSS_23', 'FREE_12', 'HOOK_8']),
 (0.4066377584391227, ['BOB_19', 'CROSS_