In [58]:
import numpy as np
import pandas as pd
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
# Pick the compute device as a device *string* in both branches.
# The previous code produced an int (torch.cuda.current_device()) on GPU and a
# str on CPU; "cuda" selects the current CUDA device and keeps the type uniform.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used to embed the dictionary definitions.
retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device=device,
)

In [60]:
# Load the dictionary table (one row per word definition) from disk.
dictionary_csv = "data/dictionary.csv"
dict_df = pd.read_csv(dictionary_csv)

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [None]:
# The sixteen upper-case words to be grouped, listed four per line.
words = [
    "PLUTO", "RAY", "SPONGE", "SQUID",
    "CIRCLE", "DIAMOND", "SQUARE", "TRIANGLE",
    "BOB", "CROSS", "HOOK", "WEAVE",
    "FEAST", "FREE", "PANTS", "THAT",
]

In [None]:
# Upper-case the headwords so they can be matched against the upper-case `words` list.
uppercased_words = dict_df["Word"].str.upper()
dict_df["Word"] = uppercased_words

In [None]:
# Keep only the definitions belonging to the words of interest.
is_selected_word = dict_df["Word"].isin(words)

# reset_index() is called without drop=True, so the previous row labels are
# kept as a new "index" column and the frame gets a fresh 0..n-1 index.
dict_df = dict_df.loc[is_selected_word].reset_index()
dict_df

In [None]:
# Number the definitions of each word 1, 2, 3, ... in row order.
dict_df['word_number'] = dict_df.groupby('Word').cumcount() + 1

# Turn every headword into a unique label "<WORD>_<n>" (e.g. "RAY_2").
# Vectorised string concatenation replaces the former row-wise apply(axis=1).
# NOTE: this cell is not idempotent - running it a second time appends another
# "_<n>" suffix to the already-suffixed labels.
dict_df['Word'] = dict_df['Word'] + "_" + dict_df['word_number'].astype(str)

dict_df

In [None]:
# (rows, columns) of dict_df at this point in the notebook.
dict_df.shape

In [None]:
# encode() is documented to take a string or a list of strings, so hand it a
# plain list of definitions rather than a pandas Series.
definitions = dict_df['Definition'].tolist()
embeddings = retriever.encode(definitions)

embeddings.shape

In [None]:
# Alias only: `matrix` is bound to the same object as `embeddings` (no copy is made).
matrix = embeddings

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    NOTE: this definition shadows the ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` in the notebook's import cell.
    """
    magnitude_product = norm(a) * norm(b)
    return np.dot(a, b) / magnitude_product

In [None]:
# Pull the labels and their base words (text before the first "_") out of the
# DataFrame once, instead of calling dict_df.iloc[...] and re-slicing the
# strings on every iteration of the O(n^2) loop below.
labels = dict_df["Word"].tolist()
base_words = [label[0: label.index("_")] for label in labels]

similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    # j starts at i; the (i, i) pair is skipped by the base-word check below.
    for j in range(i, len(matrix)):
        # Only compare definitions that belong to different words.
        if base_words[i] == base_words[j]:
            continue
        b = matrix[j]
        # Score = cosine similarity divided by Euclidean distance.
        # NOTE(review): a distance of 0 between two different words' embeddings
        # would make this a division by zero - confirm how that should be handled.
        similarities.append([labels[i], labels[j], cosine_similarity(a, b) / math.dist(a, b)])

df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

In [None]:
# Remove duplicate rows and rows with missing values, then order the pairs
# from highest to lowest similarity score.
df = (
    df
    .drop_duplicates()
    .dropna()
    .sort_values(by="similarity", ascending=False)
)

# Shape of the subset of pairs whose score is strictly positive.
has_positive_score = df["similarity"] > 0
df[has_positive_score].shape

In [None]:
# Symmetric lookup table: both (word_1, word_2) and (word_2, word_1) map to
# the same similarity score.
relation_dict = {}

for first, second, score in zip(df["word_1"], df["word_2"], df["similarity"]):
    relation_dict[(first, second)] = score
    relation_dict[(second, first)] = score

relation_dict

In [None]:
def similarity_4(a, b, c, d):
    """Return the sum of the six pairwise scores among ``a``, ``b``, ``c`` and ``d``.

    Scores are read from the module-level ``relation_dict``; a ``KeyError`` is
    raised if any of the six pairs is missing from it.
    """
    pairs = [(a, b), (a, c), (a, d), (b, c), (b, d), (c, d)]
    # Accumulate left to right, starting from the first pair's score.
    total = relation_dict[pairs[0]]
    for pair in pairs[1:]:
        total = total + relation_dict[pair]
    return total

In [None]:
from itertools import combinations

def check_overlap(combo):
    """Return True when the first four labels in ``combo`` have distinct base words.

    A label's base word is the text before its first "_" with surrounding
    whitespace stripped. The result is False as soon as any two of the four
    labels share a base word.
    """
    bases = [combo[pos][0: combo[pos].index("_")].strip() for pos in range(4)]
    # All four are pairwise different exactly when no value collapses in a set.
    return len(set(bases)) == len(bases)

sim_4 = []

# Candidate labels as a plain list rather than a pandas Series: the scoring
# loops index this by position (specified_words[j]) and the removal cells
# rebuild it as a list, so keeping one type throughout avoids relying on the
# Series' integer labels matching positions.
specified_words = dict_df["Word"].tolist()

specified_words

In [None]:
import heapq

def similarity_4(a, b, c, d):
    """Sum the six pairwise ``relation_dict`` scores of a four-label group.

    NOTE: a function with this same name and body is also defined in an earlier
    cell of this notebook; this definition rebinds the name.
    """
    ab = relation_dict[(a, b)]
    ac = relation_dict[(a, c)]
    ad = relation_dict[(a, d)]
    bc = relation_dict[(b, c)]
    bd = relation_dict[(b, d)]
    cd = relation_dict[(c, d)]
    return ab + ac + ad + bc + bd + cd

# Score every group of four labels whose members all come from different base
# words (the base word is the text before the first "_").
# The base words are computed once here instead of being re-sliced inside the
# innermost loop, and all lookups are positional.
candidates = list(specified_words)
bases = [label[0:label.index("_")] for label in candidates]
n_candidates = len(candidates)

sim_scores = {}
for i in range(n_candidates):
    for j in range(i + 1, n_candidates):
        if bases[j] == bases[i]:
            continue
        for k in range(j + 1, n_candidates):
            if bases[k] in (bases[i], bases[j]):
                continue
            for m in range(k + 1, n_candidates):
                if bases[m] in (bases[i], bases[j], bases[k]):
                    continue

                quad = (candidates[i], candidates[j], candidates[k], candidates[m])
                sim_scores[quad] = similarity_4(*quad)

# Heap entries are (score, [labels]) so they order by score first.
sim_heap = []
for quad, score in sim_scores.items():
    heapq.heappush(sim_heap, (score, list(quad)))

result = heapq.nlargest(10, sim_heap)  # Adjust the number of results as needed

In [None]:
# The ten largest entries of sim_heap; each entry is a (score, [labels]) tuple.
result = heapq.nlargest(10, sim_heap) 
result

In [None]:
# Labels of the top-ranked group (result entries are (score, [labels])).
removals = result[0][1]

# Drop every candidate whose base word (text before the first "_") belongs to
# one of the removed labels, i.e. all definitions of those four words.
removed_bases = {label[0:label.index("_")] for label in removals}
specified_words = [
    label for label in specified_words
    if label[0:label.index("_")] not in removed_bases
]

specified_words

In [None]:
# Re-score every group of four remaining labels whose members all come from
# different base words (the base word is the text before the first "_").
# The base words are computed once here instead of being re-sliced inside the
# innermost loop, and all lookups are positional.
candidates = list(specified_words)
bases = [label[0:label.index("_")] for label in candidates]
n_candidates = len(candidates)

sim_scores = {}
for i in range(n_candidates):
    for j in range(i + 1, n_candidates):
        if bases[j] == bases[i]:
            continue
        for k in range(j + 1, n_candidates):
            if bases[k] in (bases[i], bases[j]):
                continue
            for m in range(k + 1, n_candidates):
                if bases[m] in (bases[i], bases[j], bases[k]):
                    continue

                quad = (candidates[i], candidates[j], candidates[k], candidates[m])
                sim_scores[quad] = similarity_4(*quad)

# Heap entries are (score, [labels]) so they order by score first.
sim_heap = []
for quad, score in sim_scores.items():
    heapq.heappush(sim_heap, (score, list(quad)))

result = heapq.nlargest(10, sim_heap)

In [None]:
# Display the current top-ranked groups held in `result`.
result

In [None]:
# Labels of the third-ranked group (index 2; result entries are (score, [labels])).
# NOTE(review): the earlier removal cell takes index 0 - presumably index 2 was
# picked by hand after inspecting the output; confirm.
removals = result[2][1]

# Drop every candidate whose base word (text before the first "_") belongs to
# one of the removed labels, i.e. all definitions of those four words.
removed_bases = {label[0:label.index("_")] for label in removals}
specified_words = [
    label for label in specified_words
    if label[0:label.index("_")] not in removed_bases
]

specified_words

In [None]:
# Re-score every group of four remaining labels whose members all come from
# different base words (the base word is the text before the first "_").
# The base words are computed once here instead of being re-sliced inside the
# innermost loop, and all lookups are positional.
candidates = list(specified_words)
bases = [label[0:label.index("_")] for label in candidates]
n_candidates = len(candidates)

sim_scores = {}
for i in range(n_candidates):
    for j in range(i + 1, n_candidates):
        if bases[j] == bases[i]:
            continue
        for k in range(j + 1, n_candidates):
            if bases[k] in (bases[i], bases[j]):
                continue
            for m in range(k + 1, n_candidates):
                if bases[m] in (bases[i], bases[j], bases[k]):
                    continue

                quad = (candidates[i], candidates[j], candidates[k], candidates[m])
                sim_scores[quad] = similarity_4(*quad)

# Heap entries are (score, [labels]) so they order by score first.
sim_heap = []
for quad, score in sim_scores.items():
    heapq.heappush(sim_heap, (score, list(quad)))

result = heapq.nlargest(20, sim_heap)

In [None]:
# Display the current top-ranked groups held in `result`.
result