In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
base_url = "https://en.wikipedia.org/wiki/"

words = [
    "BUFFALO", "COW", "GOAT", "SHEEP", 
    "BEAM", "GLOW", "RADIATE", "SHINE",
    "FLOOR", "HORSE", "RINGS", "VAULT",
    "CUTIE", "ENVY", "EXCEL", "SEEDY"
]

In [8]:
wiki_dict = {"Word": [], "Definition": []}
soups = [BeautifulSoup(requests.get(f'{base_url}{word.capitalize()}').content, 'html.parser') for word in words]

for i, soup in enumerate(soups):
    content = soup.find("div", id="mw-content-text")

    first_para = content.find("p").text

    items = [item.text for item in content.find_all("li")]

    if f'may refer to:' in first_para:
        for item in items:
            item = item.lower()
            if words[i] in item:
                item = item.sub(words[i].lower(), "")

            if len(item) > 5:
                wiki_dict["Word"].append(words[i])
                wiki_dict["Definition"].append("".join(item))


for word in words:
    options = wikipedia.search(word.capitalize(), results=10)
    for option in options:
        try:
            summary = wikipedia.summary(option, sentences=1, auto_suggest=False)
            # summary = page.summary
            if len(summary) > 10:
                wiki_dict["Word"].append(word)
                wiki_dict["Definition"].append(summary)
        except wikipedia.exceptions.DisambiguationError as e:
            pass
wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df



  lis = BeautifulSoup(html).find_all('li')


Unnamed: 0,Word,Definition
0,BEAM,"light beam, or beam of light, a directional pr..."
1,BEAM,laser beam
2,BEAM,radio beam
3,BEAM,"particle beam, a stream of charged or neutral ..."
4,BEAM,"charged particle beam, a spatially localized g..."
...,...,...
548,SEEDY,Seedy Ishmail Njie (born 1 December 1994) is a...
549,SEEDY,Pachyderm Recording Studio is a residential mu...
550,SEEDY,A Full Spoon of Seedy Blues is the fourth albu...
551,SEEDY,"Seedy Bah (born July 6, 1992) is a Gambian foo..."


In [10]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df["Word"] = dict_df["Word"].str.upper()

In [11]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,13608,BEAM,n.,Any large piece of timber or iron long in prop...
1,13609,BEAM,n.,One of the principal horizontal timbers of a b...
2,13610,BEAM,n.,The width of a vessel; as one vessel is said t...
3,13611,BEAM,n.,The bar of a balance from the ends of which th...
4,13612,BEAM,n.,The principal stem or horn of a stag or other ...
...,...,...,...,...
111,168075,VAULT,n.,A leap by aid of the hands or of a pole spring...
112,168078,VAULT,v. t.,To form with a vault or to cover with a vault;...
113,168079,VAULT,v. i.,To leap over; esp. to leap over by aid of the ...
114,168080,VAULT,n.,To leap; to bound; to jump; to spring.


In [12]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

wiki_df

Unnamed: 0,Word,Definition
0,BEAM,"light beam, or beam of light, a directional pr..."
1,BEAM,laser beam
2,BEAM,radio beam
3,BEAM,"particle beam, a stream of charged or neutral ..."
4,BEAM,"charged particle beam, a spatially localized g..."
...,...,...
111,VAULT,A leap by aid of the hands or of a pole spring...
112,VAULT,To form with a vault or to cover with a vault;...
113,VAULT,To leap over; esp. to leap over by aid of the ...
114,VAULT,To leap; to bound; to jump; to spring.


In [13]:
wiki_df['word_number'] = wiki_df.groupby('Word').cumcount() + 1

wiki_df['Word'] = wiki_df.apply(lambda row: f"{row['Word']}_{row['word_number']}", axis=1)

wiki_df = wiki_df.dropna()

wiki_df

Unnamed: 0,Word,Definition,word_number
0,BEAM_1,"light beam, or beam of light, a directional pr...",1
1,BEAM_2,laser beam,2
2,BEAM_3,radio beam,3
3,BEAM_4,"particle beam, a stream of charged or neutral ...",4
4,BEAM_5,"charged particle beam, a spatially localized g...",5
...,...,...,...
111,VAULT_51,A leap by aid of the hands or of a pole spring...,51
112,VAULT_52,To form with a vault or to cover with a vault;...,52
113,VAULT_53,To leap over; esp. to leap over by aid of the ...,53
114,VAULT_54,To leap; to bound; to jump; to spring.,54


In [14]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device = device
)

In [15]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word           object
Definition     object
word_number     int64
dtype: object

In [16]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embeddings.shape

(669, 384)

In [17]:
matrix = embeddings

In [18]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [19]:
matrix.shape

(669, 384)

In [20]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i, len(matrix)):
        b = matrix[j]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        if word1[0: word1.index("_")] != word2[0: word2.index("_")]:
            sim = cosine_similarity(a, b)/math.dist(a, b)
            if math.isinf(sim):
                sim = 1
            similarities.append([wiki_df.iloc[i]["Word"], wiki_df.iloc[j]["Word"], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

  sim = cosine_similarity(a, b)/math.dist(a, b)


Unnamed: 0,word_1,word_2,similarity
0,BEAM_1,GLOW_1,0.035505
1,BEAM_1,GLOW_2,0.032546
2,BEAM_1,GLOW_3,0.037327
3,BEAM_1,GLOW_4,0.037377
4,BEAM_1,GLOW_5,0.054260
...,...,...,...
193596,SHINE_181,VAULT_51,0.007824
193597,SHINE_181,VAULT_52,0.012159
193598,SHINE_181,VAULT_53,0.003385
193599,SHINE_181,VAULT_54,0.010487


In [21]:
df = df[df["similarity"] > 0.03]

df

Unnamed: 0,word_1,word_2,similarity
0,BEAM_1,GLOW_1,0.035505
1,BEAM_1,GLOW_2,0.032546
2,BEAM_1,GLOW_3,0.037327
3,BEAM_1,GLOW_4,0.037377
4,BEAM_1,GLOW_5,0.054260
...,...,...,...
193563,SHINE_178,VAULT_48,0.045027
193584,SHINE_180,VAULT_49,0.039800
193585,SHINE_180,VAULT_50,0.033651
193586,SHINE_180,VAULT_51,0.034322


In [22]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [23]:
specified_words = list(wiki_df["Word"])

In [24]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [31]:
import heapq

def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_groups(words):
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    for i, a in enumerate(words):
        printProgressBar(iteration=i, total=len(words))
        for j in range(i + 1, len(words)):
            b = words[j]
            if a[0:a.index("_")] == b[0:b.index("_")]:
                continue
            if (a, b) not in relation_dict:
                continue
            for k in range(j + 1, len(words)):
                c = words[k]
                if a[0:a.index("_")] == c[0:c.index("_")] or b[0:b.index("_")] == c[0:c.index("_")]:
                    continue
                if (a, c) not in relation_dict or (b, c) not in relation_dict:
                    continue
                for l in range(k + 1, len(words)):
                    d = specified_words[l]
                    
                    if a[0:a.index("_")] == d[0:d.index("_")] or b[0:b.index("_")] == d[0:d.index("_")] or c[0:c.index("_")] == d[0:d.index("_")]:
                        continue
                    if ((a, d) not in relation_dict) or ((b, d) not in relation_dict) or ((c, d) not in relation_dict):
                        continue
                    
                    df_dict_scores["a"].append(a)
                    df_dict_scores["a_origin"].append(a.split('_')[0])
                    df_dict_scores["b"].append(b)
                    df_dict_scores["b_origin"].append(b.split('_')[0])
                    df_dict_scores["c"].append(c)
                    df_dict_scores["c_origin"].append(c.split('_')[0])
                    df_dict_scores["d"].append(d)
                    df_dict_scores["d_origin"].append(d.split('_')[0])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
    
    return pd.DataFrame.from_dict(df_dict_scores)
result = find_groups(specified_words)
result

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.9% 

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
0,BEAM_1,BEAM,GLOW_1,GLOW,SHINE_79,SHINE,RADIATE_27,RADIATE,0.296996
1,BEAM_1,BEAM,GLOW_1,GLOW,SHINE_79,SHINE,RADIATE_30,RADIATE,0.283698
2,BEAM_1,BEAM,GLOW_1,GLOW,SHINE_82,SHINE,RADIATE_27,RADIATE,0.292392
3,BEAM_1,BEAM,GLOW_1,GLOW,SHINE_82,SHINE,RADIATE_30,RADIATE,0.284720
4,BEAM_1,BEAM,GLOW_1,GLOW,SHINE_86,SHINE,RADIATE_27,RADIATE,0.276345
...,...,...,...,...,...,...,...,...,...
1119498,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_174,SHINE,VAULT_53,VAULT,0.279546
1119499,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_174,SHINE,VAULT_54,VAULT,0.271080
1119500,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_176,SHINE,VAULT_52,VAULT,0.408949
1119501,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_176,SHINE,VAULT_53,VAULT,0.333976


In [33]:
def not_three_away(df, words):
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)))]
    df = df[~((df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    df = df[~((df['c_origin'].isin(words)) & (df['d_origin'].isin(words)) & (df['a_origin'].isin(words)))]
    df = df[~((df['d_origin'].isin(words)) & (df['a_origin'].isin(words)) & (df['b_origin'].isin(words)))]
    
    return df

In [35]:
def after_win(df, words):
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    
    return df

In [34]:
not_three_away(result, ["BEAM", "GLOW", "SHINE", "RADIATE"])

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
1316,BEAM_1,BEAM,RADIATE_17,RADIATE,RINGS_37,RINGS,FLOOR_12,FLOOR,0.251996
1317,BEAM_1,BEAM,RADIATE_17,RADIATE,RINGS_37,RINGS,FLOOR_13,FLOOR,0.271128
1318,BEAM_1,BEAM,RADIATE_17,RADIATE,RINGS_87,RINGS,FLOOR_6,FLOOR,0.229682
1374,BEAM_1,BEAM,SHINE_93,SHINE,RINGS_87,RINGS,FLOOR_6,FLOOR,0.198588
1469,BEAM_1,BEAM,SHINE_153,SHINE,RINGS_87,RINGS,FLOOR_6,FLOOR,0.207825
...,...,...,...,...,...,...,...,...,...
1119498,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_174,SHINE,VAULT_53,VAULT,0.279546
1119499,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_174,SHINE,VAULT_54,VAULT,0.271080
1119500,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_176,SHINE,VAULT_52,VAULT,0.408949
1119501,HORSE_21,HORSE,RADIATE_30,RADIATE,SHINE_176,SHINE,VAULT_53,VAULT,0.333976


In [28]:
result.sort_values('sim', ascending=False)

Unnamed: 0,a,b,c,d,sim
1086847,BUFFALO_4,COW_1,GOAT_2,SHEEP_5,1.332878
1087550,BUFFALO_4,GOAT_2,SHEEP_5,COW_9,1.286560
1094981,GOAT_2,SHEEP_5,BEAM_54,HORSE_10,1.284639
1087546,BUFFALO_4,GOAT_2,SHEEP_5,HORSE_1,1.283769
1087551,BUFFALO_4,GOAT_2,SHEEP_5,HORSE_10,1.283548
...,...,...,...,...,...
70962,BEAM_12,SHINE_26,VAULT_13,CUTIE_8,0.189352
724453,GLOW_37,SHINE_119,RINGS_30,VAULT_14,0.189332
1072703,SHINE_161,RINGS_2,VAULT_13,HORSE_9,0.189235
1063683,SHINE_107,RINGS_13,GLOW_70,SEEDY_5,0.187074


In [52]:
def remove_words(specified_words, removals):
    for word in removals:
        specified_words = [word1 for word1 in specified_words if word[0:word.index("_")] != word1[0:word1.index("_")]]
        
    return specified_words

In [53]:
from pprint import pprint

In [54]:
specified_words = remove_words(specified_words, result[0][1])

pprint(specified_words)

['RINGS_1',
 'RINGS_2',
 'RINGS_3',
 'RINGS_4',
 'RINGS_5',
 'RINGS_6',
 'RINGS_7',
 'RINGS_8',
 'RINGS_9',
 'RINGS_10',
 'RINGS_11',
 'RINGS_12',
 'RINGS_13',
 'RINGS_14',
 'RINGS_15',
 'RINGS_16',
 'RINGS_17',
 'RINGS_18',
 'RINGS_19',
 'RINGS_20',
 'RINGS_21',
 'RINGS_22',
 'RINGS_23',
 'RINGS_24',
 'RINGS_25',
 'RINGS_26',
 'RINGS_27',
 'RINGS_28',
 'RINGS_29',
 'RINGS_30',
 'RINGS_31',
 'RINGS_32',
 'RINGS_33',
 'RINGS_34',
 'RINGS_35',
 'RINGS_36',
 'RINGS_37',
 'RINGS_38',
 'RINGS_39',
 'RINGS_40',
 'RINGS_41',
 'RINGS_42',
 'RINGS_43',
 'RINGS_44',
 'RINGS_45',
 'RINGS_46',
 'RINGS_47',
 'RINGS_48',
 'RINGS_49',
 'RINGS_50',
 'RINGS_51',
 'RINGS_52',
 'RINGS_53',
 'RINGS_54',
 'RINGS_55',
 'RINGS_56',
 'RINGS_57',
 'RINGS_58',
 'RINGS_59',
 'RINGS_60',
 'RINGS_61',
 'RINGS_62',
 'RINGS_63',
 'RINGS_64',
 'RINGS_65',
 'RINGS_66',
 'RINGS_67',
 'RINGS_68',
 'RINGS_69',
 'RINGS_70',
 'RINGS_71',
 'RINGS_72',
 'RINGS_73',
 'RINGS_74',
 'RINGS_75',
 'RINGS_76',
 'RINGS_77',
 'RINGS_

In [59]:
result = find_n_groups(10, specified_words)

result

['RINGS_1', 'RINGS_2', 'RINGS_3', 'RINGS_4', 'RINGS_5', 'RINGS_6', 'RINGS_7', 'RINGS_8', 'RINGS_9', 'RINGS_10', 'RINGS_11', 'RINGS_12', 'RINGS_13', 'RINGS_14', 'RINGS_15', 'RINGS_16', 'RINGS_17', 'RINGS_18', 'RINGS_19', 'RINGS_20', 'RINGS_21', 'RINGS_22', 'RINGS_23', 'RINGS_24', 'RINGS_25', 'RINGS_26', 'RINGS_27', 'RINGS_28', 'RINGS_29', 'RINGS_30', 'RINGS_31', 'RINGS_32', 'RINGS_33', 'RINGS_34', 'RINGS_35', 'RINGS_36', 'RINGS_37', 'RINGS_38', 'RINGS_39', 'RINGS_40', 'RINGS_41', 'RINGS_42', 'RINGS_43', 'RINGS_44', 'RINGS_45', 'RINGS_46', 'RINGS_47', 'RINGS_48', 'RINGS_49', 'RINGS_50', 'RINGS_51', 'RINGS_52', 'RINGS_53', 'RINGS_54', 'RINGS_55', 'RINGS_56', 'RINGS_57', 'RINGS_58', 'RINGS_59', 'RINGS_60', 'RINGS_61', 'RINGS_62', 'RINGS_63', 'RINGS_64', 'RINGS_65', 'RINGS_66', 'RINGS_67', 'RINGS_68', 'RINGS_69', 'RINGS_70', 'RINGS_71', 'RINGS_72', 'RINGS_73', 'RINGS_74', 'RINGS_75', 'RINGS_76', 'RINGS_77', 'RINGS_78', 'RINGS_79', 'RINGS_80', 'RINGS_81', 'RINGS_82', 'RINGS_83', 'RINGS_84', 

[(0.6249239743587797, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_16']),
 (0.6076357665286503, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_4']),
 (0.5807222706026263, ['RINGS_53', 'VAULT_28', 'FLOOR_3', 'HORSE_4']),
 (0.571612982479853, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'HORSE_16']),
 (0.5674337079925199, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_8']),
 (0.5657858457310665, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_12']),
 (0.5652864262160447, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'FLOOR_11']),
 (0.5633990542802446, ['RINGS_53', 'VAULT_28', 'ENVY_2', 'HORSE_16']),
 (0.5609600504331108, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_8']),
 (0.560735446218461, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_6'])]

In [60]:
specified_words = remove_words(specified_words, result[0][1])

In [64]:
result = find_n_groups(10, specified_words)

result

['BUFFALO_1', 'BUFFALO_2', 'BUFFALO_3', 'BUFFALO_4', 'BUFFALO_5', 'BUFFALO_6', 'COW_1', 'COW_2', 'COW_3', 'COW_4', 'COW_5', 'ENVY_1', 'ENVY_2', 'ENVY_3', 'ENVY_4', 'ENVY_5', 'ENVY_6', 'ENVY_7', 'ENVY_8', 'ENVY_9', 'ENVY_10', 'ENVY_11', 'ENVY_12', 'ENVY_13', 'EXCEL_1', 'EXCEL_2', 'EXCEL_3', 'GOAT_1', 'SEEDY_1', 'SEEDY_2', 'SEEDY_3', 'SHEEP_1', 'SHEEP_2', 'SHEEP_3']
 |█████████████████████████████████████████████████████████████████████████████████████████████████---| 97.1% 

[(0.6249239743587797, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_16']),
 (0.6076357665286503, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_4']),
 (0.5807222706026263, ['RINGS_53', 'VAULT_28', 'FLOOR_3', 'HORSE_4']),
 (0.571612982479853, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'HORSE_16']),
 (0.5674337079925199, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_8']),
 (0.5657858457310665, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_12']),
 (0.5652864262160447, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'FLOOR_11']),
 (0.5633990542802446, ['RINGS_53', 'VAULT_28', 'ENVY_2', 'HORSE_16']),
 (0.5609600504331108, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_8']),
 (0.560735446218461, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_6'])]