In [1]:
import requests
import pandas as pd
import wikipedia

In [2]:
base_url = "https://en.wikipedia.org/wiki/"

words = [
    "BUFFALO", "COW", "GOAT", "SHEEP", 
    "BEAM", "GLOW", "RADIATE", "SHINE",
    "FLOOR", "HORSE", "RINGS", "VAULT",
    "CUTIE", "ENVY", "EXCEL", "SEEDY"
]

In [5]:
wiki_dict = {"Word": [], "Definition": []}
    
    soups = [BeautifulSoup(requests.get(f'{base_url}{word.capitalize()}').content, 'html.parser') for word in words]

    for i, soup in enumerate(soups):
        content = soup.find("div", id="mw-content-text")

        first_para = content.find("p").text

        items = [item.text for item in content.find_all("li")]

        if f'may refer to:' in first_para:
            for item in items:
                item = item.lower()
                if words[i] in item:
                    item = item.sub(words[i].lower(), "")

                if len(item) > 5:
                    wiki_dict["Word"].append(words[i])
                    wiki_dict["Definition"].append("".join(item))


    for word in words:
        options = wikipedia.search(word.capitalize(), results=10)
        for option in options:
            try:
                summary = wikipedia.summary(option, sentences=1, auto_suggest=False)
                # summary = page.summary
                if len(summary) > 10:
                    wiki_dict["Word"].append(word)
                    wiki_dict["Definition"].append(summary)
            except wikipedia.exceptions.DisambiguationError as e:
                pass
    wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df

['Buffalo', 'Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo', 'Buffalo, New York', 'Buffalo Bills', 'Water buffalo', 'University at Buffalo', 'Buffalo wing', 'Buffalo Springfield', 'South Buffalo, Buffalo', 'Buffalo Bill']




  lis = BeautifulSoup(html).find_all('li')


['Cow (disambiguation)', 'Cow dung', 'Cattle', 'Cowes', 'Cow-Cow Boogie', 'Cow tipping', 'Cow corner', 'Beef', 'First Cow', 'Black cow']
['Goat', 'Goat (disambiguation)', 'Goat (zodiac)', 'Goat meat', 'Mountain goat', 'Goat cheese', 'Gävle goat', 'Fainting goat', 'Goat farming', 'Sea goat']
['Sheep', 'Sheep dog', 'Dolly (sheep)', 'Goat (zodiac)', 'Sheep farming', 'The Sheep', 'Bighorn sheep', "Sheep's milk", 'Shaun the Sheep', 'Lamb and mutton']
['Beam', 'Beam Beam', 'Beam (music)', 'Android Beam', 'Grade beam', 'Balance beam', 'Beam (nautical)', 'Beam bridge', 'Beam (structure)', 'Beam steering']
['Glow', 'Glow On', 'GLOW (TV series)', 'Glow stick', 'Glow-in-the-dark', 'Glow & Lovely', 'Glowworm', 'List of GLOW episodes', 'Gorgeous Ladies of Wrestling', "Glow Up: Britain's Next Make-Up Star"]
['Radiate', 'Radiated tortoise', 'Radiate crown', 'Radiative transfer', 'Radiative equilibrium', 'Effective radiated power', 'Radiate (app)', 'Barbarous radiate', 'Radiate Like This', 'Escape and

Unnamed: 0,Word,Definition
0,BUFFALO,"""Buffalo buffalo Buffalo buffalo buffalo buffa..."
1,BUFFALO,Buffalo is a city in the U.S. state of New Yor...
2,BUFFALO,The Buffalo Bills are a professional American ...
3,BUFFALO,"The water buffalo (Bubalus bubalis), also call..."
4,BUFFALO,"The State University of New York at Buffalo, c..."
...,...,...
135,SEEDY,Seedy Ishmail Njie (born 1 December 1994) is a...
136,SEEDY,Arzette: The Jewel of Faramore is an upcoming ...
137,SEEDY,"Seedy Bah (born July 6, 1992) is a Gambian foo..."
138,SEEDY,Non-Stop Erotic Cabaret is the debut studio al...


In [6]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [7]:
dict_df["Word"] = dict_df["Word"].str.upper()

In [8]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,13608,BEAM,n.,Any large piece of timber or iron long in prop...
1,13609,BEAM,n.,One of the principal horizontal timbers of a b...
2,13610,BEAM,n.,The width of a vessel; as one vessel is said t...
3,13611,BEAM,n.,The bar of a balance from the ends of which th...
4,13612,BEAM,n.,The principal stem or horn of a stag or other ...
...,...,...,...,...
111,168075,VAULT,n.,A leap by aid of the hands or of a pole spring...
112,168078,VAULT,v. t.,To form with a vault or to cover with a vault;...
113,168079,VAULT,v. i.,To leap over; esp. to leap over by aid of the ...
114,168080,VAULT,n.,To leap; to bound; to jump; to spring.


In [36]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

wiki_df

Unnamed: 0,Word,Definition
0,BEAM,of particles moving at approximately equal ve...
1,BEAM,", a penetrating form of high-energy electromag..."
2,BEAM,", streams of electrons observed in discharge t..."
3,BEAM,", a penetrating form of high-energy electromag..."
4,BEAM,of particles moving at approximately equal ve...
...,...,...
111,VAULT,A leap by aid of the hands or of a pole spring...
112,VAULT,To form with a vault or to cover with a vault;...
113,VAULT,To leap over; esp. to leap over by aid of the ...
114,VAULT,To leap; to bound; to jump; to spring.


In [37]:
wiki_df['word_number'] = wiki_df.groupby('Word').cumcount() + 1

wiki_df['Word'] = wiki_df.apply(lambda row: f"{row['Word']}_{row['word_number']}", axis=1)

wiki_df = wiki_df.dropna()

wiki_df

Unnamed: 0,Word,Definition,word_number
0,BEAM_1,of particles moving at approximately equal ve...,1
1,BEAM_2,", a penetrating form of high-energy electromag...",2
2,BEAM_3,", streams of electrons observed in discharge t...",3
3,BEAM_4,", a penetrating form of high-energy electromag...",4
4,BEAM_5,of particles moving at approximately equal ve...,5
...,...,...,...
111,VAULT_43,A leap by aid of the hands or of a pole spring...,43
112,VAULT_44,To form with a vault or to cover with a vault;...,44
113,VAULT_45,To leap over; esp. to leap over by aid of the ...,45
114,VAULT_46,To leap; to bound; to jump; to spring.,46


In [38]:
import numpy as np
import pandas as pd
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm

In [39]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device = device
)

In [40]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word           object
Definition     object
word_number     int64
dtype: object

In [41]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embeddings.shape

(515, 384)

In [42]:
matrix = embeddings

In [43]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [44]:
matrix.shape

(515, 384)

In [45]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i, len(matrix)):
        b = matrix[j]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        if word1[0: word1.index("_")] != word2[0: word2.index("_")]:
            sim = cosine_similarity(a, b)/math.dist(a, b)
            if math.isinf(sim):
                sim = 1
            similarities.append([wiki_df.iloc[i]["Word"], wiki_df.iloc[j]["Word"], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,BEAM_1,GLOW_1,-0.004880
1,BEAM_1,GLOW_2,-0.007473
2,BEAM_1,GLOW_3,0.050762
3,BEAM_1,GLOW_4,0.037785
4,BEAM_1,GLOW_5,0.016561
...,...,...,...
108992,SHINE_166,VAULT_43,0.007824
108993,SHINE_166,VAULT_44,0.012159
108994,SHINE_166,VAULT_45,0.003385
108995,SHINE_166,VAULT_46,0.010487


In [46]:
df = df[df["similarity"] > 0.03]

df

Unnamed: 0,word_1,word_2,similarity
2,BEAM_1,GLOW_3,0.050762
3,BEAM_1,GLOW_4,0.037785
74,BEAM_1,RADIATE_14,0.074011
221,BEAM_1,SHINE_144,0.096188
295,BEAM_1,RINGS_63,0.030083
...,...,...,...
108959,SHINE_163,VAULT_40,0.045027
108980,SHINE_165,VAULT_41,0.039800
108981,SHINE_165,VAULT_42,0.033651
108982,SHINE_165,VAULT_43,0.034322


In [47]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [48]:
specified_words = list(wiki_df["Word"])

In [49]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [58]:
import heapq

def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_n_groups(n, words):
    print(words)
    sim_scores = {}
    for i, a in enumerate(words):
        printProgressBar(iteration=i, total=len(words))
        for j in range(i + 1, len(words)):
            b = words[j]
            if a[0:a.index("_")] == b[0:b.index("_")]:
                continue
            if (a, b) not in relation_dict:
                continue
            for k in range(j + 1, len(words)):
                c = words[k]
                if a[0:a.index("_")] == c[0:c.index("_")] or b[0:b.index("_")] == c[0:c.index("_")]:
                    continue
                if (a, c) not in relation_dict or (b, c) not in relation_dict:
                    continue
                for l in range(k + 1, len(words)):
                    d = specified_words[l]
                    
                    if a[0:a.index("_")] == d[0:d.index("_")] or b[0:b.index("_")] == d[0:d.index("_")] or c[0:c.index("_")] == d[0:d.index("_")]:
                        continue
                    if ((a, d) not in relation_dict) or ((b, d) not in relation_dict) or ((c, d) not in relation_dict):
                        continue
                        
                    sim_scores[(a, b, c, d)] = similarity_4(a, b, c, d)

    sim_heap = []
    for (a, b, c, d), score in sim_scores.items():
        heapq.heappush(sim_heap, (score, [a, b, c, d]))
    
    return heapq.nlargest(n, sim_heap) 
result = find_n_groups(10, specified_words)

['RINGS_1', 'RINGS_2', 'RINGS_3', 'RINGS_4', 'RINGS_5', 'RINGS_6', 'RINGS_7', 'RINGS_8', 'RINGS_9', 'RINGS_10', 'RINGS_11', 'RINGS_12', 'RINGS_13', 'RINGS_14', 'RINGS_15', 'RINGS_16', 'RINGS_17', 'RINGS_18', 'RINGS_19', 'RINGS_20', 'RINGS_21', 'RINGS_22', 'RINGS_23', 'RINGS_24', 'RINGS_25', 'RINGS_26', 'RINGS_27', 'RINGS_28', 'RINGS_29', 'RINGS_30', 'RINGS_31', 'RINGS_32', 'RINGS_33', 'RINGS_34', 'RINGS_35', 'RINGS_36', 'RINGS_37', 'RINGS_38', 'RINGS_39', 'RINGS_40', 'RINGS_41', 'RINGS_42', 'RINGS_43', 'RINGS_44', 'RINGS_45', 'RINGS_46', 'RINGS_47', 'RINGS_48', 'RINGS_49', 'RINGS_50', 'RINGS_51', 'RINGS_52', 'RINGS_53', 'RINGS_54', 'RINGS_55', 'RINGS_56', 'RINGS_57', 'RINGS_58', 'RINGS_59', 'RINGS_60', 'RINGS_61', 'RINGS_62', 'RINGS_63', 'RINGS_64', 'RINGS_65', 'RINGS_66', 'RINGS_67', 'RINGS_68', 'RINGS_69', 'RINGS_70', 'RINGS_71', 'RINGS_72', 'RINGS_73', 'RINGS_74', 'RINGS_75', 'RINGS_76', 'RINGS_77', 'RINGS_78', 'RINGS_79', 'RINGS_80', 'RINGS_81', 'RINGS_82', 'RINGS_83', 'RINGS_84', 

In [51]:
result

[(0.8380656434115308, ['BEAM_51', 'GLOW_62', 'RADIATE_18', 'SHINE_160']),
 (0.8364288142703187, ['BEAM_28', 'RINGS_53', 'VAULT_28', 'HORSE_4']),
 (0.8335392279692238, ['BEAM_51', 'GLOW_62', 'RADIATE_18', 'SHINE_156']),
 (0.8170060981947647, ['BEAM_51', 'GLOW_62', 'RADIATE_21', 'SHINE_156']),
 (0.8119338743459324, ['BEAM_28', 'RINGS_53', 'VAULT_28', 'HORSE_6']),
 (0.8030449858802665, ['BEAM_28', 'RINGS_53', 'VAULT_28', 'COW_5']),
 (0.7889514995418128, ['BEAM_47', 'GLOW_62', 'RADIATE_18', 'SHINE_156']),
 (0.7876360862997536, ['BEAM_51', 'GLOW_62', 'RADIATE_21', 'SHINE_160']),
 (0.7859256511971532, ['BEAM_28', 'RINGS_53', 'VAULT_28', 'HORSE_8']),
 (0.7841915366822296, ['BEAM_28', 'RINGS_53', 'VAULT_28', 'HORSE_10'])]

In [52]:
def remove_words(specified_words, removals):
    for word in removals:
        specified_words = [word1 for word1 in specified_words if word[0:word.index("_")] != word1[0:word1.index("_")]]
        
    return specified_words

In [53]:
from pprint import pprint

In [54]:
specified_words = remove_words(specified_words, result[0][1])

pprint(specified_words)

['RINGS_1',
 'RINGS_2',
 'RINGS_3',
 'RINGS_4',
 'RINGS_5',
 'RINGS_6',
 'RINGS_7',
 'RINGS_8',
 'RINGS_9',
 'RINGS_10',
 'RINGS_11',
 'RINGS_12',
 'RINGS_13',
 'RINGS_14',
 'RINGS_15',
 'RINGS_16',
 'RINGS_17',
 'RINGS_18',
 'RINGS_19',
 'RINGS_20',
 'RINGS_21',
 'RINGS_22',
 'RINGS_23',
 'RINGS_24',
 'RINGS_25',
 'RINGS_26',
 'RINGS_27',
 'RINGS_28',
 'RINGS_29',
 'RINGS_30',
 'RINGS_31',
 'RINGS_32',
 'RINGS_33',
 'RINGS_34',
 'RINGS_35',
 'RINGS_36',
 'RINGS_37',
 'RINGS_38',
 'RINGS_39',
 'RINGS_40',
 'RINGS_41',
 'RINGS_42',
 'RINGS_43',
 'RINGS_44',
 'RINGS_45',
 'RINGS_46',
 'RINGS_47',
 'RINGS_48',
 'RINGS_49',
 'RINGS_50',
 'RINGS_51',
 'RINGS_52',
 'RINGS_53',
 'RINGS_54',
 'RINGS_55',
 'RINGS_56',
 'RINGS_57',
 'RINGS_58',
 'RINGS_59',
 'RINGS_60',
 'RINGS_61',
 'RINGS_62',
 'RINGS_63',
 'RINGS_64',
 'RINGS_65',
 'RINGS_66',
 'RINGS_67',
 'RINGS_68',
 'RINGS_69',
 'RINGS_70',
 'RINGS_71',
 'RINGS_72',
 'RINGS_73',
 'RINGS_74',
 'RINGS_75',
 'RINGS_76',
 'RINGS_77',
 'RINGS_

In [59]:
result = find_n_groups(10, specified_words)

result

['RINGS_1', 'RINGS_2', 'RINGS_3', 'RINGS_4', 'RINGS_5', 'RINGS_6', 'RINGS_7', 'RINGS_8', 'RINGS_9', 'RINGS_10', 'RINGS_11', 'RINGS_12', 'RINGS_13', 'RINGS_14', 'RINGS_15', 'RINGS_16', 'RINGS_17', 'RINGS_18', 'RINGS_19', 'RINGS_20', 'RINGS_21', 'RINGS_22', 'RINGS_23', 'RINGS_24', 'RINGS_25', 'RINGS_26', 'RINGS_27', 'RINGS_28', 'RINGS_29', 'RINGS_30', 'RINGS_31', 'RINGS_32', 'RINGS_33', 'RINGS_34', 'RINGS_35', 'RINGS_36', 'RINGS_37', 'RINGS_38', 'RINGS_39', 'RINGS_40', 'RINGS_41', 'RINGS_42', 'RINGS_43', 'RINGS_44', 'RINGS_45', 'RINGS_46', 'RINGS_47', 'RINGS_48', 'RINGS_49', 'RINGS_50', 'RINGS_51', 'RINGS_52', 'RINGS_53', 'RINGS_54', 'RINGS_55', 'RINGS_56', 'RINGS_57', 'RINGS_58', 'RINGS_59', 'RINGS_60', 'RINGS_61', 'RINGS_62', 'RINGS_63', 'RINGS_64', 'RINGS_65', 'RINGS_66', 'RINGS_67', 'RINGS_68', 'RINGS_69', 'RINGS_70', 'RINGS_71', 'RINGS_72', 'RINGS_73', 'RINGS_74', 'RINGS_75', 'RINGS_76', 'RINGS_77', 'RINGS_78', 'RINGS_79', 'RINGS_80', 'RINGS_81', 'RINGS_82', 'RINGS_83', 'RINGS_84', 

[(0.6249239743587797, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_16']),
 (0.6076357665286503, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_4']),
 (0.5807222706026263, ['RINGS_53', 'VAULT_28', 'FLOOR_3', 'HORSE_4']),
 (0.571612982479853, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'HORSE_16']),
 (0.5674337079925199, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_8']),
 (0.5657858457310665, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_12']),
 (0.5652864262160447, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'FLOOR_11']),
 (0.5633990542802446, ['RINGS_53', 'VAULT_28', 'ENVY_2', 'HORSE_16']),
 (0.5609600504331108, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_8']),
 (0.560735446218461, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_6'])]

In [60]:
specified_words = remove_words(specified_words, result[0][1])

In [64]:
result = find_n_groups(10, specified_words)

result

['BUFFALO_1', 'BUFFALO_2', 'BUFFALO_3', 'BUFFALO_4', 'BUFFALO_5', 'BUFFALO_6', 'COW_1', 'COW_2', 'COW_3', 'COW_4', 'COW_5', 'ENVY_1', 'ENVY_2', 'ENVY_3', 'ENVY_4', 'ENVY_5', 'ENVY_6', 'ENVY_7', 'ENVY_8', 'ENVY_9', 'ENVY_10', 'ENVY_11', 'ENVY_12', 'ENVY_13', 'EXCEL_1', 'EXCEL_2', 'EXCEL_3', 'GOAT_1', 'SEEDY_1', 'SEEDY_2', 'SEEDY_3', 'SHEEP_1', 'SHEEP_2', 'SHEEP_3']
 |█████████████████████████████████████████████████████████████████████████████████████████████████---| 97.1% 

[(0.6249239743587797, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_16']),
 (0.6076357665286503, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_4']),
 (0.5807222706026263, ['RINGS_53', 'VAULT_28', 'FLOOR_3', 'HORSE_4']),
 (0.571612982479853, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'HORSE_16']),
 (0.5674337079925199, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_8']),
 (0.5657858457310665, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_12']),
 (0.5652864262160447, ['RINGS_53', 'VAULT_28', 'ENVY_9', 'FLOOR_11']),
 (0.5633990542802446, ['RINGS_53', 'VAULT_28', 'ENVY_2', 'HORSE_16']),
 (0.5609600504331108, ['RINGS_53', 'VAULT_28', 'FLOOR_11', 'HORSE_8']),
 (0.560735446218461, ['RINGS_53', 'VAULT_28', 'COW_5', 'HORSE_6'])]