In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

In [33]:
base_url = "https://en.wikipedia.org/wiki/"

solutions = [
    ['ANACONDA', 'CAPYBARA', 'JAGUAR', 'TOUCAN'],
    ['BASE', 'BOTTOM', 'FOOT', 'FOUNDATION'],
    ['COMPANY', 'GREASE', 'HAIR', 'RENT'],
    ['CHANGE', 'CUCUMBER', 'LEGS', 'LION']
]

In [34]:
words = []
    
for row in solutions:
    for word in row:
        words.append(word)
        
wiki_dict = {"Word": [], "Definition": []}

In [35]:
wiki_dict = {"Word": [], "Definition": []}
soups = [BeautifulSoup(requests.get(f'{base_url}{word.capitalize()}').content, 'html.parser') for word in words]

# for i, soup in enumerate(soups):
#     content = soup.find("div", id="mw-content-text")

#     first_para = content.find("p").text

#     items = [item.text for item in content.find_all("li")]

#     if f'may refer to:' in first_para:
#         for item in items:
#             item = item.lower()
#             if words[i] in item:
#                 item = item.sub(words[i].lower(), "")

#             if len(item) > 5:
#                 wiki_dict["Word"].append(words[i])
#                 wiki_dict["Definition"].append("".join(item))


for word in words:
    options = wikipedia.search(word.capitalize(), results=10)
    for option in options:
        try:
            summary = wikipedia.summary(option, sentences=1, auto_suggest=False)
            # summary = page.summary
            if len(summary) > 10:
                wiki_dict["Word"].append(word)
                wiki_dict["Definition"].append(summary)
        except wikipedia.exceptions.DisambiguationError as e:
            pass
wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df

Unnamed: 0,Word,Definition
0,ANACONDA,Anacondas or water boas are a group of large s...
1,ANACONDA,Anaconda is a 1997 American adventure horror f...
2,ANACONDA,"The green anaconda (Eunectes murinus), also kn..."
3,ANACONDA,Anaconda is a distribution of the Python and R...
4,ANACONDA,Anacondas: The Hunt for the Blood Orchid is a ...
...,...,...
133,LION,The Lion King is a 2019 American musical drama...
134,LION,"PT Lion Mentari Airlines, operating as Lion Ai..."
135,LION,"The Asiatic lion, also known as the Persian li..."
136,LION,The Detroit Lions are a professional American ...


In [36]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df["Word"] = dict_df["Word"].str.upper()

In [37]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,5627,ANACONDA,n.,A large South American snake of the Boa family...
1,12983,BASE,a.,Of little or less than the usual height; of lo...
2,12984,BASE,a.,Low in place or position.
3,12985,BASE,a.,Of humble birth; or low degree; lowly; mean.
4,12986,BASE,a.,Illegitimate by birth; bastard.
...,...,...,...,...
139,128549,RENT,n.,To grant the possession and enjoyment of for a...
140,128550,RENT,n.,To take and hold under an agreement to pay ren...
141,128551,RENT,v. i.,To be leased or let for rent; as an estate ren...
142,160241,TOUCAN,n.,Any one of numerous species of fruit-eating bi...


In [38]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

wiki_df

Unnamed: 0,Word,Definition
0,ANACONDA,Anacondas or water boas are a group of large s...
1,ANACONDA,Anaconda is a 1997 American adventure horror f...
2,ANACONDA,"The green anaconda (Eunectes murinus), also kn..."
3,ANACONDA,Anaconda is a distribution of the Python and R...
4,ANACONDA,Anacondas: The Hunt for the Blood Orchid is a ...
...,...,...
139,RENT,To grant the possession and enjoyment of for a...
140,RENT,To take and hold under an agreement to pay ren...
141,RENT,To be leased or let for rent; as an estate ren...
142,TOUCAN,Any one of numerous species of fruit-eating bi...


In [39]:
wiki_df['word_number'] = wiki_df.groupby('Word').cumcount() + 1

wiki_df['Word'] = wiki_df.apply(lambda row: f"{row['Word']}_{row['word_number']}", axis=1)

wiki_df = wiki_df.dropna()

wiki_df

Unnamed: 0,Word,Definition,word_number
0,ANACONDA_1,Anacondas or water boas are a group of large s...,1
1,ANACONDA_2,Anaconda is a 1997 American adventure horror f...,2
2,ANACONDA_3,"The green anaconda (Eunectes murinus), also kn...",3
3,ANACONDA_4,Anaconda is a distribution of the Python and R...,4
4,ANACONDA_5,Anacondas: The Hunt for the Blood Orchid is a ...,5
...,...,...,...
139,RENT_19,To grant the possession and enjoyment of for a...,19
140,RENT_20,To take and hold under an agreement to pay ren...,20
141,RENT_21,To be leased or let for rent; as an estate ren...,21
142,TOUCAN_10,Any one of numerous species of fruit-eating bi...,10


In [40]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device = device
)

In [41]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word           object
Definition     object
word_number     int64
dtype: object

In [42]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embeddings.shape

(282, 384)

In [43]:
matrix = embeddings

In [44]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [45]:
matrix.shape

(282, 384)

In [46]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i, len(matrix)):
        b = matrix[j]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        if word1[0: word1.index("_")] != word2[0: word2.index("_")]:
            sim = cosine_similarity(a, b)/math.dist(a, b)
            if math.isinf(sim):
                sim = 1
            similarities.append([wiki_df.iloc[i]["Word"], wiki_df.iloc[j]["Word"], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,ANACONDA_1,CAPYBARA_1,0.070674
1,ANACONDA_1,CAPYBARA_2,0.064560
2,ANACONDA_1,CAPYBARA_3,0.013825
3,ANACONDA_1,CAPYBARA_4,0.010231
4,ANACONDA_1,CAPYBARA_5,0.069738
...,...,...,...
36529,RENT_19,TOUCAN_11,0.005468
36530,RENT_20,TOUCAN_10,0.000125
36531,RENT_20,TOUCAN_11,0.004140
36532,RENT_21,TOUCAN_10,-0.010334


In [47]:
df = df[df["similarity"] > 0.03]

df

Unnamed: 0,word_1,word_2,similarity
0,ANACONDA_1,CAPYBARA_1,0.070674
1,ANACONDA_1,CAPYBARA_2,0.064560
4,ANACONDA_1,CAPYBARA_5,0.069738
5,ANACONDA_1,CAPYBARA_6,0.042046
7,ANACONDA_1,CAPYBARA_8,0.066151
...,...,...,...
36452,JAGUAR_10,LION_12,0.030453
36466,JAGUAR_10,TOUCAN_10,0.043537
36467,JAGUAR_10,TOUCAN_11,0.035157
36480,LION_11,TOUCAN_10,0.045276


In [48]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [49]:
specified_words = list(wiki_df["Word"])

In [50]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [51]:
def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_groups(words):
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    for i, a in enumerate(words):
        printProgressBar(iteration=i, total=len(words))
        for j in range(i + 1, len(words)):
            b = words[j]
            if a[0:a.index("_")] == b[0:b.index("_")]:
                continue
            if (a, b) not in relation_dict:
                continue
            for k in range(j + 1, len(words)):
                c = words[k]
                if a[0:a.index("_")] == c[0:c.index("_")] or b[0:b.index("_")] == c[0:c.index("_")]:
                    continue
                if (a, c) not in relation_dict or (b, c) not in relation_dict:
                    continue
                for l in range(k + 1, len(words)):
                    d = specified_words[l]
                    
                    if a[0:a.index("_")] == d[0:d.index("_")] or b[0:b.index("_")] == d[0:d.index("_")] or c[0:c.index("_")] == d[0:d.index("_")]:
                        continue
                    if ((a, d) not in relation_dict) or ((b, d) not in relation_dict) or ((c, d) not in relation_dict):
                        continue
                    
                    df_dict_scores["a"].append(a)
                    df_dict_scores["a_origin"].append(a.split('_')[0])
                    df_dict_scores["b"].append(b)
                    df_dict_scores["b_origin"].append(b.split('_')[0])
                    df_dict_scores["c"].append(c)
                    df_dict_scores["c_origin"].append(c.split('_')[0])
                    df_dict_scores["d"].append(d)
                    df_dict_scores["d_origin"].append(d.split('_')[0])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
    
    return pd.DataFrame.from_dict(df_dict_scores)
result = find_groups(specified_words)
result

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.6% 

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
0,ANACONDA_1,ANACONDA,CAPYBARA_1,CAPYBARA,JAGUAR_1,JAGUAR,LION_1,LION,0.460984
1,ANACONDA_1,ANACONDA,CAPYBARA_1,CAPYBARA,JAGUAR_1,JAGUAR,LION_3,LION,0.337420
2,ANACONDA_1,ANACONDA,CAPYBARA_1,CAPYBARA,JAGUAR_1,JAGUAR,LION_5,LION,0.305026
3,ANACONDA_1,ANACONDA,CAPYBARA_1,CAPYBARA,JAGUAR_1,JAGUAR,LION_8,LION,0.363921
4,ANACONDA_1,ANACONDA,CAPYBARA_1,CAPYBARA,JAGUAR_1,JAGUAR,LION_10,LION,0.382301
...,...,...,...,...,...,...,...,...,...
18054,GREASE_11,GREASE,HAIR_13,HAIR,JAGUAR_10,JAGUAR,LION_11,LION,0.445298
18055,GREASE_11,GREASE,HAIR_15,HAIR,JAGUAR_10,JAGUAR,LION_11,LION,0.428464
18056,HAIR_12,HAIR,JAGUAR_10,JAGUAR,LION_11,LION,TOUCAN_10,TOUCAN,0.407938
18057,HAIR_14,HAIR,JAGUAR_10,JAGUAR,LION_11,LION,TOUCAN_10,TOUCAN,0.455744


In [64]:

def not_one_away(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)))]
    df = df[~((df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    df = df[~((df['c_origin'].isin(words)) & (df['d_origin'].isin(words)) & (df['a_origin'].isin(words)))]
    df = df[~((df['d_origin'].isin(words)) & (df['a_origin'].isin(words)) & (df['b_origin'].isin(words)))]
    
    return df

def check_win(df):
    row = df.iloc[0]
    words = [row['a_origin'], row['b_origin'], row['c_origin'], row['d_origin']]
    print(words)
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        
    return False
    
def check_one_away(df):
    row = df.iloc[0]
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set):
            return True
        if (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        if (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set):
            return True
        if (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set):
            return True
        
    return False
    
def after_win(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    print(words)
    df = df[~((df['a_origin'].isin(words)) | (df['b_origin'].isin(words)) | (df['c_origin'].isin(words)) | (df['d_origin'].isin(words)))]
    
    return df

In [66]:
answers_df = result.sort_values('sim', ascending=False)

answers_df

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
8033,BASE_19,BASE,BOTTOM_9,BOTTOM,FOOT_14,FOOT,FOUNDATION_11,FOUNDATION,0.701404
9705,BASE_22,BASE,BOTTOM_7,BOTTOM,FOOT_13,FOOT,FOUNDATION_12,FOUNDATION,0.681579
5280,LEGS_2,LEGS,BOTTOM_7,BOTTOM,FOOT_13,FOOT,FOUNDATION_12,FOUNDATION,0.680250
11092,BASE_36,BASE,BOTTOM_7,BOTTOM,FOOT_13,FOOT,FOUNDATION_12,FOUNDATION,0.669843
8982,BASE_20,BASE,BOTTOM_9,BOTTOM,FOOT_14,FOOT,FOUNDATION_11,FOUNDATION,0.660932
...,...,...,...,...,...,...,...,...,...
10249,BASE_27,BASE,BOTTOM_17,BOTTOM,FOOT_24,FOOT,RENT_13,RENT,0.194645
9779,BASE_22,BASE,BOTTOM_23,BOTTOM,FOOT_12,FOOT,HAIR_14,HAIR,0.193939
2949,FOUNDATION_4,FOUNDATION,BASE_23,BASE,BOTTOM_17,BOTTOM,FOOT_16,FOOT,0.193909
9783,BASE_22,BASE,CHANGE_19,CHANGE,COMPANY_20,COMPANY,FOOT_12,FOOT,0.193493


In [67]:
answers_df.iloc[0]

a                 BASE_19
a_origin             BASE
b                BOTTOM_9
b_origin           BOTTOM
c                 FOOT_14
c_origin             FOOT
d           FOUNDATION_11
d_origin       FOUNDATION
sim              0.701404
Name: 8033, dtype: object

In [68]:
tries = 0
correct = 0
while (tries - correct) < 4 and correct < 3:
    tries += 1
    if check_win(answers_df):
        answers_df = after_win(answers_df)
        correct += 1
    elif not check_one_away(answers_df):
        answers_df = not_one_away(answers_df)
    else:
        answers_df = answers_df.iloc[1:, :]

if correct == 3:
    correct += 1
    tries += 1

print(tries, correct)

['BASE', 'BOTTOM', 'FOOT', 'FOUNDATION']
{'FOOT', 'FOUNDATION', 'BASE', 'BOTTOM'}
['CAPYBARA', 'JAGUAR', 'LION', 'ANACONDA']
['ANACONDA', 'CAPYBARA', 'JAGUAR', 'LION']
['JAGUAR', 'LION', 'ANACONDA', 'CAPYBARA']
['CAPYBARA', 'ANACONDA', 'JAGUAR', 'LION']
5 1
