In [57]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

In [58]:
base_url = "https://en.wikipedia.org/wiki/"

solutions = [
    ["ACT","BILL","MEASURE","RESOLUTION"],
    ["MEADOW","PLAIN","PRAIRE","SAVANNA"],
    ["DIRECT","OPEN","STRAIGHT","FRANK"],
    ["AURA","BUCK","DOGE","HODA"],
]

In [59]:
words = []
    
for row in solutions:
    for word in row:
        words.append(word)
        
wiki_dict = {"Word": [], "Definition": []}

In [60]:
from concurrent.futures import ThreadPoolExecutor

In [61]:
summaries = []

def scrape_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    

    paragraphs = soup.find('div', {'id': 'mw-content-text'}).find_all('p')

    summary_element = None
    for paragraph in paragraphs:
        if paragraph.get_text(strip=True): 
            summary_element = paragraph
            break

    summary = summary_element.get_text().strip()

    if len(summary) > 5:
        summaries.append(summary)

In [62]:
for word in words:
    print(word)
    options = wikipedia.search(word.capitalize(), results=10)

    urls = [f'{base_url}{option.replace(" ", "_")}' for option in options]
    
    summaries = []
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_page, urls)
    for summary in summaries:
        wiki_dict["Word"].append(word)
        wiki_dict["Definition"].append(summary.strip())

wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df

ACT
BILL
MEASURE
RESOLUTION
MEADOW
PLAIN
PRAIRE
SAVANNA
DIRECT
OPEN
STRAIGHT
FRANK
AURA
BUCK
DOGE
HODA


Unnamed: 0,Word,Definition
0,ACT,"Act, ACT, or The Act may refer to:"
1,ACT,"The Bantu Authorities Act, 1951 (Act No. 68 of..."
2,ACT,"Macbeth (/məkˈbɛθ/, full title The Tragedie of..."
3,ACT,"The Australian Capital Territory (ACT), known ..."
4,ACT,District of Columbia
...,...,...
155,HODA,"Hoda Kotb is an American broadcast journalist,..."
156,HODA,"Hoda's (sometimes Hoda's Lebanese Restaurant,[..."
157,HODA,Today (also called The Today Show) is an Ameri...
158,HODA,Sayyid Ahmad Alamolhoda (also Alam Olhoda or A...


In [63]:
wiki_df["Definition"]

0                     Act, ACT, or The Act may refer to:
1      The Bantu Authorities Act, 1951 (Act No. 68 of...
2      Macbeth (/məkˈbɛθ/, full title The Tragedie of...
3      The Australian Capital Territory (ACT), known ...
4                                   District of Columbia
                             ...                        
155    Hoda Kotb is an American broadcast journalist,...
156    Hoda's (sometimes Hoda's Lebanese Restaurant,[...
157    Today (also called The Today Show) is an Ameri...
158    Sayyid Ahmad Alamolhoda (also Alam Olhoda or A...
159    Hoda Mahmoudi is an American academic and scho...
Name: Definition, Length: 160, dtype: object

In [64]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df["Word"] = dict_df["Word"].str.upper()

In [65]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,1778,ACT,n.,That which is done or doing; the exercise of p...
1,1779,ACT,n.,The result of public deliberation; the decisio...
2,1780,ACT,n.,A formal solemn writing expressing that someth...
3,1781,ACT,n.,A performance of part of a play; one of the pr...
4,1782,ACT,n.,A thesis maintained in public in some English ...
...,...,...,...,...
159,149943,STRAIGHT,superl.,Unmixed; undiluted; as to take liquor straight.
160,149944,STRAIGHT,superl.,Making no exceptions or deviations in one's su...
161,149945,STRAIGHT,adv.,In a straight manner; directly; rightly; forth...
162,149946,STRAIGHT,n.,A hand of five cards in consecutive order as t...


In [66]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

wiki_df

Unnamed: 0,Word,Definition
0,ACT,"Act, ACT, or The Act may refer to:"
1,ACT,"The Bantu Authorities Act, 1951 (Act No. 68 of..."
2,ACT,"Macbeth (/məkˈbɛθ/, full title The Tragedie of..."
3,ACT,"The Australian Capital Territory (ACT), known ..."
4,ACT,District of Columbia
...,...,...
159,STRAIGHT,Unmixed; undiluted; as to take liquor straight.
160,STRAIGHT,Making no exceptions or deviations in one's su...
161,STRAIGHT,In a straight manner; directly; rightly; forth...
162,STRAIGHT,A hand of five cards in consecutive order as t...


In [67]:
wiki_df['word_number'] = wiki_df.groupby('Word').cumcount() + 1

wiki_df['Word'] = wiki_df.apply(lambda row: f"{row['Word']}_{row['word_number']}", axis=1)

wiki_df = wiki_df.dropna()

wiki_df

Unnamed: 0,Word,Definition,word_number
0,ACT_1,"Act, ACT, or The Act may refer to:",1
1,ACT_2,"The Bantu Authorities Act, 1951 (Act No. 68 of...",2
2,ACT_3,"Macbeth (/məkˈbɛθ/, full title The Tragedie of...",3
3,ACT_4,"The Australian Capital Territory (ACT), known ...",4
4,ACT_5,District of Columbia,5
...,...,...,...
159,STRAIGHT_16,Unmixed; undiluted; as to take liquor straight.,16
160,STRAIGHT_17,Making no exceptions or deviations in one's su...,17
161,STRAIGHT_18,In a straight manner; directly; rightly; forth...,18
162,STRAIGHT_19,A hand of five cards in consecutive order as t...,19


In [68]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device = device
)

In [69]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word           object
Definition     object
word_number     int64
dtype: object

In [70]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embeddings.shape

(324, 384)

In [71]:
matrix = embeddings

In [72]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [73]:
matrix.shape

(324, 384)

In [74]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i, len(matrix)):
        b = matrix[j]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        if word1[0: word1.index("_")] != word2[0: word2.index("_")]:
            sim = cosine_similarity(a, b)/math.dist(a, b)
            if math.isinf(sim):
                sim = 1
            similarities.append([wiki_df.iloc[i]["Word"], wiki_df.iloc[j]["Word"], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df.sort_values("similarity")

Unnamed: 0,word_1,word_2,similarity
10272,RESOLUTION_7,AURA_7,-0.034950
26622,STRAIGHT_5,MEASURE_28,-0.034448
18749,PRAIRE_9,STRAIGHT_14,-0.034213
7794,MEASURE_8,STRAIGHT_2,-0.032719
16299,PLAIN_10,BUCK_7,-0.031874
...,...,...,...
44869,FRANK_15,OPEN_15,0.154477
44896,FRANK_15,PLAIN_18,0.156827
43308,DIRECT_12,OPEN_15,0.169004
36880,ACT_12,RESOLUTION_14,0.171604


In [75]:
df = df[df["similarity"] > 0.03]

df

Unnamed: 0,word_1,word_2,similarity
0,ACT_1,BILL_1,0.075563
1,ACT_1,BILL_2,0.034965
7,ACT_1,BILL_8,0.038013
10,ACT_1,MEASURE_1,0.051406
11,ACT_1,MEASURE_2,0.039450
...,...,...,...
48651,RESOLUTION_18,STRAIGHT_13,0.042002
48654,RESOLUTION_18,STRAIGHT_16,0.031225
48656,RESOLUTION_18,STRAIGHT_18,0.038648
48657,RESOLUTION_18,STRAIGHT_19,0.030412


In [76]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [77]:
specified_words = list(wiki_df["Word"])

In [78]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [79]:
def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_groups(words):
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    for i, a in enumerate(words):
        printProgressBar(iteration=i, total=len(words))
        for j in range(i + 1, len(words)):
            b = words[j]
            if a[0:a.index("_")] == b[0:b.index("_")]:
                continue
            if (a, b) not in relation_dict:
                continue
            for k in range(j + 1, len(words)):
                c = words[k]
                if a[0:a.index("_")] == c[0:c.index("_")] or b[0:b.index("_")] == c[0:c.index("_")]:
                    continue
                if (a, c) not in relation_dict or (b, c) not in relation_dict:
                    continue
                for l in range(k + 1, len(words)):
                    d = specified_words[l]
                    
                    if a[0:a.index("_")] == d[0:d.index("_")] or b[0:b.index("_")] == d[0:d.index("_")] or c[0:c.index("_")] == d[0:d.index("_")]:
                        continue
                    if ((a, d) not in relation_dict) or ((b, d) not in relation_dict) or ((c, d) not in relation_dict):
                        continue
                    
                    df_dict_scores["a"].append(a)
                    df_dict_scores["a_origin"].append(a.split('_')[0])
                    df_dict_scores["b"].append(b)
                    df_dict_scores["b_origin"].append(b.split('_')[0])
                    df_dict_scores["c"].append(c)
                    df_dict_scores["c_origin"].append(c.split('_')[0])
                    df_dict_scores["d"].append(d)
                    df_dict_scores["d_origin"].append(d.split('_')[0])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
    
    return pd.DataFrame.from_dict(df_dict_scores)
result = find_groups(specified_words)
result

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 99.7% 

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
0,ACT_1,ACT,BILL_1,BILL,MEASURE_1,MEASURE,RESOLUTION_7,RESOLUTION,0.340152
1,ACT_1,ACT,BILL_1,BILL,MEASURE_1,MEASURE,DIRECT_1,DIRECT,0.334019
2,ACT_1,ACT,BILL_1,BILL,MEASURE_1,MEASURE,DIRECT_16,DIRECT,0.309558
3,ACT_1,ACT,BILL_1,BILL,MEASURE_1,MEASURE,DIRECT_18,DIRECT,0.312853
4,ACT_1,ACT,BILL_1,BILL,MEASURE_1,MEASURE,DIRECT_19,DIRECT,0.317839
...,...,...,...,...,...,...,...,...,...
203393,OPEN_31,OPEN,PLAIN_27,PLAIN,RESOLUTION_16,RESOLUTION,STRAIGHT_18,STRAIGHT,0.244790
203394,OPEN_31,OPEN,PLAIN_27,PLAIN,RESOLUTION_17,RESOLUTION,STRAIGHT_16,STRAIGHT,0.273936
203395,OPEN_33,OPEN,PLAIN_14,PLAIN,RESOLUTION_16,RESOLUTION,STRAIGHT_18,STRAIGHT,0.247683
203396,OPEN_33,OPEN,PLAIN_26,PLAIN,RESOLUTION_16,RESOLUTION,STRAIGHT_18,STRAIGHT,0.233420


In [94]:

def not_one_away(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)))]
    df = df[~((df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    df = df[~((df['c_origin'].isin(words)) & (df['d_origin'].isin(words)) & (df['a_origin'].isin(words)))]
    df = df[~((df['d_origin'].isin(words)) & (df['a_origin'].isin(words)) & (df['b_origin'].isin(words)))]
    
    return df

def check_win(df):
    row = df.iloc[0]
    words = [row['a_origin'], row['b_origin'], row['c_origin'], row['d_origin']]
    print(words)
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        
    return False
    
def check_one_away(df):
    row = df.iloc[0]
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set):
            return True
        if (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        if (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set):
            return True
        if (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set):
            return True
        
    return False
    
def after_win(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) | (df['b_origin'].isin(words)) | (df['c_origin'].isin(words)) | (df['d_origin'].isin(words)))]
    
    return df

def remove_top(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    
    return df

In [95]:
answers_df = result.sort_values('sim', ascending=False)

answers_df

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
164241,DIRECT_12,DIRECT,FRANK_15,FRANK,OPEN_15,OPEN,PLAIN_18,PLAIN,0.875811
164240,DIRECT_12,DIRECT,FRANK_15,FRANK,OPEN_15,OPEN,PLAIN_17,PLAIN,0.781222
164264,DIRECT_12,DIRECT,FRANK_15,FRANK,OPEN_16,OPEN,PLAIN_18,PLAIN,0.777285
164243,DIRECT_12,DIRECT,FRANK_15,FRANK,OPEN_15,OPEN,PLAIN_20,PLAIN,0.769380
164253,DIRECT_12,DIRECT,FRANK_15,FRANK,OPEN_15,OPEN,STRAIGHT_15,STRAIGHT,0.756975
...,...,...,...,...,...,...,...,...,...
150907,BUCK_12,BUCK,OPEN_21,OPEN,PLAIN_14,PLAIN,STRAIGHT_20,STRAIGHT,0.193022
10415,RESOLUTION_7,RESOLUTION,DIRECT_5,DIRECT,ACT_18,ACT,OPEN_27,OPEN,0.193008
73788,ACT_18,ACT,BILL_19,BILL,OPEN_11,OPEN,PLAIN_13,PLAIN,0.192355
202975,OPEN_21,OPEN,PLAIN_17,PLAIN,RESOLUTION_16,RESOLUTION,STRAIGHT_18,STRAIGHT,0.191681


In [96]:
answers_df.iloc[0]

a           DIRECT_12
a_origin       DIRECT
b            FRANK_15
b_origin        FRANK
c             OPEN_15
c_origin         OPEN
d            PLAIN_18
d_origin        PLAIN
sim          0.875811
Name: 164241, dtype: object

In [97]:
tries = 0
correct = 0
while (tries - correct) < 4 and correct < 3:
    tries += 1
    if check_win(answers_df):
        answers_df = after_win(answers_df)
        correct += 1
    elif not check_one_away(answers_df):
        answers_df = not_one_away(answers_df)
    else:
        answers_df = remove_top(answers_df)

if correct == 3:
    correct += 1
    tries += 1

print(tries, correct)

['DIRECT', 'FRANK', 'OPEN', 'PLAIN']
['DIRECT', 'FRANK', 'OPEN', 'STRAIGHT']
['ACT', 'BILL', 'MEASURE', 'RESOLUTION']
['MEADOW', 'PRAIRE', 'PLAIN', 'SAVANNA']
5 4
