In [95]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

In [96]:
base_url = "https://en.wikipedia.org/wiki/"

# solutions = [
#     ["ACT","BILL","MEASURE","RESOLUTION"],
#     ["MEADOW","PLAIN","PRAIRE","SAVANNA"],
#     ["DIRECT","OPEN","STRAIGHT","FRANK"],
#     ["AURA","BUCK","DOGE","HODA"],
# ]
solutions = [['HAIL', 'RAIN', 'SLEET', 'SNOW'], ['BUCKS', 'HEAT', 'JAZZ', 'NETS'], ['OPTION', 'RETURN', 'SHIFT', 'TAB'], ['KAYAK', 'LEVEL', 'MOM', 'RACECAR']]
# solutions = [['IRIS', 'LENS', 'PUPIL', 'RETINA'], ['BOGUS', 'FAKE', 'PHONY', 'SHAM'], ['COPY', 'OUT', 'OVER', 'ROGER'], ['ALEJANDRO', 'LOLA', 'MICHELLE', 'STAN']]

In [97]:
words = []
    
for row in solutions:
    for word in row:
        words.append(word)
        
wiki_dict = {"Word": [], "Definition": []}

In [98]:
from concurrent.futures import ThreadPoolExecutor

In [99]:
summaries = []

def scrape_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    

    paragraphs = soup.find('div', {'id': 'mw-content-text'}).find_all('p')

    summary_element = None
    for paragraph in paragraphs:
        if paragraph.get_text(strip=True): 
            summary_element = paragraph
            break

    summary = summary_element.get_text().strip()
    summaries.append(summary[:min(len(summary), 368)])

In [100]:
for word in words:
    print(word)
    options = wikipedia.search(word.capitalize(), results=3)

    urls = [f'{base_url}{option.replace(" ", "_")}' for option in options]
    
    summaries = []
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_page, urls)
    for summary in summaries:
        wiki_dict["Word"].append(word)
        wiki_dict["Definition"].append(summary.strip())

wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df

HAIL
RAIN
SLEET
SNOW
BUCKS
HEAT
JAZZ
NETS
OPTION
RETURN
SHIFT
TAB
KAYAK
LEVEL
MOM
RACECAR


Unnamed: 0,Word,Definition
0,HAIL,"Madison Knisely (born September 9, 2003) is an..."
1,HAIL,"""Hail, Hail"" is a song by the American rock ba..."
2,HAIL,Hail is a form of solid precipitation.[1] It i...
3,RAIN,"""Rain, Rain, Rain"" is a song, originally relea..."
4,RAIN,Rain is a type of precipitation in which liqui...
5,RAIN,Rain is water droplets that have condensed fro...
6,SLEET,Sleet is a regionally variant term for some me...
7,SLEET,"Donald Clayborn Sleet (November 27, 1938 – Dec..."
8,SLEET,"Kimsuky (also known as Velvet Chollima, Black ..."
9,SNOW,"SNOW 1.0, SNOW 2.0, and SNOW 3G are word-based..."


In [101]:
wiki_df["Definition"]

0     Madison Knisely (born September 9, 2003) is an...
1     "Hail, Hail" is a song by the American rock ba...
2     Hail is a form of solid precipitation.[1] It i...
3     "Rain, Rain, Rain" is a song, originally relea...
4     Rain is a type of precipitation in which liqui...
5     Rain is water droplets that have condensed fro...
6     Sleet is a regionally variant term for some me...
7     Donald Clayborn Sleet (November 27, 1938 – Dec...
8     Kimsuky (also known as Velvet Chollima, Black ...
9     SNOW 1.0, SNOW 2.0, and SNOW 3G are word-based...
10    SNoW (pronounced: "Snow"; born June 11, 1985, ...
11    Snow comprises individual ice crystals that gr...
12                                  Bucks may refer to:
13    Bucks Fizz were an English pop group that achi...
14    The Milwaukee Bucks are an American profession...
15    High-explosive anti-tank (HEAT) is the effect ...
16    WWE Heat (formerly known as Sunday Night Heat ...
17    In thermodynamics, heat is the thermal ene

In [102]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df["Word"] = dict_df["Word"].str.upper()

In [103]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,69556,HAIL,n.,Small roundish masses of ice precipitated from...
1,69559,HAIL,v. i.,To pour down particles of ice or frozen vapors.
2,69560,HAIL,v. t.,To pour forcibly down as hail.
3,69561,HAIL,a.,Healthy. See Hale (the preferable spelling).
4,69562,HAIL,v. t.,To call loudly to or after; to accost; to salu...
...,...,...,...,...
112,154835,TAB,n.,The flap or latchet of a shoe fastened with a ...
113,154836,TAB,n.,A tag. See Tag 2.
114,154837,TAB,n.,A loop for pulling or lifting something.
115,154838,TAB,n.,A border of lace or other material worn on the...


In [104]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

wiki_df

Unnamed: 0,Word,Definition
0,HAIL,"Madison Knisely (born September 9, 2003) is an..."
1,HAIL,"""Hail, Hail"" is a song by the American rock ba..."
2,HAIL,Hail is a form of solid precipitation.[1] It i...
3,RAIN,"""Rain, Rain, Rain"" is a song, originally relea..."
4,RAIN,Rain is a type of precipitation in which liqui...
...,...,...
112,TAB,The flap or latchet of a shoe fastened with a ...
113,TAB,A tag. See Tag 2.
114,TAB,A loop for pulling or lifting something.
115,TAB,A border of lace or other material worn on the...


In [105]:
len(combined_words)

165

In [106]:
wiki_df['word_number'] = wiki_df.groupby('Word').cumcount() + 1

wiki_df['Word'] = wiki_df.apply(lambda row: f"{row['Word']}_{row['word_number']}", axis=1)

wiki_df = wiki_df.dropna()

wiki_df

Unnamed: 0,Word,Definition,word_number
0,HAIL_1,"Madison Knisely (born September 9, 2003) is an...",1
1,HAIL_2,"""Hail, Hail"" is a song by the American rock ba...",2
2,HAIL_3,Hail is a form of solid precipitation.[1] It i...,3
3,RAIN_1,"""Rain, Rain, Rain"" is a song, originally relea...",1
4,RAIN_2,Rain is a type of precipitation in which liqui...,2
...,...,...,...
112,TAB_4,The flap or latchet of a shoe fastened with a ...,4
113,TAB_5,A tag. See Tag 2.,5
114,TAB_6,A loop for pulling or lifting something.,6
115,TAB_7,A border of lace or other material worn on the...,7


In [107]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device = device
)

In [108]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word           object
Definition     object
word_number     int64
dtype: object

In [109]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embeddings.shape

(165, 384)

In [110]:
matrix = embeddings

In [111]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [112]:
matrix.shape

(165, 384)

In [113]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i + 1, len(matrix)):
        b = matrix[j]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        if word1[0: word1.index("_")] != word2[0: word2.index("_")]:
            sim = cosine_similarity(a, b)/math.dist(a, b)
            if math.isinf(sim):
                sim = 1
            similarities.append([wiki_df.iloc[i]["Word"], wiki_df.iloc[j]["Word"], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df.sort_values("similarity")

Unnamed: 0,word_1,word_2,similarity
1321,SLEET_3,RETURN_5,-0.030152
1337,SLEET_3,RETURN_21,-0.029636
5651,MOM_1,HEAT_8,-0.026910
119,HAIL_1,RETURN_28,-0.026736
4933,TAB_3,SHIFT_5,-0.026375
...,...,...,...
750,RAIN_2,SLEET_5,0.130002
12118,SLEET_6,SNOW_8,0.131397
9551,LEVEL_11,RETURN_34,0.132724
7154,HAIL_11,RETURN_14,0.146892


In [114]:
df

Unnamed: 0,word_1,word_2,similarity
0,HAIL_1,RAIN_1,0.017160
1,HAIL_1,RAIN_2,-0.002123
2,HAIL_1,RAIN_3,-0.005823
3,HAIL_1,SLEET_1,-0.008033
4,HAIL_1,SLEET_2,0.014969
...,...,...,...
12144,SNOW_8,TAB_4,0.034558
12145,SNOW_8,TAB_5,0.014479
12146,SNOW_8,TAB_6,0.056042
12147,SNOW_8,TAB_7,0.022320


In [115]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [116]:
specified_words = list(wiki_df["Word"])

In [117]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [118]:
def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_groups(words):
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    
    prefixes = [a.split('_')[0] for a in words]
    
    for i, a in enumerate(words):
        for j in range(i + 1, len(words)):
            if prefixes[j] == prefixes[i]:
                continue
            b = words[j]
            for k in range(j + 1, len(words)):
                if prefixes[k] in set([prefixes[i], prefixes[j]]):
                    continue
                c = words[k]
                for l in range(k + 1, len(words)):
                    if prefixes[l] in set([prefixes[i], prefixes[j], prefixes[k]]):
                        continue
                    d = words[l]
                    
                    df_dict_scores["a"].append(a)
                    df_dict_scores["a_origin"].append(prefixes[i])
                    df_dict_scores["b"].append(b)
                    df_dict_scores["b_origin"].append(prefixes[j])
                    df_dict_scores["c"].append(c)
                    df_dict_scores["c_origin"].append(prefixes[k])
                    df_dict_scores["d"].append(d)
                    df_dict_scores["d_origin"].append(prefixes[l])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
    
    return pd.DataFrame.from_dict(df_dict_scores)
result = find_groups(specified_words)
result

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
0,HAIL_1,HAIL,RAIN_1,RAIN,SLEET_1,SLEET,SNOW_1,SNOW,0.096195
1,HAIL_1,HAIL,RAIN_1,RAIN,SLEET_1,SLEET,SNOW_2,SNOW,0.161458
2,HAIL_1,HAIL,RAIN_1,RAIN,SLEET_1,SLEET,SNOW_3,SNOW,0.157648
3,HAIL_1,HAIL,RAIN_1,RAIN,SLEET_1,SLEET,BUCKS_1,BUCKS,0.070513
4,HAIL_1,HAIL,RAIN_1,RAIN,SLEET_1,SLEET,BUCKS_2,BUCKS,0.084566
...,...,...,...,...,...,...,...,...,...
15402178,SHIFT_16,SHIFT,SLEET_6,SLEET,SNOW_8,SNOW,TAB_4,TAB,0.228730
15402179,SHIFT_16,SHIFT,SLEET_6,SLEET,SNOW_8,SNOW,TAB_5,TAB,0.181103
15402180,SHIFT_16,SHIFT,SLEET_6,SLEET,SNOW_8,SNOW,TAB_6,TAB,0.246980
15402181,SHIFT_16,SHIFT,SLEET_6,SLEET,SNOW_8,SNOW,TAB_7,TAB,0.208644


In [119]:

def not_one_away(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)))]
    df = df[~((df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    df = df[~((df['c_origin'].isin(words)) & (df['d_origin'].isin(words)) & (df['a_origin'].isin(words)))]
    df = df[~((df['d_origin'].isin(words)) & (df['a_origin'].isin(words)) & (df['b_origin'].isin(words)))]
    
    return df

def check_win(df):
    row = df.iloc[0]
    words = [row['a_origin'], row['b_origin'], row['c_origin'], row['d_origin']]
    print(words)
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        
    return False
    
def check_one_away(df):
    row = df.iloc[0]
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set):
            return True
        if (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        if (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set):
            return True
        if (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set):
            return True
        
    return False
    
def after_win(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) | (df['b_origin'].isin(words)) | (df['c_origin'].isin(words)) | (df['d_origin'].isin(words)))]
    
    return df

def remove_top(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    
    return df

In [120]:
answers_df = result.sort_values('sim', ascending=False)

answers_df

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
823270,HAIL_3,HAIL,RAIN_3,RAIN,SLEET_5,SLEET,SNOW_5,SNOW,0.649698
814384,HAIL_3,HAIL,RAIN_2,RAIN,SLEET_5,SLEET,SNOW_5,SNOW,0.639942
815228,HAIL_3,HAIL,RAIN_3,RAIN,SNOW_3,SNOW,SLEET_5,SLEET,0.633880
1184116,HAIL_3,HAIL,RAIN_6,RAIN,SLEET_5,SLEET,SNOW_5,SNOW,0.624297
806342,HAIL_3,HAIL,RAIN_2,RAIN,SNOW_3,SNOW,SLEET_5,SLEET,0.612695
...,...,...,...,...,...,...,...,...,...
3659825,SNOW_1,SNOW,BUCKS_3,BUCKS,MOM_1,MOM,HEAT_5,HEAT,-0.080930
3659875,SNOW_1,SNOW,BUCKS_3,BUCKS,MOM_1,MOM,RETURN_5,RETURN,-0.081390
3659844,SNOW_1,SNOW,BUCKS_3,BUCKS,MOM_1,MOM,LEVEL_6,LEVEL,-0.085613
3659915,SNOW_1,SNOW,BUCKS_3,BUCKS,MOM_1,MOM,SHIFT_14,SHIFT,-0.086798


In [121]:
answers_df.iloc[0]

a             HAIL_3
a_origin        HAIL
b             RAIN_3
b_origin        RAIN
c            SLEET_5
c_origin       SLEET
d             SNOW_5
d_origin        SNOW
sim         0.649698
Name: 823270, dtype: object

In [122]:
tries = 0
correct = 0
while (tries - correct) < 4 and correct < 3:
    tries += 1
    if check_win(answers_df):
        answers_df = after_win(answers_df)
        correct += 1
    elif not check_one_away(answers_df):
        answers_df = not_one_away(answers_df)
    else:
        answers_df = remove_top(answers_df)

if correct == 3:
    correct += 1
    tries += 1

print(tries, correct)

['HAIL', 'RAIN', 'SLEET', 'SNOW']
['LEVEL', 'RETURN', 'SHIFT', 'TAB']
['LEVEL', 'OPTION', 'RETURN', 'SHIFT']
['HEAT', 'LEVEL', 'RETURN', 'SHIFT']
['KAYAK', 'LEVEL', 'RETURN', 'TAB']
5 1
