In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

In [54]:
base_url = "https://en.wikipedia.org/wiki/"

solutions = [
    ["IRIS","LENS","PUPIL","RETINA"],
    ["BOGUS","FAKE","PHONY","SHAM"],
    ["COPY","OUT","OVER","ROGER"],
    ["ALEJANDRO","LOLA","MICHELLE","STAN"],
]

In [55]:
words = []
    
for row in solutions:
    for word in row:
        words.append(word)
        
wiki_dict = {"Word": [], "Definition": []}

In [56]:
from concurrent.futures import ThreadPoolExecutor

In [57]:
summaries = []

def scrape_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    

    paragraphs = soup.find('div', {'id': 'mw-content-text'}).find_all('p')

    summary_element = None
    for paragraph in paragraphs:
        if paragraph.get_text(strip=True): 
            summary_element = paragraph
            break

    summary = summary_element.get_text().strip()

    if len(summary) > 5:
        summaries.append(summary)

In [58]:
for word in words:
    print(word)
    options = wikipedia.search(word.capitalize(), results=10)

    urls = [f'{base_url}{option.replace(" ", "_")}' for option in options]
    
    summaries = []
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_page, urls)
    for summary in summaries:
        wiki_dict["Word"].append(word)
        wiki_dict["Definition"].append(summary.strip())

wiki_df = pd.DataFrame(wiki_dict)
    
wiki_df

IRIS
LENS
PUPIL
RETINA
BOGUS
FAKE
PHONY
SHAM
COPY
OUT
OVER
ROGER
ALEJANDRO
LOLA
MICHELLE
STAN


Unnamed: 0,Word,Definition
0,IRIS,"IRIS² (Infrastructure for Resilience, Intercon..."
1,IRIS,Iris (stylized as i☆Ris) is a Japanese idol gi...
2,IRIS,Iris most often refers to:
3,IRIS,Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4,IRIS,Subgenus Iris is one subgenus of Iris.
...,...,...
155,STAN,"""Stan"" is a song by American rapper Eminem, wi..."
156,STAN,Stan (stylized as Stan.) is an Australian over...
157,STAN,"Enos Stanley Kroenke (/ˈkrɒŋki/; born July 29,..."
158,STAN,"Stan Laurel (/ˈlɒrəl/, LORR-əl; born Arthur St..."


In [59]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df["Word"] = dict_df["Word"].str.upper()
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()

dict_df

Unnamed: 0,index,Word,POS,Definition
0,17644,BOGUS,a.,Spurious; fictitious; sham; -- a cant term ori...
1,17645,BOGUS,n.,A liquor made of rum and molasses.
2,33807,COPY,n.,An abundance or plenty of anything.
3,33808,COPY,n.,An imitation transcript or reproduction of an ...
4,33809,COPY,n.,An individual book or a single set of books co...
...,...,...,...,...
67,139440,SHAM,a.,False; counterfeit; pretended; feigned; unreal...
68,139443,SHAM,v. t.,To trick; to cheat; to deceive or delude with ...
69,139444,SHAM,v. t.,To obtrude by fraud or imposition.
70,139445,SHAM,v. t.,To assume the manner and character of; to imit...


In [60]:
combined_words = pd.concat([wiki_df["Word"], dict_df["Word"]])
combined_defs = pd.concat([wiki_df["Definition"], dict_df["Definition"]])

wiki_df = pd.DataFrame({"Word": combined_words, "Definition": combined_defs})

print(wiki_df.shape)
wiki_df

(232, 2)


Unnamed: 0,Word,Definition
0,IRIS,"IRIS² (Infrastructure for Resilience, Intercon..."
1,IRIS,Iris (stylized as i☆Ris) is a Japanese idol gi...
2,IRIS,Iris most often refers to:
3,IRIS,Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4,IRIS,Subgenus Iris is one subgenus of Iris.
...,...,...
67,SHAM,False; counterfeit; pretended; feigned; unreal...
68,SHAM,To trick; to cheat; to deceive or delude with ...
69,SHAM,To obtrude by fraud or imposition.
70,SHAM,To assume the manner and character of; to imit...


In [61]:
wiki_df["Definition"]

0     IRIS² (Infrastructure for Resilience, Intercon...
1     Iris (stylized as i☆Ris) is a Japanese idol gi...
2                            Iris most often refers to:
3     Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4                Subgenus Iris is one subgenus of Iris.
                            ...                        
67    False; counterfeit; pretended; feigned; unreal...
68    To trick; to cheat; to deceive or delude with ...
69                   To obtrude by fraud or imposition.
70    To assume the manner and character of; to imit...
71    To make false pretenses; to deceive; to feign;...
Name: Definition, Length: 232, dtype: object

In [80]:
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'
# device = torch.device("mps")

retriever = SentenceTransformer(
    "all-mpnet-base-v2",
    device = device
)

In [81]:
wiki_df["Definition"] = wiki_df["Definition"].astype(str)
wiki_df.dtypes

Word          object
Definition    object
dtype: object

In [82]:
embeddings = [retriever.encode(defi) for defi in wiki_df['Definition']]
embeddings = np.array(embeddings)

embedding_size = embeddings.shape[1]

In [65]:
embeddings.shape

(232, 384)

In [66]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [67]:
embedding_dict = {}

for word in words:
    embedding_dict[word] = np.ones(embedding_size)

for i, word in enumerate(wiki_df["Word"]):
    curr = embedding_dict[word]
    add = embeddings[i]
    sim = cosine_similarity(curr, add)
    
    print(sim)
    
    total = 1 + 1 / abs(1 + sim)
    embedding_dict[word] = (curr + 1 / sim * add) / total

embedding_dict

0.01313447290399524
0.5396645652342845
0.6078276979676053
0.20639421122089316
0.5375735769356178
0.5868751665309941
0.6213677939288893
0.19155053904331132
0.285612102766732
0.5976590656354275
-0.008817858598196893
-0.24592465113638456
-0.47489606957890435
-0.46350462785165175
-0.524194544755676
-0.501592254041128
-0.6169217600650475
-0.5806051019322972
-0.5495460150808236
-0.3405767046640632
0.011915728585321118
0.1426629338120754
0.7342565990318254
0.5355443183302349
0.22540215408032355
0.6172574054540453
0.3412569540765022
0.5351342656549504
0.235454640336418
0.5011315979075007
0.014819314819591643
0.4044301511049559
0.8085584833986429
0.12154899662907385
0.47582469070035516
0.41316531612125956
0.5722704890881419
0.554115857215852
0.6943463512706212
-0.06277907702463817
-0.0026329294759268986
-0.7255577405817099
-0.540128124620681
-0.3516709177602262
-0.5721870919393529
-0.5540262914219979
-0.5176397701396483
-0.39258489433751587
-0.48268778457065026
-0.5046263239778727
0.01566481235

{'IRIS': array([-2.03321955e-01, -1.22808297e-02, -1.96614951e-01,  2.32431742e-02,
        -1.15349694e-01, -9.02484959e-02,  1.47314429e-01,  5.95069927e-02,
         2.34241290e-01, -1.83894495e-01,  3.73804458e-01, -1.63496672e-01,
        -8.70813827e-02, -9.03406675e-03, -1.36706881e-01, -9.81658596e-02,
         7.12480145e-02,  1.32545681e-01, -2.66123138e-02,  1.41210627e-02,
         1.74287551e-01,  1.38505935e-01,  7.90180940e-03,  5.75012303e-02,
        -6.21146015e-02, -1.57363000e-01,  3.61953534e-02,  1.25116201e-02,
         2.82820552e-01, -2.70322801e-01,  6.43711436e-02,  3.72852349e-01,
        -2.24812479e-01, -1.11678892e-01, -6.54611837e-02, -1.30640350e-02,
         1.39697140e-02,  3.12920722e-02,  1.03075705e-01,  2.89886858e-02,
        -2.91775474e-03, -1.01159811e-01,  1.21721277e-01,  1.57362871e-01,
         8.24226395e-02, -5.22030617e-02,  1.63222397e-01,  1.39517366e-01,
        -1.27203532e-01, -1.68582320e-01, -9.98753886e-02, -1.42283890e-01,
    

In [68]:
similarities = []

for i in range(len(words)):
    a = embedding_dict[words[i]]
    for j in range(i + 1, len(words)):
        b = embedding_dict[words[j]]
        word1 = wiki_df.iloc[i]["Word"]
        word2 = wiki_df.iloc[j]["Word"]
        
        sim = cosine_similarity(a, b) / (math.dist(a, b))
        if math.isinf(sim):
            sim = 1
        similarities.append([words[i], words[j], sim])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df.sort_values("similarity")

Unnamed: 0,word_1,word_2,similarity
85,SHAM,OUT,-0.156491
68,FAKE,OUT,-0.103862
18,LENS,FAKE,-0.096247
20,LENS,SHAM,-0.083639
92,COPY,OUT,-0.081900
...,...,...,...
4,IRIS,FAKE,0.092990
117,LOLA,MICHELLE,0.107917
67,FAKE,COPY,0.182521
84,SHAM,COPY,0.245059


In [69]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]


In [70]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [71]:
def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

def find_groups(words):
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    for i, a in enumerate(words):
        printProgressBar(iteration=i, total=len(words))
        for j in range(i + 1, len(words)):
            b = words[j]
            for k in range(j + 1, len(words)):
                c = words[k]
                for l in range(k + 1, len(words)):
                    d = words[l]
                    
                    df_dict_scores["a"].append(a)
                    df_dict_scores["a_origin"].append(a.split('_')[0])
                    df_dict_scores["b"].append(b)
                    df_dict_scores["b_origin"].append(b.split('_')[0])
                    df_dict_scores["c"].append(c)
                    df_dict_scores["c_origin"].append(c.split('_')[0])
                    df_dict_scores["d"].append(d)
                    df_dict_scores["d_origin"].append(d.split('_')[0])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
    
    return pd.DataFrame.from_dict(df_dict_scores)
result = find_groups(words)
result.sort_values('sim', ascending=False)

 |█████████████████████████████████████████████████████████████████████████████████████████████-------| 93.8% 

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
1527,FAKE,FAKE,SHAM,SHAM,COPY,COPY,OVER,OVER,1.055645
949,PUPIL,PUPIL,FAKE,FAKE,SHAM,SHAM,COPY,COPY,1.033485
299,IRIS,IRIS,FAKE,FAKE,SHAM,SHAM,COPY,COPY,1.032896
1528,FAKE,FAKE,SHAM,SHAM,COPY,COPY,ROGER,ROGER,0.900166
1532,FAKE,FAKE,SHAM,SHAM,COPY,COPY,STAN,STAN,0.851139
...,...,...,...,...,...,...,...,...,...
56,IRIS,IRIS,LENS,LENS,SHAM,SHAM,OUT,OUT,-0.240776
564,LENS,LENS,RETINA,RETINA,SHAM,SHAM,OUT,OUT,-0.242456
742,LENS,LENS,SHAM,SHAM,OUT,OUT,OVER,OVER,-0.247673
905,PUPIL,PUPIL,BOGUS,BOGUS,SHAM,SHAM,OUT,OUT,-0.249218


In [72]:

def not_one_away(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    df = df[~((df['a_origin'].isin(words)) & (df['b_origin'].isin(words)) & (df['c_origin'].isin(words)))]
    df = df[~((df['b_origin'].isin(words)) & (df['c_origin'].isin(words)) & (df['d_origin'].isin(words)))]
    df = df[~((df['c_origin'].isin(words)) & (df['d_origin'].isin(words)) & (df['a_origin'].isin(words)))]
    df = df[~((df['d_origin'].isin(words)) & (df['a_origin'].isin(words)) & (df['b_origin'].isin(words)))]
    
    return df

def check_win(df):
    row = df.iloc[0]
    words = [row['a_origin'], row['b_origin'], row['c_origin'], row['d_origin']]
    print(words)
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        
    return False
    
def check_one_away(df):
    row = df.iloc[0]
    
    for sol in solutions:
        solution_set = set(sol)
        if (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set):
            return True
        if (row['b_origin'] in solution_set) and (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set):
            return True
        if (row['c_origin'] in solution_set) and (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set):
            return True
        if (row['d_origin'] in solution_set) and (row['a_origin'] in solution_set) and (row['b_origin'] in solution_set):
            return True
        
    return False
    
def after_win(df):
    winning_row = df.iloc[0]
    words = [winning_row['a_origin'], winning_row['b_origin'], winning_row['c_origin'], winning_row['d_origin']]
    words = set(words)
    print(words)
    df = df[~((df['a_origin'].isin(words)) | (df['b_origin'].isin(words)) | (df['c_origin'].isin(words)) | (df['d_origin'].isin(words)))]
    
    return df


In [73]:
answers_df = result.sort_values('sim', ascending=False)

answers_df

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
1527,FAKE,FAKE,SHAM,SHAM,COPY,COPY,OVER,OVER,1.055645
949,PUPIL,PUPIL,FAKE,FAKE,SHAM,SHAM,COPY,COPY,1.033485
299,IRIS,IRIS,FAKE,FAKE,SHAM,SHAM,COPY,COPY,1.032896
1528,FAKE,FAKE,SHAM,SHAM,COPY,COPY,ROGER,ROGER,0.900166
1532,FAKE,FAKE,SHAM,SHAM,COPY,COPY,STAN,STAN,0.851139
...,...,...,...,...,...,...,...,...,...
56,IRIS,IRIS,LENS,LENS,SHAM,SHAM,OUT,OUT,-0.240776
564,LENS,LENS,RETINA,RETINA,SHAM,SHAM,OUT,OUT,-0.242456
742,LENS,LENS,SHAM,SHAM,OUT,OUT,OVER,OVER,-0.247673
905,PUPIL,PUPIL,BOGUS,BOGUS,SHAM,SHAM,OUT,OUT,-0.249218


In [74]:
answers_df.iloc[0]

a               FAKE
a_origin        FAKE
b               SHAM
b_origin        SHAM
c               COPY
c_origin        COPY
d               OVER
d_origin        OVER
sim         1.055645
Name: 1527, dtype: object

In [75]:
tries = 0
correct = 0
while (tries - correct) < 4 and correct < 3:
    tries += 1
    if check_win(answers_df):
        answers_df = after_win(answers_df)
        correct += 1
    elif not check_one_away(answers_df):
        answers_df = not_one_away(answers_df)
    else:
        answers_df = answers_df.iloc[1:, :]

if correct == 3:
    correct += 1
    tries += 1

print(tries, correct)

['FAKE', 'SHAM', 'COPY', 'OVER']
['IRIS', 'PUPIL', 'FAKE', 'SHAM']
['FAKE', 'SHAM', 'LOLA', 'MICHELLE']
['FAKE', 'SHAM', 'ROGER', 'STAN']
4 0
