In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math
import re
from sentence_transformers import SentenceTransformer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
import heapq
from pprint import pprint
import wikipedia

In [3]:
# Prefix that is joined with an article title to form the page URL to scrape.
base_url = "https://en.wikipedia.org/wiki/"

# Answer key: each inner list is one group of four words. The grading helpers
# (check_win / check_one_away) compare a guessed set of four against these.
solutions = [
    ["IRIS","LENS","PUPIL","RETINA"],
    ["BOGUS","FAKE","PHONY","SHAM"],
    ["COPY","OUT","OVER","ROGER"],
    ["ALEJANDRO","LOLA","MICHELLE","STAN"],
]

In [4]:
# Flatten the answer key into a single list of words, keeping group order.
words = [word for row in solutions for word in row]

# Column-oriented accumulator for scraped text: the two lists grow in lockstep,
# one entry per (word, summary) pair.
wiki_dict = {
    "Word": [],
    "Definition": [],
}

In [5]:
from concurrent.futures import ThreadPoolExecutor

In [6]:
# Shared sink for the worker threads: every successful scrape appends one
# paragraph here. The scraping loop rebinds this name to a fresh list per word.
summaries = []

def scrape_page(url, timeout=10):
    """Fetch an article page and record its first non-empty paragraph.

    Args:
        url: Full URL of the page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The stripped paragraph text when it is longer than five characters;
        in that case it is also appended to the module-level ``summaries``
        list. ``None`` is returned (and nothing is appended) when the request
        fails, the response is an HTTP error, the content container is
        missing, no paragraph has text, or the text is too short.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Error pages also contain paragraphs; do not mistake them for articles.
        res.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(res.content, 'html.parser')

    content = soup.find('div', {'id': 'mw-content-text'})
    if content is None:
        return None

    # First paragraph that contains any text at all; None when there is none.
    summary_element = next(
        (p for p in content.find_all('p') if p.get_text(strip=True)),
        None,
    )
    if summary_element is None:
        return None

    summary = summary_element.get_text().strip()
    if len(summary) <= 5:
        return None

    summaries.append(summary)
    return summary

In [7]:
# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every summary to lists created in an earlier cell.
wiki_dict = {"Word": [], "Definition": []}

for word in words:
    print(word)
    options = wikipedia.search(word.capitalize(), results=10)

    urls = [f'{base_url}{option.replace(" ", "_")}' for option in options]

    # scrape_page appends to the module-level `summaries`; give the workers a
    # fresh list so only this word's pages end up in it.
    summaries = []

    # Leaving the `with` block waits for all workers to finish. The iterator
    # returned by map() is never consumed, so an exception raised inside a
    # worker is not re-raised here; that page is simply skipped.
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(scrape_page, urls)

    wiki_dict["Word"].extend([word] * len(summaries))
    wiki_dict["Definition"].extend(summary.strip() for summary in summaries)

wiki_df = pd.DataFrame(wiki_dict)

wiki_df

IRIS
LENS
PUPIL
RETINA
BOGUS
FAKE
PHONY
SHAM
COPY
OUT
OVER
ROGER
ALEJANDRO
LOLA
MICHELLE
STAN


Unnamed: 0,Word,Definition
0,IRIS,Iris (stylized as i☆Ris) is a Japanese idol gi...
1,IRIS,"IRIS² (Infrastructure for Resilience, Intercon..."
2,IRIS,Iris most often refers to:
3,IRIS,Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4,IRIS,"The iris (pl.: irides or irises) is a thin, an..."
...,...,...
155,STAN,"Stan Getz (born Stanley Gayetski, February 2, ..."
156,STAN,Stan (stylized as Stan.) is an Australian over...
157,STAN,"Enos Stanley Kroenke (/ˈkrɒŋki/; born July 29,..."
158,STAN,"Stan Laurel (/ˈlɒrəl/, LORR-əl; born Arthur St..."


In [8]:
# Load the dictionary, upper-case the headwords so they compare equal to the
# entries of `words`, and keep only rows for those words. reset_index() moves
# the original row labels into an "index" column and renumbers from 0.
dict_df = (
    pd.read_csv("data/dictionary.csv")
    .assign(Word=lambda frame: frame["Word"].str.upper())
    .loc[lambda frame: frame["Word"].isin(words)]
    .reset_index()
)

dict_df

Unnamed: 0,index,Word,POS,Definition
0,17644,BOGUS,a.,Spurious; fictitious; sham; -- a cant term ori...
1,17645,BOGUS,n.,A liquor made of rum and molasses.
2,33807,COPY,n.,An abundance or plenty of anything.
3,33808,COPY,n.,An imitation transcript or reproduction of an ...
4,33809,COPY,n.,An individual book or a single set of books co...
...,...,...,...,...
67,139440,SHAM,a.,False; counterfeit; pretended; feigned; unreal...
68,139443,SHAM,v. t.,To trick; to cheat; to deceive or delude with ...
69,139444,SHAM,v. t.,To obtrude by fraud or imposition.
70,139445,SHAM,v. t.,To assume the manner and character of; to imit...


In [9]:
# Stack the scraped summaries and the dictionary definitions into one frame.
# The scraped half is rebuilt from `wiki_dict` rather than read back from
# `wiki_df`, so re-running this cell does not stack the dictionary rows twice.
scraped_df = pd.DataFrame(wiki_dict)

wiki_df = pd.concat(
    [scraped_df[["Word", "Definition"]], dict_df[["Word", "Definition"]]],
    ignore_index=True,  # unique 0..n-1 row labels instead of repeated ones
)

print(wiki_df.shape)
wiki_df

(232, 2)


Unnamed: 0,Word,Definition
0,IRIS,Iris (stylized as i☆Ris) is a Japanese idol gi...
1,IRIS,"IRIS² (Infrastructure for Resilience, Intercon..."
2,IRIS,Iris most often refers to:
3,IRIS,Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4,IRIS,"The iris (pl.: irides or irises) is a thin, an..."
...,...,...
67,SHAM,False; counterfeit; pretended; feigned; unreal...
68,SHAM,To trick; to cheat; to deceive or delude with ...
69,SHAM,To obtrude by fraud or imposition.
70,SHAM,To assume the manner and character of; to imit...


In [10]:
# Peek at the definition text that gets embedded further down.
wiki_df["Definition"]

0     Iris (stylized as i☆Ris) is a Japanese idol gi...
1     IRIS² (Infrastructure for Resilience, Intercon...
2                            Iris most often refers to:
3     Hermodactyloides\nIris\nLimniris\nNepalensis\n...
4     The iris (pl.: irides or irises) is a thin, an...
                            ...                        
67    False; counterfeit; pretended; feigned; unreal...
68    To trick; to cheat; to deceive or delude with ...
69                   To obtrude by fraud or imposition.
70    To assume the manner and character of; to imit...
71    To make false pretenses; to deceive; to feign;...
Name: Definition, Length: 232, dtype: object

In [11]:
# Run on the GPU when torch can see one, otherwise on the CPU. The device is
# given as a string name in both cases, instead of the integer index that
# torch.cuda.current_device() returns, so `device` always has the same type.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding model used to vectorise every definition.
retriever = SentenceTransformer(
    "paraphrase-MiniLM-L6-v2",
    device=device,
)

In [12]:
# Make sure the Definition column holds strings before it reaches the encoder.
wiki_df = wiki_df.astype({"Definition": str})
wiki_df.dtypes

Word          object
Definition    object
dtype: object

In [22]:
# Encode all definitions in one batched call instead of one call per row;
# encode() accepts a list of sentences and yields one vector per sentence.
embeddings = np.asarray(retriever.encode(wiki_df['Definition'].tolist()))

# Width of a single embedding vector (number of columns).
embedding_size = embeddings.shape[1]

In [23]:
# Sanity check: (number of definitions, embedding width).
embeddings.shape

(232, 384)

In [24]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The dot product is divided by the product of the two Euclidean norms.
    No guard is applied for a zero-length vector.
    """
    dot_product = np.dot(a, b)
    norm_product = norm(a) * norm(b)
    return dot_product / norm_product

In [47]:
# One running vector per puzzle word, seeded with an all-ones vector.
embedding_dict = {word: np.ones(embedding_size) for word in words}

# Fold every definition embedding into the running vector of its word.
# NOTE(review): the weight applied to the new vector (1 / sim) and the divisor
# (1 + 1 / abs(1 + sim)) are not the same quantity, and 1 / sim grows without
# bound as sim approaches 0 - confirm this is the intended update rule.
for word, new_vector in zip(wiki_df["Word"], embeddings):
    running = embedding_dict[word]
    sim = cosine_similarity(running, new_vector)

    print(sim)

    weight = 1 / sim
    normaliser = 1 + 1 / abs(1 + sim)
    embedding_dict[word] = (running + weight * new_vector) / normaliser

embedding_dict

-0.0041038329400021325
-0.3956015011752248
-0.5259146432215407
-0.3902044131801236
-0.5034641702420284
-0.6249578316661912
-0.5242949028822745
-0.6525847684575794
-0.4983713358797335
-0.31544952173923785
-0.0014811033555614036
-0.6368456090035493
-0.6998217863360304
-0.5363914658741221
-0.6189316911840594
-0.5795302653496524
-0.7001710956642369
-0.6911206906423294
-0.5948137030575024
-0.44781710958161614
0.011916856599402146
0.11990847767105157
0.5892177569567227
0.5151142808138349
0.32509141547024634
0.5646706014313564
0.42813958684268394
0.4341104479881138
0.5568341235228743
0.22897760375531437
0.019208089097877905
0.5416199004594652
0.790317584092028
0.0992644848321575
0.5041661467999441
0.5085615745492782
0.4213062841097225
0.6111401314692917
0.504167945394185
0.06510038778798785
0.012309345143038669
0.7510138536555053
0.41265671332315434
0.34220718590215937
0.6125985023532958
0.506736116826606
0.5405878749114689
0.27243578530304785
0.5515070637947156
0.519637427375643
0.0007564053

{'IRIS': array([ 1.43272676, -0.54614822, -0.42762186, -0.70813501, -0.90942766,
        -0.32685228, -1.59661307, -0.19360046, -0.42514218,  0.53148838,
        -0.82753781,  0.16813234,  0.59811687,  0.30480123,  0.26173874,
         0.45930998,  1.46709687, -0.43141455, -1.02916184, -1.88864496,
        -2.48752099, -0.37327091,  1.03773724, -0.02480591,  0.75734291,
        -1.26961887,  0.46528424, -0.87091154,  0.22263403,  1.81680088,
        -1.10462188, -1.40841797,  2.59969039, -0.77150964, -0.33857513,
         1.17670419, -0.36991384,  0.29547086, -0.7767183 , -0.72185104,
        -0.14094927, -0.36135994, -0.49012989, -1.72203937,  0.30988915,
         0.13065557, -1.24388466,  0.17240633, -0.0991205 ,  0.49200888,
        -0.07754583,  1.35799294,  0.76321301, -0.48438654, -1.12920548,
         0.26046068,  0.30318184, -0.39059477, -0.12791708,  0.27187033,
        -0.68582957, -0.05741605,  0.46004626, -1.10125317,  1.10268423,
        -0.43246735,  0.99053063,  0.942137

In [48]:
# Score every unordered pair of puzzle words from their aggregated vectors.
similarities = []

for i, first in enumerate(words):
    a = embedding_dict[first]
    for second in words[i + 1:]:
        b = embedding_dict[second]

        # Cosine similarity divided by Euclidean distance, so pairs that are
        # both aligned and close together score highest.
        sim = cosine_similarity(a, b) / (math.dist(a, b))
        # An infinite ratio (e.g. from a zero distance) is pinned to 1.
        if math.isinf(sim):
            sim = 1
        similarities.append([first, second, sim])

df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df.sort_values("similarity")

Unnamed: 0,word_1,word_2,similarity
113,ROGER,STAN,-0.028370
70,FAKE,ROGER,-0.020936
112,ROGER,MICHELLE,-0.016501
111,ROGER,LOLA,-0.014425
23,LENS,OVER,-0.012244
...,...,...,...
54,BOGUS,FAKE,0.017240
114,ALEJANDRO,LOLA,0.020917
67,FAKE,COPY,0.023390
68,FAKE,OUT,0.033368


In [49]:
# Symmetric lookup table: both orderings of a word pair map to the same score,
# so callers do not need to know which word came first in `df`.
relation_dict = {}

for word1, word2, score in zip(df["word_1"], df["word_2"], df["similarity"]):
    relation_dict[(word1, word2)] = score
    relation_dict[(word2, word1)] = score


In [50]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Render a one-line text progress bar; meant to be called once per loop step.

    @params:
        iteration   - Required  : number of steps completed so far (Int)
        total       - Required  : number of steps overall (Int)
        prefix      - Optional  : text shown before the bar (Str)
        suffix      - Optional  : text shown after the percentage (Str)
        decimals    - Optional  : digits after the decimal point of the percentage (Int)
        length      - Optional  : width of the bar in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
        printEnd    - Optional  : string printed after the bar; the default
                                  carriage return lets the next call overwrite
                                  the same line (Str)
    """
    percent = f"{100 * (iteration / float(total)):.{decimals}f}"
    filled = int(length * iteration // total)
    remaining = length - filled
    bar = ''.join((fill * filled, '-' * remaining))
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Once the last step is reported, move to a new line so later output does
    # not overwrite the finished bar.
    if iteration == total:
        print()

In [51]:
def similarity_4(a, b, c, d):
    """Add up the pairwise scores of all six pairs among four words.

    Scores are read from the module-level ``relation_dict``, keyed by
    ``(word, word)`` tuples.
    """
    pairs = ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d))
    total = relation_dict[pairs[0]]
    # Accumulate left to right in a fixed pair order.
    for pair in pairs[1:]:
        total = total + relation_dict[pair]
    return total

def find_groups(words):
    """Score every four-word combination of ``words``.

    Args:
        words: Sequence of word labels. The text before the first underscore
            of each label is recorded as that label's origin.

    Returns:
        DataFrame with one row per combination. Columns ``a``..``d`` hold the
        labels, ``a_origin``..``d_origin`` the text before the first
        underscore, and ``sim`` the value of ``similarity_4`` for the four
        labels.
    """
    df_dict_scores = {
        'a': [],
        'a_origin': [],
        'b': [],
        'b_origin': [],
        'c': [],
        'c_origin': [],
        'd': [],
        'd_origin': [],
        'sim': [],
    }
    total = len(words)
    for i, a in enumerate(words):
        for j in range(i + 1, total):
            b = words[j]
            for k in range(j + 1, total):
                c = words[k]
                for l in range(k + 1, total):
                    d = words[l]

                    for label, value in (('a', a), ('b', b), ('c', c), ('d', d)):
                        df_dict_scores[label].append(value)
                        df_dict_scores[f'{label}_origin'].append(value.split('_')[0])
                    df_dict_scores["sim"].append(similarity_4(a, b, c, d))
        # Report i + 1 finished outer steps so the bar reaches 100% (and emits
        # its closing newline) on the last word.
        printProgressBar(iteration=i + 1, total=total)

    return pd.DataFrame.from_dict(df_dict_scores)

result = find_groups(words)
result.sort_values('sim', ascending=False)

 |█████████████████████████████████████████████████████████████████████████████████████████████-------| 93.8% 

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
1554,FAKE,FAKE,COPY,COPY,OUT,OUT,OVER,OVER,0.111644
1342,BOGUS,BOGUS,FAKE,FAKE,COPY,COPY,OUT,OUT,0.102254
1526,FAKE,FAKE,SHAM,SHAM,COPY,COPY,OUT,OUT,0.101004
957,PUPIL,PUPIL,FAKE,FAKE,COPY,COPY,OUT,OUT,0.100317
1587,FAKE,FAKE,OUT,OUT,LOLA,LOLA,MICHELLE,MICHELLE,0.099632
...,...,...,...,...,...,...,...,...,...
1809,OVER,OVER,ROGER,ROGER,LOLA,LOLA,STAN,STAN,-0.043258
1485,BOGUS,BOGUS,ROGER,ROGER,MICHELLE,MICHELLE,STAN,STAN,-0.043381
1605,FAKE,FAKE,ROGER,ROGER,MICHELLE,MICHELLE,STAN,STAN,-0.045405
1198,RETINA,RETINA,FAKE,FAKE,ROGER,ROGER,STAN,STAN,-0.045879


In [52]:

def not_one_away(df):
    """Drop every candidate that shares three or more words with the top row.

    The four ``*_origin`` values of the first row are taken as the guess. Any
    row in which at least three of the four ``*_origin`` columns hold one of
    those values is removed; this includes the first row itself.
    """
    origin_cols = ['a_origin', 'b_origin', 'c_origin', 'd_origin']
    top = df.iloc[0]
    guessed = {top[col] for col in origin_cols}

    # Per-row count of origin columns whose value is one of the guessed words.
    hits = 0
    for col in origin_cols:
        hits = hits + df[col].isin(guessed)

    return df[hits < 3]

def check_win(df):
    """Print the top row's four words and report whether they are a full group.

    Returns True when all four ``*_origin`` values of the first row belong to
    the same entry of the module-level ``solutions``, otherwise False.
    """
    top = df.iloc[0]
    guess = [top['a_origin'], top['b_origin'], top['c_origin'], top['d_origin']]
    print(guess)

    guess_set = set(guess)
    return any(guess_set.issubset(group) for group in solutions)
    
def check_one_away(df):
    """Report whether the top row has at least three words from one group.

    The four ``*_origin`` values of the first row are compared against each
    entry of the module-level ``solutions``. The result is True as soon as
    three or more of the four positions fall inside a single group.
    """
    top = df.iloc[0]
    guess = [top[col] for col in ('a_origin', 'b_origin', 'c_origin', 'd_origin')]

    for group in solutions:
        members = set(group)
        matched = sum(1 for word in guess if word in members)
        if matched >= 3:
            return True

    return False
    
def after_win(df):
    """Print the top row's four words and drop every row that reuses one.

    A row is kept only when none of its four ``*_origin`` columns holds a word
    from the first row, so the first row itself is removed as well.
    """
    origin_cols = ['a_origin', 'b_origin', 'c_origin', 'd_origin']
    top = df.iloc[0]
    solved = set(top[col] for col in origin_cols)
    print(solved)

    untouched = (
        ~df['a_origin'].isin(solved)
        & ~df['b_origin'].isin(solved)
        & ~df['c_origin'].isin(solved)
        & ~df['d_origin'].isin(solved)
    )
    return df[untouched]


In [32]:
# Rank the candidate groups best-first; the grading helpers always read row 0.
answers_df = result.sort_values('sim', ascending=False)

answers_df

Unnamed: 0,a,a_origin,b,b_origin,c,c_origin,d,d_origin,sim
1526,FAKE,FAKE,SHAM,SHAM,COPY,COPY,OUT,OUT,0.331098
1534,FAKE,FAKE,SHAM,SHAM,OUT,OUT,ROGER,ROGER,0.319493
1528,FAKE,FAKE,SHAM,SHAM,COPY,COPY,ROGER,ROGER,0.296684
1533,FAKE,FAKE,SHAM,SHAM,OUT,OUT,OVER,OVER,0.295155
950,PUPIL,PUPIL,FAKE,FAKE,SHAM,SHAM,OUT,OUT,0.294170
...,...,...,...,...,...,...,...,...,...
1271,RETINA,RETINA,COPY,COPY,OUT,OUT,ALEJANDRO,ALEJANDRO,-0.021882
637,LENS,LENS,BOGUS,BOGUS,OUT,OUT,MICHELLE,MICHELLE,-0.023854
1479,BOGUS,BOGUS,OVER,OVER,MICHELLE,MICHELLE,STAN,STAN,-0.026303
1458,BOGUS,BOGUS,OUT,OUT,OVER,OVER,MICHELLE,MICHELLE,-0.026585


In [33]:
# The top-ranked candidate, i.e. the first group that will be guessed.
answers_df.iloc[0]

a               FAKE
a_origin        FAKE
b               SHAM
b_origin        SHAM
c               COPY
c_origin        COPY
d                OUT
d_origin         OUT
sim         0.331098
Name: 1526, dtype: object

In [34]:
# Play the puzzle with the ranked candidates: row 0 of `answers_df` is always
# the next guess, and each outcome prunes the remaining candidates.
tries = 0
correct = 0
# Stop after four wrong guesses, once three groups are solved, or when pruning
# has left no candidate to guess (the helpers read row 0 and would raise on an
# empty frame).
while (tries - correct) < 4 and correct < 3 and not answers_df.empty:
    tries += 1
    if check_win(answers_df):
        # Solved group: discard every candidate that reuses one of its words.
        answers_df = after_win(answers_df)
        correct += 1
    elif not check_one_away(answers_df):
        # No three of these words belong together: discard every candidate
        # that contains three or more of them.
        answers_df = not_one_away(answers_df)
    else:
        # One away: only this exact guess is ruled out.
        answers_df = answers_df.iloc[1:, :]

# With three groups solved, the remaining four words form the last group, so
# it is counted as one more correct try.
if correct == 3:
    correct += 1
    tries += 1

print(tries, correct)

['FAKE', 'SHAM', 'COPY', 'OUT']
['FAKE', 'SHAM', 'ROGER', 'STAN']
['LENS', 'FAKE', 'SHAM', 'OVER']
['ROGER', 'LOLA', 'MICHELLE', 'STAN']
4 0
