In [1]:
import pandas as pd
import numpy as np
import csv
import random
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df1=pd.read_csv("wordle_dataset.csv")
df1.columns

Index(['Unnamed: 0', 'word', 'meaning1', 'meaning2', 'meaning3', 'meaning4',
       'meaning5'],
      dtype='object')

In [3]:
df1 = df1.drop(['Unnamed: 0'],axis=1)

In [4]:
df1.head(5)

Unnamed: 0,word,meaning1,meaning2,meaning3,meaning4,meaning5
0,aachen,a city in western germany near the dutch and b...,formerly it was charlemagnes northern capital,aixlachapelle,aachen,aken
1,aardvark,nocturnal burrowing mammal of the grasslands o...,sole extant representative of the order tubuli...,anteater,ant bear,orycteropus afer
2,aaron,united states professional baseball player who...,old testament elder brother of moses and first...,henry louis aaron,aaron,hank aaron
3,aas,an associate degree in applied science,a dry form of lava resembling clinkers an inte...,associate in arts,aa,associate in applied science alcoholics anonym...
4,abaca,a kind of hemp obtained from the abaca plant i...,philippine banana tree having leafstalks that ...,manila hemp,manilla hemp,musa textilis


In [5]:
len(df1)

21440

In [6]:
#to read from a binary file
import pickle
file=open("bert_embeddings.bin","rb")
f=pickle.load(file)
#print(f)
file.close()

print(f['embedding1'][0])

len(f['embedding1'][0])

In [7]:
df2 = pd.DataFrame(f)
df2.head()

Unnamed: 0,embedding1,embedding2,embedding3,embedding4,embedding5
0,"[0.10552966, -0.2111916, -0.042547125, 0.13038...","[-0.7410908, 0.3037946, 1.0067592, -0.03756384...","[-0.70134115, -0.1560492, 1.8670809, 0.0931303...","[-0.42886475, 0.06789703, 2.4393466, 0.1563847...","[-0.14427099, 0.28605372, 1.3342835, 0.1262898..."
1,"[-0.15250994, 0.59140694, -0.34323326, 0.12957...","[-0.04384717, 0.6120334, 0.790435, 0.06521284,...","[0.009730791, 0.496809, 0.41013932, 0.1373665,...","[0.12894955, 1.4916507, -0.31830812, -0.232148...","[0.09932873, -0.13440955, 1.4096322, 0.3468186..."
2,"[-0.17691979, -0.27761042, -0.4785753, 0.32844...","[0.42689332, 1.4958502, -0.3949241, -0.3236406...","[-0.17629763, 0.86110705, 0.8352665, 0.2713195...","[-0.28148162, 0.32501546, 1.6964121, 0.1526436...","[-0.36929166, 1.0149219, 1.2935079, 0.23599564..."
3,"[-0.1955431, 0.28204358, 1.6014135, 0.2948678,...","[-0.5539297, 1.3073884, 0.76646256, 0.10328976...","[-0.20551097, 0.21470872, 1.7353882, 0.3388543...","[0.21038896, 0.10880705, 2.3038127, 0.29001892...","[-0.21374436, 1.316712, 0.68170404, -0.1837967..."
4,"[-0.08495724, -1.0941188, 0.16890968, 0.784468...","[-0.24078937, -0.09390615, -0.29544133, 0.4669...","[0.09469901, -0.7468935, 0.6064788, 0.5366701,...","[0.012085825, -1.0377014, 1.0099944, 0.3805281...","[-0.24604101, -0.117436886, 1.4289687, -0.1175..."


In [8]:
words_data = pd.concat([df1, df2], axis=1)

In [9]:
#only keep those rows where the length of the word is 5
words_data2 = words_data[words_data['word'].str.len() == 5] 
words_data2 = words_data2.reset_index(drop=True)
words_data2.head(5)

Unnamed: 0,word,meaning1,meaning2,meaning3,meaning4,meaning5,embedding1,embedding2,embedding3,embedding4,embedding5
0,aaron,united states professional baseball player who...,old testament elder brother of moses and first...,henry louis aaron,aaron,hank aaron,"[-0.17691979, -0.27761042, -0.4785753, 0.32844...","[0.42689332, 1.4958502, -0.3949241, -0.3236406...","[-0.17629763, 0.86110705, 0.8352665, 0.2713195...","[-0.28148162, 0.32501546, 1.6964121, 0.1526436...","[-0.36929166, 1.0149219, 1.2935079, 0.23599564..."
1,abaca,a kind of hemp obtained from the abaca plant i...,philippine banana tree having leafstalks that ...,manila hemp,manilla hemp,musa textilis,"[-0.08495724, -1.0941188, 0.16890968, 0.784468...","[-0.24078937, -0.09390615, -0.29544133, 0.4669...","[0.09469901, -0.7468935, 0.6064788, 0.5366701,...","[0.012085825, -1.0377014, 1.0099944, 0.3805281...","[-0.24604101, -0.117436886, 1.4289687, -0.1175..."
2,abase,cause to feel shame,hurt the pride of,humble,mortify,chagrin humiliate,"[-0.06568777, -0.018044014, 1.3753768, 0.29626...","[0.07058689, -0.023161145, 1.4258491, 0.644862...","[-0.03826209, -0.22021341, 1.4970418, 0.335306...","[0.50851375, 0.36827028, 2.4313066, 0.3968379,...","[0.020256568, 0.23802361, 1.9627985, 0.5381185..."
3,abate,make less active or intense,become less in amount or intensity,let up,slack off,die away slake slack,"[0.30710104, -0.19726573, 1.3976676, 0.1641309...","[0.18145579, -0.26938137, 1.3242697, 0.3711263...","[0.17421165, -0.14604211, 2.3641486, 0.2870952...","[0.17828284, -0.14433196, 2.1683278, 0.2864399...","[0.14779839, 0.17784989, 1.7109579, 0.3073285,..."
4,abele,a poplar that is widely cultivated in the unit...,has white bark and leaves with whitish undersu...,white aspen,white poplar,silverleaved poplar populus alba aspen poplar,"[-0.28837916, -0.18204403, 0.3476545, -0.10453...","[0.086071245, -0.24445783, -0.64354473, 0.1595...","[-0.6212785, 0.024624296, -0.50231385, 0.31722...","[-0.22014526, -0.23819354, -0.33803833, 0.1581...","[-0.10784195, 0.50255656, -0.11357013, 0.08043..."


In [10]:
len(words_data2)

2174

In [11]:
# Choose a random word from the dataset
def choose_random_word(df):
    index = random.randint(0,len(df))
    print("index : ",index)
    print("word chosen : ",words_data2["word"][index])
    return df["word"][index]

In [12]:
# Calculate score based on the number of tries
def calculate_score(tries):
    return 15 - (tries - 1)

### HINT 1: SEMANTICALLY SIMILAR WORD

In [13]:
from sentence_transformers import SentenceTransformer

In [14]:
model1 = SentenceTransformer('bert-base-nli-mean-tokens')

In [15]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [16]:
def semantic_similarity_column3(query_vec,df):
    l1 = []
    match3 = {}
    for index,sent in enumerate(df['embedding3']):
        sim = cosine(query_vec, sent)

        if sim>0.75:
            match3[index]=sim
            
    print(len(match3))

    s_match3 = sorted(match3.items(), key=lambda x: x[1], reverse=True)
    top_10_keys3 = [item[0] for item in s_match3[:10]]
    print(top_10_keys3)
    
    for i in top_10_keys3:
        print(df['word'][i])
        l1.append(df['word'][i])
    return l1

In [17]:
def semantic_similarity_column4(query_vec,df):
    l2 = []
    match4 = {}
    for index,sent in enumerate(df['embedding4']):
        sim = cosine(query_vec, sent)
        #print("Sent = ", sent, "; similarity = ", sim)

        if sim>0.75:
            match4[index]=sim

    print(len(match4))

    s_match4 = sorted(match4.items(), key=lambda x: x[1], reverse=True)
    top_10_keys4 = [item[0] for item in s_match4[:10]]
    print(top_10_keys4)
    
    for i in top_10_keys4:
        print(df['word'][i])
        l2.append(df['word'][i])
    return l2

In [18]:
def semantic_similarity_column5(query_vec,df):
    l3 = []
    match5 = {}
    for index,sent in enumerate(df['embedding5']):
        sim = cosine(query_vec, sent)
        #print("Sent = ", sent, "; similarity = ", sim)

        if sim>0.75:
            match5[index]=sim
    print(len(match5))

    s_match5 = sorted(match5.items(), key=lambda x: x[1], reverse=True)
    top_10_keys5 = [item[0] for item in s_match5[:10]]
    print(top_10_keys5)
    
    for i in top_10_keys5:
        print(df['word'][i])
        l3.append(df['word'][i])
    return l3

In [19]:
# Hint 1: Find a word semantically similar to the chosen word
def find_similar_word(query, df):
    query_vec = model1.encode([query])[0]
    l1 = semantic_similarity_column3(query_vec,words_data)
    l2 = semantic_similarity_column4(query_vec,words_data)
    l3 = semantic_similarity_column5(query_vec,words_data)
    
    final_list = l1+l2+l3
    final_list = sorted(list(set(final_list)))
    
    if query in final_list:
        final_list.remove(query)
    print(final_list)
    
    length = len(final_list)
    list_for_hint = []
    for i in range(5):
        random_index = random.randint(0,length-1)
        if final_list[random_index] not in list_for_hint:
            list_for_hint.append(final_list[random_index])
    print(list_for_hint)
    return list_for_hint

In [20]:
# Hint 2: Keyword extraction from the meanings
def extract_keywords(meanings):
    # Combine meanings into a single string
    text = ' '.join(meanings)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    # Calculate frequency distribution of words
    word_freq = Counter(filtered_tokens)
    # Return most common keywords
    return [word for word, _ in word_freq.most_common(5)]

In [21]:
def wordle_game(words_data,words_data2):
    chosen_word = choose_random_word(words_data2)
    tries = 1
    score = 15
    hint_used = {'semantic': False, 'keyword': False}

    print("Welcome to Wordle! You have 7 tries to guess the word.")

    while tries <= 7:
        guess = input(f"\nAttempt {tries}: Enter your guess (or type 'give up' to give up): ").lower()

        if guess == 'give up':
            print(f"Sorry, you've given up. The word was '{chosen_word}'!")
            score -= 8
            break

        if guess == chosen_word.lower():
            print(f"Congratulations! You've guessed the word '{chosen_word}' correctly!")
            score += 10
            break

        score -= 2

        correct_letters = sum(1 for x, y in zip(guess, chosen_word) if x == y)
        misplaced_letters = sum(min(guess.count(letter), chosen_word.count(letter)) for letter in set(guess)) - correct_letters

        correct_letters = 0
        misplaced_letters = 0
        correct_positions = []
        correct_letters_wrong_position = []

        # Check for correct letters in correct positions and correct letters in wrong positions
        for idx, letter in enumerate(guess):
            if letter == chosen_word[idx]:
                correct_letters += 1
                correct_positions.append(letter)
            elif letter in chosen_word:
                misplaced_letters += 1
                correct_letters_wrong_position.append(letter)

        print(f"Correct letters in correct positions: {correct_positions}")
        print(f"Correct letters in wrong positions: {correct_letters_wrong_position}")
        print(f"Your current score: {score}")
        
        '''print(f"Correct letters in correct positions: {correct_letters}")
        print(f"Correct letters in wrong positions: {misplaced_letters}")
        print(f"Your current score: {score}")'''

        tries += 1

        if tries == 2:
            hint_choice = input("Do you want to use a hint? (yes/no): ").lower()
            if hint_choice == 'yes':
                if not hint_used['semantic']:
                    hint_used['semantic'] = True
                    similar_word = find_similar_word(chosen_word, words_data)
                    print(f"\nHint 1: A word similar to the chosen word is '{similar_word}'.")
                else:
                    print("You have already used the semantic similarity hint.")
            else:
                print("No hint used.")

        elif tries > 2:
            hint_choice = input("Do you want to use a hint? (yes/no): ").lower()
            if hint_choice == 'yes':
                if not hint_used['keyword']:
                    hint_used['keyword'] = True
                    print("Hint 2: Keywords extracted from the meanings are:", extract_keywords([chosen_word['meaning1'], chosen_word['meaning2']]))
                else:
                    print("You have already used both hints.")
            else:
                print("No hint used.")

    if tries > 7:
        print(f"\nSorry, you've run out of tries! The word was '{chosen_word}'.")

In [22]:
wordle_game(words_data,words_data2)

index :  1922
word chosen :  thorn
Welcome to Wordle! You have 7 tries to guess the word.

Attempt 1: Enter your guess (or type 'give up' to give up): throb
Correct letters in correct positions: ['t', 'h']
Correct letters in wrong positions: ['r', 'o']
Your current score: 13
Do you want to use a hint? (yes/no): yes
1870
[14508, 14510, 19003, 3039, 3049, 4236, 6616, 8808, 8815, 11606]
pricker
prickle
thorn
center
centre
core
essence
heart
hearts
marrow
1841
[2596, 17696, 181, 1957, 10281, 18768, 14507, 10142, 10512, 12822]
burred
spiny
acerbity
bitterness
jaundice
tartness
prick
inwardness
kernel
nub
231
[4006, 20482, 14817, 8974, 10887, 6769, 20233, 4453, 11713, 18184]
conscience
vellicate
punishable
hind
leipoa
excruciate
unswept
crepe
maze
strive
['acerbity', 'bitterness', 'burred', 'center', 'centre', 'conscience', 'core', 'crepe', 'essence', 'excruciate', 'heart', 'hearts', 'hind', 'inwardness', 'jaundice', 'kernel', 'leipoa', 'marrow', 'maze', 'nub', 'prick', 'pricker', 'prickle',

IndexError: string index out of range