In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
import random

In [2]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.6B.300d.txt", 
               word2vec_output_file="gensim_glove_vectors.txt")

  glove2word2vec(glove_input_file="glove.6B.300d.txt",


(400000, 300)

In [3]:
# Load GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('gensim_glove_vectors.txt', 
                                                binary=False)

In [4]:
# Load fastText embeddings
# Load a pre-trained FastText model
fasttext_model = KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

In [5]:
# Load BERT model
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [6]:
distilbert_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [7]:
# Load your dataset
data = pd.read_csv('wordle_dataset.csv')
data = data.dropna()

In [8]:
# Preprocess dataset
words = data['word'].values.tolist()
synonyms = data.drop(columns=['Unnamed: 0','word','meaning1','meaning2','meaning5']).values.flatten().tolist()
unique_synonyms = list(set(synonyms))
all_words = words + unique_synonyms

In [9]:
glove_embeddings = np.array([glove_model[word] if word in glove_model else np.zeros(glove_model.vector_size) for word in all_words])

In [10]:
fasttext_embeddings = np.array([fasttext_model[word] if word in fasttext_model else np.zeros(fasttext_model.vector_size) for word in all_words])

In [11]:
bert_embeddings = bert_model.encode(all_words)

In [12]:
distil_embeddings = distilbert_model.encode(all_words)

In [13]:
# Perform clustering separately for GloVe, fastText, and BERT embeddings
def perform_clustering(embeddings, n_clusters=3):
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(embeddings)
    return kmeans, clusters

In [14]:
kmeans_glove, clusters_glove = perform_clustering(glove_embeddings)



In [15]:
kmeans_fasttext, clusters_fasttext = perform_clustering(fasttext_embeddings)



In [16]:
kmeans_bert, clusters_bert = perform_clustering(bert_embeddings)



In [17]:
kmeans_distil, clusters_distil = perform_clustering(distil_embeddings)



# Choose a word at random
random_word = random.choice(words)

In [18]:
random_word = 'accelerate'

In [19]:
# Embed the random word using GloVe, fastText, and BERT
random_word_glove_embedding = glove_model[random_word] if random_word in glove_model else np.zeros(glove_model.vector_size)

In [20]:
random_word_fasttext_embedding = fasttext_model[random_word] if random_word in fasttext_model else np.zeros(fasttext_model.vector_size)

In [21]:
random_word_bert_embedding = bert_model.encode([random_word])[0]

In [22]:
random_word_distil_embedding = distilbert_model.encode([random_word])[0]

In [23]:
# Find top 10 words closest to the chosen word based on the embeddings in each cluster
def find_similar_words_in_clusters(embeddings, cluster_labels, chosen_word_embedding, all_words, k=10):
    similar_words = []
    for cluster_id in range(cluster_labels.max() + 1):
        cluster_indices = np.where(cluster_labels == cluster_id)[0]
        cluster_embeddings = embeddings[cluster_indices]
        cluster_words = np.array(all_words)[cluster_indices]
        distances = np.linalg.norm(cluster_embeddings - chosen_word_embedding, axis=1)
        closest_indices = np.argsort(distances)[:k]
        similar_words.extend(cluster_words[closest_indices])
    return similar_words

In [24]:
similar_words_glove = find_similar_words_in_clusters(glove_embeddings, clusters_glove, 
                                                     random_word_glove_embedding, 
                                                     all_words)

In [25]:
similar_words_fasttext = find_similar_words_in_clusters(fasttext_embeddings, clusters_fasttext, 
                                                        random_word_fasttext_embedding,
                                                        all_words)

In [26]:
similar_words_bert = find_similar_words_in_clusters(bert_embeddings, clusters_bert, 
                                                    random_word_bert_embedding,
                                                    all_words)

In [27]:
similar_words_distil = find_similar_words_in_clusters(distil_embeddings, clusters_distil, 
                                                    random_word_distil_embedding,
                                                    all_words)

In [28]:
# Compare the similar words returned by the three embeddings
print("Random Word:", random_word)

Random Word: accelerate


In [29]:
similar_words_glove = list(set(similar_words_glove))
similar_words_fasttext = list(set(similar_words_fasttext))
similar_words_bert = list(set(similar_words_bert))
similar_words_distil = list(set(similar_words_distil))

In [30]:
print("Top 10 Similar Words (GloVe):", similar_words_glove)

Top 10 Similar Words (GloVe): ['sir james dewar', 'oddment', 'push', 'constrain', 'boost', 'accelerate', 'blood relation', 'facilitate', 'mental rejection', 'decelerate', 'impede', 'hasten', 'enable', 'step in', 'slow', 'conjunctivitis arida', 'invigorate', 'nystan', 'jack london', 'sea puss', 'plimsoll mark', 'retard', 'accelerated', 'stymie']


In [31]:
print("Top 10 Similar Words (fastText):", similar_words_fasttext)

Top 10 Similar Words (fastText): ['button quail', 'accelerate', 'intensify', 'coss', 'reduce', 'tormentor', 'artocarpus communis', 'facilitate', 'accelerated', 'initiate', 'pachouli', 'highrisk', 'visavis', 'hasten', 'increase', 'aachen', 'sir hiram stevens maxim', 'log zs']


In [32]:
print("Top 10 Similar Words (BERT):", similar_words_bert)

Top 10 Similar Words (BERT): ['speed up', 'rev up', 'boost', 'accelerate', 'drive', 'veering', 'friction', 'scrambled', 'multiplied', 'motored', 'quicken', 'forwardlooking', 'thrust', 'expand', 'airheaded', 'speed', 'jerking', 'speeding', 'quickening', 'frenzy', 'accelerated', 'superhighway', 'embolden']


In [33]:
print("Top 10 Similar Words (Distil):", similar_words_distil)

Top 10 Similar Words (Distil): ['speed up', 'burgeon forth', 'rev up', 'frantic', 'rolling wave', 'swashbuckling', 'supercharge', 'rushed', 'accelerate', 'faster', 'accelerator', 'fuss', 'push forward', 'runtiness', 'wideawake', 'inflame', 'expand', 'commotion', 'increase', 'joggle', 'frenzy', 'accelerated', 'westward']


In [34]:
# Compare the similar words returned by the three embeddings
print("Random Word:", random_word)
print(len(similar_words_glove))
print(len(similar_words_fasttext))
print(len(similar_words_bert))


Random Word: accelerate
24
18
23
