In [4]:
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import pandas as pd

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
text_data = newsgroups.data  # List of text documents

In [9]:
# Preprocess the text data
tokenized_data = [simple_preprocess(sentence) for sentence in text_data]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_data, vector_size=50, window=5, min_count=5, workers=4)

# Extract the vocabulary
vocabulary = list(model.wv.index_to_key)

# Display the vocabulary
vocab_df = pd.DataFrame(vocabulary, columns=["Word"])

In [12]:
# Find the most similar words to a given word
target_word = "president"  # Replace with your target word
try:
    similar_words = model.wv.most_similar(target_word, topn=10)  # Top 10 most similar words
    print(f"Words most similar to '{target_word}':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
except KeyError:
    print(f"Word '{target_word}' not in vocabulary.")

Words most similar to 'president':
clinton: 0.8553
administration: 0.7917
bill: 0.7679
bush: 0.7528
fbi: 0.7253
secretary: 0.7066
mr: 0.6841
official: 0.6820
senator: 0.6760
justice: 0.6733


In [13]:
import numpy as np

# Function to compute the embedding for a single sentence
def compute_sentence_embedding(sentence, model):
    words = simple_preprocess(sentence)  # Tokenize the sentence
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)  # Return zero vector if no words are in the vocabulary
    return np.mean(word_vectors, axis=0)  # Average word vectors


In [15]:
sentence = "How are you"
# Example: Compute embeddings for all sentences in the dataset
sentence_embeddings = compute_sentence_embedding(sentence, model)
sentence_embeddings

array([-0.60021406, -0.7225048 , -1.3429012 ,  0.81013507, -1.099954  ,
       -0.59514564,  0.9757893 ,  0.56658834, -2.8166597 , -0.04934786,
        0.12872076, -0.69832045,  0.55781406, -0.5861878 ,  0.6258878 ,
       -0.07903194,  0.8966462 ,  1.3026719 , -0.08121755, -1.0223274 ,
       -1.0340422 ,  2.7447214 ,  2.4990604 ,  0.37020826, -0.6636602 ,
       -0.6889419 ,  0.18692493, -1.2099571 , -1.1050545 ,  0.4042643 ,
        0.9875369 , -0.7448816 , -0.6635122 , -1.2843202 , -3.2205698 ,
        0.42511138, -1.714357  ,  0.7835681 ,  0.800467  ,  1.0767416 ,
        0.9121745 ,  0.8806696 ,  1.309316  ,  0.9831435 ,  1.491217  ,
       -0.51828104, -1.9555069 ,  1.6453228 ,  0.7392655 , -0.7711367 ],
      dtype=float32)

In [16]:
from gensim.models import FastText
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd


# Train FastText model
fasttext_model = FastText(sentences=tokenized_data, vector_size=50, window=3, min_count=1, workers=4)



In [17]:
# Function to get most similar words
def get_similar_words(word, model, topn=10):
    try:
        similar_words = model.wv.most_similar(word, topn=topn)
        print(f"Words most similar to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"{similar_word}: {similarity:.4f}")
    except KeyError:
        print(f"Word '{word}' not found in the vocabulary.")
# Example usage: Get similar words
target_word = "data"  # Replace with the word you want to analyze
get_similar_words(target_word, fasttext_model)

Words most similar to 'data':
plot_data: 0.9591
uartdata: 0.9482
okidata: 0.9459
disp_data: 0.9400
xloadimage: 0.9353
datacomm: 0.9323
lexidata: 0.9315
imagesoft: 0.9299
dataset: 0.9289
input_image: 0.9281


In [18]:
# Function to compute sentence embedding
def compute_sentence_embedding(sentence, model):
    words = simple_preprocess(sentence)  # Tokenize the sentence
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)  # Return zero vector if no words are in the vocabulary
    return np.mean(word_vectors, axis=0)  # Average word vectors
sentence = "How are you"
# Example: Compute embeddings for all sentences in the dataset
sentence_embeddings = compute_sentence_embedding(sentence, model)
sentence_embeddings


array([-0.60021406, -0.7225048 , -1.3429012 ,  0.81013507, -1.099954  ,
       -0.59514564,  0.9757893 ,  0.56658834, -2.8166597 , -0.04934786,
        0.12872076, -0.69832045,  0.55781406, -0.5861878 ,  0.6258878 ,
       -0.07903194,  0.8966462 ,  1.3026719 , -0.08121755, -1.0223274 ,
       -1.0340422 ,  2.7447214 ,  2.4990604 ,  0.37020826, -0.6636602 ,
       -0.6889419 ,  0.18692493, -1.2099571 , -1.1050545 ,  0.4042643 ,
        0.9875369 , -0.7448816 , -0.6635122 , -1.2843202 , -3.2205698 ,
        0.42511138, -1.714357  ,  0.7835681 ,  0.800467  ,  1.0767416 ,
        0.9121745 ,  0.8806696 ,  1.309316  ,  0.9831435 ,  1.491217  ,
       -0.51828104, -1.9555069 ,  1.6453228 ,  0.7392655 , -0.7711367 ],
      dtype=float32)

In [19]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Load a pre-trained BERT model (MiniLM)
model = SentenceTransformer('all-MiniLM-L6-v2')


query_embedding = model.encode(sentence, convert_to_tensor=True)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:

unique_words = set(word for sentence in tokenized_data for word in sentence)  # Flatten and deduplicate

# Convert unique words to a list for embedding
vocabulary = list(unique_words)


# Compute embeddings for all unique words in the vocabulary
print("Computing word embeddings for the vocabulary...")
word_embeddings = model.encode(vocabulary, convert_to_tensor=True)



Computing word embeddings for the vocabulary...


In [22]:
import torch
# Function to find similar words
def find_similar_words_bert(word, vocab, word_embeddings, top_n=10):
    try:
        # Compute embedding for the target word
        query_embedding = model.encode(word, convert_to_tensor=True)

        # Compute cosine similarities
        cosine_scores = util.pytorch_cos_sim(query_embedding, word_embeddings)

        # Get top-N similar words
        top_results = torch.topk(cosine_scores, k=top_n)
        print(f"Words most similar to '{word}':")
        for score, idx in zip(top_results.values[0], top_results.indices[0]):
            print(f"{vocab[idx]}: {score:.4f}")
    except Exception as e:
        print(f"Error: {e}")

# Example usage: Find similar words
target_word = "data"  # Replace with your target word
find_similar_words_bert(target_word, vocabulary, word_embeddings, top_n=10)

Words most similar to 'data':
data: 1.0000
datas: 0.8907
dataset: 0.7254
datasets: 0.7054
databank: 0.6697
information: 0.6413
informations: 0.6328
datapoints: 0.6215
datacomm: 0.6108
databook: 0.6030
