In [1]:
import os
from gensim.models import Word2Vec
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download the NLTK "punkt" tokenizer data (used by word_tokenize further down).
nltk.download("punkt")
# Load the "train" split of the sentence corpus via datasets.load_dataset.
dataset = load_dataset("agentlans/high-quality-english-sentences", split="train")
# Debug: print the first record to confirm which key holds the sentence text.
print("First dataset item:", dataset[0])
# Extract and preprocess sentences
def preprocess_sentences(dataset):
    """Turn dataset records into lowercase token lists.

    Args:
        dataset: Iterable of mapping-like records; the sentence is read from
            each record's "text" key (a missing key is treated as "").

    Returns:
        A list with one token list per record whose stripped text is
        non-empty, produced by ``word_tokenize`` on the lowercased sentence.
    """
    # Lazily pull and trim the raw text so each record is handled one at a time.
    trimmed = (record.get("text", "").strip() for record in dataset)
    # Blank sentences are dropped; everything else is lowercased then tokenized.
    return [word_tokenize(line.lower()) for line in trimmed if line]
# Tokenize the whole corpus
sentences = preprocess_sentences(dataset)

# Abort early when preprocessing produced nothing to train on
if not sentences:
    raise ValueError("No valid sentences found. Check dataset structure.")

# Fit a skip-gram (sg=1) Word2Vec model on the token lists
print("Building Word2Vec vocabulary...")
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, sg=1, epochs=4)
model = Word2Vec(sentences, **w2v_params)

# Note: despite its ".model" suffix, model_dir is created and used as a directory
model_dir = r"C:\Users\tiwar\OneDrive - Amity University\AIML\AIcore\word2vec.model"
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "my_word2vec.model")

# Persist the trained model inside that directory
model.save(model_path)
print(f"Model trained and saved at: {model_path}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tiwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


First dataset item: {'text': 'Soon we dropped into a living forest, where cold-tolerant evergreens and boreal animals still evoke the Canadian heritage of an ecosystem pushed south by glaciers 20,000 years ago.'}
Building Word2Vec vocabulary...
Model trained and saved at: C:\Users\tiwar\OneDrive - Amity University\AIML\AIcore\word2vec.model\my_word2vec.model


In [6]:
# Reload the model that the training cell saved to disk
model = Word2Vec.load(r"C:\Users\tiwar\OneDrive - Amity University\AIML\AIcore\word2vec.model\my_word2vec.model")

# Probe word for a quick similarity check (swap in any word from the corpus)
word = "monitor"
if word not in model.wv:
    # Out-of-vocabulary: report it rather than letting most_similar fail
    print(f"'{word}' not found in vocabulary.")
else:
    print(f"Top similar words to '{word}':")
    # Five nearest neighbours as (word, similarity) pairs
    print(model.wv.most_similar(word, topn=5))

Top similar words to 'monitor':
[('monitors', 0.7954384684562683), ('monitoring', 0.7879633903503418), ('track', 0.75963294506073), ('assess', 0.7521328330039978), ('evaluate', 0.7421494722366333)]


In [7]:
from gensim.models import Word2Vec
# Load the trained model from the file written by the training cell.
# model_path and model are notebook globals; test_word_similarity below reads `model`.
model_path = r"C:\Users\tiwar\OneDrive - Amity University\AIML\AIcore\word2vec.model\my_word2vec.model"
model = Word2Vec.load(model_path)
# Function to test similar words
def test_word_similarity(word):
    if word in model.wv.key_to_index:  # Corrected check
        similar_words = model.wv.most_similar(word, topn=5)
        print(f"Top 5 words similar to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"  {similar_word}: {similarity:.4f}")
    else:
        print(f"'{word}' not found in vocabulary.")
# Test words: probe words to run through test_word_similarity, one report each.
test_words = ["data", "science", "ai", "learning", "technology"]
# NOTE: the loop variable `word` is a notebook global and overwrites any earlier `word`.
for word in test_words:
    test_word_similarity(word)

Top 5 words similar to 'data':
  information: 0.8236
  datasets: 0.7703
  database: 0.7556
  anonymised: 0.7448
  dataset: 0.7434
Top 5 words similar to 'science':
  sciences: 0.8335
  mathematics: 0.8204
  humanities: 0.8082
  engineering: 0.8077
  biology: 0.8061
Top 5 words similar to 'ai':
  robotics: 0.8010
  bi: 0.7737
  rpa: 0.7701
  technology: 0.7632
  ict: 0.7582
Top 5 words similar to 'learning':
  teaching: 0.8454
  game-based: 0.8121
  coaching: 0.7971
  student-centered: 0.7928
  self-directed: 0.7910
Top 5 words similar to 'technology':
  technologies: 0.8429
  game-changing: 0.7941
  exascale: 0.7905
  robotics: 0.7892
  nanotechnology: 0.7809


In [8]:
#Cypher Logic
def _nearest_neighbour_substitute(text, model):
    """Replace every in-vocabulary token of ``text`` with its nearest neighbour.

    Shared implementation behind ``analogy`` and ``original``.

    Args:
        text: Raw string; split into tokens with ``word_tokenize``.
        model: Object exposing ``.wv`` with ``in`` membership and
            ``most_similar(word, topn=1)``.

    Returns:
        The substituted tokens joined by single spaces. Tokens not found in
        the vocabulary are passed through unchanged.
    """
    vectors = model.wv
    substituted = []
    for token in word_tokenize(text):
        # The training cell lowercases every token before fitting, so a
        # capitalised token (e.g. at sentence start) never matches as-is.
        # Try the exact token first, then fall back to its lowercase form.
        key = token if token in vectors else token.lower()
        if key in vectors:
            # most_similar returns (word, score) pairs; keep the top word only.
            substituted.append(vectors.most_similar(key, topn=1)[0][0])
        else:
            # Out-of-vocabulary (punctuation, rare words): keep the same token.
            substituted.append(token)
    return ' '.join(substituted)


def analogy(text, model):
    """"Encrypt" ``text`` by swapping each known word for its closest neighbour.

    Args:
        text: Plain input string.
        model: Trained Word2Vec-style model (must expose ``.wv``).

    Returns:
        Space-joined string of substituted tokens; unknown tokens are kept.
    """
    return _nearest_neighbour_substitute(text, model)


def original(encrypted_text, model):
    """"Decrypt" by applying the same nearest-neighbour substitution again.

    NOTE(review): this is the identical operation as ``analogy``, not a true
    inverse. The nearest neighbour of a word's nearest neighbour is not
    guaranteed to be the starting word, so the input of ``analogy`` may not
    be recovered.

    Args:
        encrypted_text: String previously produced by ``analogy``.
        model: Trained Word2Vec-style model (must expose ``.wv``).

    Returns:
        Space-joined string of substituted tokens; unknown tokens are kept.
    """
    return _nearest_neighbour_substitute(encrypted_text, model)

In [12]:
import random
# Load the trained Word2Vec model
model = Word2Vec.load(r"C:\Users\tiwar\OneDrive - Amity University\AIML\AIcore\word2vec.model\my_word2vec.model")
text = "minute"
# "Encrypt": replace each known word with its nearest neighbour
encrypted = analogy(text, model)
print("Analogy:", encrypted)
# "Decrypt": run the substitution again on the encrypted text
decrypted = original(encrypted, model)
# Show what was actually recovered. Printing the input `text` here would make
# the round trip look successful regardless of what `original` returned.
print("Original:", decrypted)

Analogy: seconds
Original: minute


In [None]:
# Report how many entries the loaded model's keyed vectors (model.wv) contain.
print(len(model.wv))

184204
