In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Toy corpus: ten short, mostly unrelated English sentences. Each entry is one
# raw (untokenized) sentence; they are tokenized and then used as the training
# text for the Word2Vec model later in this notebook.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world we live in.",
    "Deep learning techniques have greatly improved image recognition.",
    "Natural language processing allows computers to understand human language.",
    "Data science combines statistics, computer science, and domain knowledge.",
    "The weather is nice today, perfect for a walk in the park.",
    "Cats are often seen as independent and curious creatures.",
    "The stock market fluctuates based on various economic indicators.",
    "Exploring new cuisines can be an exciting culinary adventure.",
    "Machine learning algorithms can learn from data and make predictions.",
]

In [7]:
# Build the English stop-word set once, at cell level, so preprocess_text does
# not reload the corpus and rebuild the set on every call.
# NOTE: this cell needs the NLTK 'stopwords' corpus and the Punkt tokenizer
# data to be installed locally (see nltk.download); otherwise NLTK raises
# LookupError.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Tokenize a sentence into lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: A raw sentence as a string.

    Returns:
        A list of tokens, in their original order. The text is lowercased
        before tokenizing; tokens that are not purely alphabetic (punctuation,
        numbers, mixed tokens) and tokens found in the module-level
        ``stop_words`` set are dropped.
    """
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

In [8]:
# Tokenize every sentence of the corpus; the result is one token list per sentence.
preprocessed_sentences = list(map(preprocess_text, sentences))

In [9]:
# Show the tokenized corpus: one list of lowercase tokens per input sentence.
preprocessed_sentences

[['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'],
 ['artificial', 'intelligence', 'transforming', 'world', 'live'],
 ['deep',
  'learning',
  'techniques',
  'greatly',
  'improved',
  'image',
  'recognition'],
 ['natural',
  'language',
  'processing',
  'allows',
  'computers',
  'understand',
  'human',
  'language'],
 ['data',
  'science',
  'combines',
  'statistics',
  'computer',
  'science',
  'domain',
  'knowledge'],
 ['weather', 'nice', 'today', 'perfect', 'walk', 'park'],
 ['cats', 'often', 'seen', 'independent', 'curious', 'creatures'],
 ['stock',
  'market',
  'fluctuates',
  'based',
  'various',
  'economic',
  'indicators'],
 ['exploring', 'new', 'cuisines', 'exciting', 'culinary', 'adventure'],
 ['machine', 'learning', 'algorithms', 'learn', 'data', 'make', 'predictions']]

In [12]:
# Train a skip-gram Word2Vec model on the tokenized toy corpus.
# Reproducibility: gensim documents that a deterministic run needs a fixed
# seed AND a single worker thread, because multi-threaded training introduces
# ordering jitter from OS thread scheduling. The corpus here is tiny, so one
# worker costs nothing. (The gensim docs additionally mention PYTHONHASHSEED
# for reproducibility across interpreter launches.)
model = Word2Vec(
    sentences=preprocessed_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # max distance between target and context word
    min_count=1,      # keep every word, even those seen only once
    workers=1,        # single thread so results are repeatable
    sg=1,             # 1 = skip-gram, 0 = CBOW
    seed=1,           # gensim's default seed, made explicit
)

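<!-- 1. sentences=preprocessed_sentences
Description: The training corpus, given as an iterable of tokenized sentences (each sentence is a list of word strings).

2. vector_size=100
Description: This sets the number of dimensions (features) of the word vectors.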

Default: 100

Higher values: More expressive vectors but require more training data and time.

Lower values: Less detailed, but faster to train and require less memory.

3. window=5
Description: The maximum distance between the current word and the predicted context words.

Example: If the window is 5, then the model considers up to 5 words to the left and right of the target word.

4. min_count=1
Description: Ignores all words with total frequency lower than this.

Use case: If set to 1, all words are included (even rare ones).

Higher value: Removes infrequent words (helps reduce noise and memory use).

5. workers=4
Description: Number of worker threads to train the model (parallel training).

More workers: Faster training on multi-core systems.

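Recommended: Set this to the number of cores in your CPU for faster training on large corpora. Note that using more than one worker makes the results vary slightly from run to run, even with a fixed seed; use a single worker when exact reproducibility matters.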

6. sg=1
Description: Defines the training algorithm.

sg=0: CBOW (Continuous Bag of Words)

sg=1: Skip-gram

Skip-gram (sg=1) is better for infrequent words.

CBOW (sg=0) is faster and works well for frequent words. -->

In [13]:
# Cosine similarity between the vectors of two words. A word that is not in
# the model's vocabulary raises KeyError, which is reported instead of
# aborting the cell.
try:
    similarity_new_cuisines = model.wv.similarity("new", "cuisines")
except KeyError as e:
    print(f"key error :{e}")
else:
    print(f"similarity between them : {similarity_new_cuisines}")
    
    

similarity between them : 0.11011417955160141


In [27]:
# Ten nearest neighbours of 'deep' by cosine similarity (topn=10 is the default).
model.wv.most_similar(positive=['deep'], topn=10)


[('combines', 0.31906330585479736),
 ('fox', 0.1888551265001297),
 ('recognition', 0.16205263137817383),
 ('park', 0.14646083116531372),
 ('dog', 0.1276959627866745),
 ('learn', 0.12720921635627747),
 ('independent', 0.12247155606746674),
 ('adventure', 0.12195171415805817),
 ('statistics', 0.11074147373437881),
 ('lazy', 0.110305555164814)]

In [29]:
# Raw learned embedding for 'deep' (a vector of length vector_size).
model.wv.get_vector('deep')


array([ 5.6267120e-03,  5.4973708e-03,  1.8291199e-03,  5.7494068e-03,
       -8.9680776e-03,  6.5593575e-03,  9.2259916e-03, -4.2071473e-03,
        1.6075504e-03, -5.2338815e-03,  1.0582185e-03,  2.7701687e-03,
        8.1607364e-03,  5.4401276e-04,  2.5570584e-03,  1.2977350e-03,
        8.4025227e-03, -5.7077026e-03, -6.2618302e-03, -3.6275184e-03,
       -2.3005498e-03,  5.0410628e-03, -8.1203571e-03, -2.8335357e-03,
       -8.1974268e-03,  5.1497100e-03, -2.5680638e-03, -9.0671070e-03,
        4.0717293e-03,  9.0173231e-03, -3.0376601e-03, -5.8385395e-03,
        3.0198884e-03, -4.3584823e-04, -9.9794362e-03,  8.4177041e-03,
       -7.3388875e-03, -4.9304068e-03, -2.6570810e-03, -5.4523144e-03,
        1.7165100e-03,  9.7128144e-03,  4.5722723e-03,  8.0886027e-03,
       -4.7045827e-04,  6.4492342e-04, -2.6683521e-03, -8.7795611e-03,
        3.4313034e-03,  2.0933736e-03, -9.4218543e-03, -4.9684369e-03,
       -9.7340988e-03, -5.7197916e-03,  4.0645422e-03,  8.6428607e-03,
      