In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/computer/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/computer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
text = """
Machine learning (ML) is a field of study in artificial intelligence concerned with the 
development and study of statistical algorithms that can learn from data and generalise 
to unseen data, and thus perform tasks without explicit instructions. Within a subdiscipline 
in machine learning, advances in the field of deep learning have allowed neural networks, a 
class of statistical algorithms, to surpass many previous machine learning approaches in 
performance.ML finds application in many fields, including natural language processing, 
computer vision, speech recognition, email filtering, agriculture, and medicine. The 
application of ML to business problems is known as predictive analytics. Statistics and 
mathematical optimisation (mathematical programming) methods comprise the foundations of 
machine learning. Data mining is a related field of study, focusing on exploratory data 
analysis (EDA) via unsupervised learning. From a theoretical viewpoint, probably 
approximately correct learning provides a framework for describing machine learning. 
"""

## Deleting stopwords and splitting into tokens

In [8]:
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
sentences = [filtered_tokens]

In [9]:
sentences

[['machine',
  'learning',
  'ml',
  'field',
  'study',
  'artificial',
  'intelligence',
  'concerned',
  'development',
  'study',
  'statistical',
  'algorithms',
  'learn',
  'data',
  'generalise',
  'unseen',
  'data',
  'thus',
  'perform',
  'tasks',
  'without',
  'explicit',
  'instructions',
  'within',
  'subdiscipline',
  'machine',
  'learning',
  'advances',
  'field',
  'deep',
  'learning',
  'allowed',
  'neural',
  'networks',
  'class',
  'statistical',
  'algorithms',
  'surpass',
  'many',
  'previous',
  'machine',
  'learning',
  'approaches',
  'finds',
  'application',
  'many',
  'fields',
  'including',
  'natural',
  'language',
  'processing',
  'computer',
  'vision',
  'speech',
  'recognition',
  'email',
  'filtering',
  'agriculture',
  'medicine',
  'application',
  'ml',
  'business',
  'problems',
  'known',
  'predictive',
  'analytics',
  'statistics',
  'mathematical',
  'optimisation',
  'mathematical',
  'programming',
  'methods',
  'compris

## Calculating vector interpretaion of word

In [10]:
model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)
print("Vector for word 'machine'", model.wv["machine"])

Vector for word 'machine' [-0.01631425  0.00901535 -0.00825099  0.00163533  0.01693982 -0.00895738
  0.00903409 -0.01351188 -0.00708498  0.01874771 -0.00311883  0.00058286
 -0.0082069  -0.01532142 -0.00301444  0.00495449 -0.00174936  0.0110826
 -0.00553676  0.00450656  0.01092675  0.01672545 -0.0028462  -0.01842331
  0.00877729  0.00115905  0.0148397  -0.00156889 -0.0052758  -0.01750296
 -0.00171949  0.00558985  0.0108261   0.01407385 -0.01143357  0.00371087
  0.0122717  -0.00959388 -0.00622889  0.01357524  0.0032589   0.00038444
  0.0069609   0.00037815  0.01927774  0.01006685 -0.01782489 -0.01407044
  0.00182221  0.01281681]


## Calculating cosine similarity between words

In [15]:
def cosine_sim(word1, word2):
    vec1 = model.wv[word1].reshape(1, -1)
    vec2 = model.wv[word2].reshape(1, -1)
    similarity = cosine_similarity(vec1, vec2)[0][0]
    return similarity

word1 = "data"
word2 = "learning"
print(f"Cosine similarity between words '{word1}' and '{word2}': {cosine_sim(word1, word2):.4f}")


Cosine similarity between words 'data' and 'learning': 0.1289


## Finding most similar words

In [23]:
similar_words = model.wv.most_similar("data", topn=5)
print("Most similar words to word 'language':")
for word, score in similar_words:
    print(f"{word}: {score:.4f}")

Most similar words to word 'language':
business: 0.3415
email: 0.3258
artificial: 0.2512
intelligence: 0.2487
agriculture: 0.2033
