In [5]:
!pip install nltk scikit-learn gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [6]:
import nltk
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

In [7]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
documents = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love AI and data science",
    "Data science uses machine learning"
]

In [10]:
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

processed_docs = []

for doc in documents:
    tokens = word_tokenize(doc.lower())
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    processed_docs.append(" ".join(filtered))

print(processed_docs)

['love machine learning', 'machine learning amazing', 'love ai data science', 'data science uses machine learning']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [11]:
count_vectorizer = CountVectorizer()

bow_matrix = count_vectorizer.fit_transform(processed_docs)

print("Vocabulary:", count_vectorizer.get_feature_names_out())
print("\nBoW Matrix:\n", bow_matrix.toarray())

Vocabulary: ['ai' 'amazing' 'data' 'learning' 'love' 'machine' 'science' 'uses']

BoW Matrix:
 [[0 0 0 1 1 1 0 0]
 [0 1 0 1 0 1 0 0]
 [1 0 1 0 1 0 1 0]
 [0 0 1 1 0 1 1 1]]


In [12]:
bow_array = bow_matrix.toarray()

normalized_bow = bow_array / bow_array.sum(axis=1, keepdims=True)

print("Normalized BoW:\n", normalized_bow)

Normalized BoW:
 [[0.         0.         0.         0.33333333 0.33333333 0.33333333
  0.         0.        ]
 [0.         0.33333333 0.         0.33333333 0.         0.33333333
  0.         0.        ]
 [0.25       0.         0.25       0.         0.25       0.
  0.25       0.        ]
 [0.         0.         0.2        0.2        0.         0.2
  0.2        0.2       ]]


In [13]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(processed_docs)

print("TF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())

TF-IDF Vocabulary: ['ai' 'amazing' 'data' 'learning' 'love' 'machine' 'science' 'uses']

TF-IDF Matrix:
 [[0.         0.         0.         0.53256952 0.65782931 0.53256952
  0.         0.        ]
 [0.         0.74230628 0.         0.47380449 0.         0.47380449
  0.         0.        ]
 [0.59081908 0.         0.46580855 0.         0.46580855 0.
  0.46580855 0.        ]
 [0.         0.         0.45085176 0.36500336 0.         0.36500336
  0.45085176 0.57184829]]


In [14]:
tokenized_docs = [doc.split() for doc in processed_docs]
print(tokenized_docs)

[['love', 'machine', 'learning'], ['machine', 'learning', 'amazing'], ['love', 'ai', 'data', 'science'], ['data', 'science', 'uses', 'machine', 'learning']]


In [15]:
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

In [16]:
print("Vector for 'machine':\n", w2v_model.wv['machine'])

Vector for 'machine':
 [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-03
 

In [17]:
print(w2v_model.wv.most_similar('machine'))

[('data', 0.06797593832015991), ('uses', 0.009391162544488907), ('love', 0.0045030261389911175), ('learning', -0.010839177295565605), ('science', -0.023671656847000122), ('ai', -0.11410722136497498), ('amazing', -0.11555545777082443)]
