Apply the bag-of-words approach (count occurrence and normalized count occurrence) and TF-IDF
to a small text dataset, then create word embeddings using Word2Vec.

1. Install & Import Required Libraries

In [None]:
!pip install nltk scikit-learn gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp312-cp312-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   --- ------------------------------------ 2.1/24.4 MB 10.7 MB/s eta 0:00:03
   -------- ------------------------------- 5.2/24.4 MB 12.8 MB/s eta 0:00:02
   --------------- ------------------------ 9.4/24.4 MB 15.5 MB/s eta 0:00:01
   -------------------- ------------------- 12.6/24.4 MB 15.2 MB/s eta 0:00:01
   -------------------------- ------------- 16.0/24.4 MB 15.3 MB/s eta 0:00:01
   ------------------------------- -------- 19.4/24.4 MB 15.5 MB/s eta 0:00:01
   -------------------------------------- - 23.3/24.4 MB 15.5 MB/s eta 0:00:01
   ---------------------------------------  24.4/24.4 MB 15.8 MB/s eta 0:00:01
   ---------------------------------------  24.4/24.4 MB 15.8 MB/s eta 0:00:01
   ---------------------------------------- 24.4/24.4 MB 12.6 MB/s  0:00


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [3]:
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the 'punkt_tab' package, older ones from the pickled
# 'punkt' package, so request both to keep a fresh environment working.
# (A package that cannot be fetched is reported by nltk.download via its
# return value / printed message rather than an exception.)
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

2. Sample Dataset

In [4]:
# Toy corpus: one short sentence per document. Every later section
# (bag of words, TF-IDF, Word2Vec) is computed from this list.
documents = [
    "AI is changing the world",
    "Machine learning is a part of AI",
    "Deep learning powers modern AI",
]


3. Bag of Words – Count Occurrence

In [5]:
# Learn the vocabulary and build the document-term count matrix in one pass.
# With its default settings CountVectorizer lowercases the text and ignores
# one-character tokens, which is why "a" does not appear in the vocabulary.
count_vectorizer = CountVectorizer()
bow_count = count_vectorizer.fit_transform(documents)

# Column j of the matrix corresponds to vocabulary entry j.
print("Vocabulary:", count_vectorizer.get_feature_names_out(), sep="\n")

# fit_transform returns a sparse matrix; densify it only for display.
print("\nBag of Words (Count Occurrence):", bow_count.toarray(), sep="\n")


Vocabulary:
['ai' 'changing' 'deep' 'is' 'learning' 'machine' 'modern' 'of' 'part'
 'powers' 'the' 'world']

Bag of Words (Count Occurrence):
[[1 1 0 1 0 0 0 0 0 0 1 1]
 [1 0 0 1 1 1 0 1 1 0 0 0]
 [1 0 1 0 1 0 1 0 0 1 0 0]]


4. Bag of Words – Normalized Count Occurrence

In [6]:
bow_array = bow_count.toarray()

# Normalize each row by the number of tokens the vectorizer counted in that
# document, so every non-empty row sums to 1. Note this is the count of *kept*
# tokens, not raw words: the second document has 7 words but a total of 6,
# because the vectorizer drops the one-character token "a".
row_totals = bow_array.sum(axis=1, keepdims=True)

# A document with no counted tokens has a total of 0; dividing by it would
# produce NaN (and a RuntimeWarning). Leave such rows as all zeros instead.
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

print("\nBag of Words (Normalized Count):")
print(normalized_bow)



Bag of Words (Normalized Count):
[[0.2        0.2        0.         0.2        0.         0.
  0.         0.         0.         0.         0.2        0.2       ]
 [0.16666667 0.         0.         0.16666667 0.16666667 0.16666667
  0.         0.16666667 0.16666667 0.         0.         0.        ]
 [0.2        0.         0.2        0.         0.2        0.
  0.2        0.         0.         0.2        0.         0.        ]]


5. TF-IDF (Term Frequency – Inverse Document Frequency)

In [7]:
# Fit a TF-IDF model on the same corpus, using scikit-learn's default settings.
# Terms that occur in fewer documents receive larger weights than terms that
# occur in all of them (compare 'changing' with 'ai' in the printed matrix).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column j of the matrix corresponds to vocabulary entry j.
print("TF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")

# The result is sparse; densify it only for display.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

TF-IDF Vocabulary:
['ai' 'changing' 'deep' 'is' 'learning' 'machine' 'modern' 'of' 'part'
 'powers' 'the' 'world']

TF-IDF Matrix:
[[0.29803159 0.50461134 0.         0.38376993 0.         0.
  0.         0.         0.         0.         0.50461134 0.50461134]
 [0.27824521 0.         0.         0.35829137 0.35829137 0.4711101
  0.         0.4711101  0.4711101  0.         0.         0.        ]
 [0.29803159 0.         0.50461134 0.         0.38376993 0.
  0.50461134 0.         0.         0.50461134 0.         0.        ]]


6. Word2Vec Embeddings

In [8]:
# Tokenize sentences: lowercase every document, then split it into word tokens.
# Word2Vec expects its input as a list of token lists, one per sentence.
tokenized_docs = [
    nltk.word_tokenize(lowered_text)
    for lowered_text in map(str.lower, documents)
]
print(tokenized_docs)


[['ai', 'is', 'changing', 'the', 'world'], ['machine', 'learning', 'is', 'a', 'part', 'of', 'ai'], ['deep', 'learning', 'powers', 'modern', 'ai']]


In [9]:
# Train the Word2Vec model on the tokenized corpus.
# Reproducibility: gensim documents that a deterministic run needs a fixed seed
# AND a single worker thread, because multiple workers introduce ordering jitter
# from OS thread scheduling. seed=1 is gensim's default, spelled out here so the
# choice is visible; workers=1 costs nothing on a corpus this small.
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,  # dimensionality of each word vector
    window=3,        # max distance between a target word and its context words
    min_count=1,     # keep every token; with three sentences nothing is frequent
    workers=1,
    seed=1,
)


In [10]:
# Get word embeddings: fetch the learned vector for the token 'ai'.
# Its length equals the vector_size the model was trained with.
print("\nWord2Vec Embedding for 'ai':")
print(w2v_model.wv.get_vector('ai'))



Word2Vec Embedding for 'ai':
[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]


In [11]:
# Similar words: list the vocabulary entries closest to 'learning' together with
# their similarity scores (most_similar returns the top 10 by default).
# NOTE(review): with only three training sentences the scores are close to zero
# and should not be read as meaningful semantic similarity.
print("\nWords similar to 'learning':")
print(w2v_model.wv.most_similar(positive='learning'))



Words similar to 'learning':
[('powers', 0.12486250698566437), ('changing', 0.08061248809099197), ('of', 0.07399576157331467), ('ai', 0.04237300902605057), ('deep', 0.018277151510119438), ('is', 0.011071980930864811), ('machine', 0.0013571369927376509), ('the', -0.1094222441315651), ('a', -0.11910455673933029), ('modern', -0.17424818873405457)]
