In [1]:
documents = [
    "I study machine learning",
    "Machine learning is fun",
    "I study deep learning"
]

In [2]:
# Bag of Words – Count Occurrence
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow_counts = vectorizer.fit_transform(documents)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("Count Matrix:\n", bow_counts.toarray())

Vocabulary: ['deep' 'fun' 'is' 'learning' 'machine' 'study']
Count Matrix:
 [[0 0 0 1 1 1]
 [0 1 1 1 1 0]
 [1 0 0 1 0 1]]


In [3]:
# Bag of Words – Normalized Count Occurrence
import numpy as np

count_matrix = bow_counts.toarray()
normalized_bow = count_matrix / count_matrix.sum(axis=1, keepdims=True)

print("Normalized BoW:\n", normalized_bow)

Normalized BoW:
 [[0.         0.         0.         0.33333333 0.33333333 0.33333333]
 [0.         0.25       0.25       0.25       0.25       0.        ]
 [0.33333333 0.         0.         0.33333333 0.         0.33333333]]


In [4]:
# TF-IDF (Term Frequency – Inverse Document Frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

Vocabulary: ['deep' 'fun' 'is' 'learning' 'machine' 'study']
TF-IDF Matrix:
 [[0.         0.         0.         0.48133417 0.61980538 0.61980538]
 [0.         0.5844829  0.5844829  0.34520502 0.44451431 0.        ]
 [0.72033345 0.         0.         0.42544054 0.         0.54783215]]


In [5]:
# Word2Vec Embeddings
tokenized_docs = [doc.lower().split() for doc in documents]

In [6]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [7]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

In [8]:
vector = w2v_model.wv["learning"]
print("Vector size:", len(vector))
print(vector)

Vector size: 100
[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.508

In [9]:
import numpy as np

def document_vector(doc):
    words = doc.lower().split()
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.mean(vectors, axis=0)

doc_embedding = document_vector("I study machine learning")
print("Document embedding shape:", doc_embedding.shape)

Document embedding shape: (100,)
