In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# Toy corpus shared by the vectorization examples in this notebook.
corpus = [
    "the quick brown foxes",
    "the lazy dog",
    "foxes jump over the dog",
]

# Bag-of-words: learn the vocabulary and build the raw term-count matrix.
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(corpus)

# Row-wise L1 normalization: each document's counts are divided by the
# document's total count, so every row of the result sums to 1.
X_normalized = normalize(X_counts, norm='l1', axis=1)

# Report the learned vocabulary followed by both matrices in dense form.
for label, values in (
    ("Vocabulary:", vectorizer.get_feature_names_out()),
    ("Counts:\n", X_counts.toarray()),
    ("Normalized:\n", X_normalized.toarray()),
):
    print(label, values)

Vocabulary: ['brown' 'dog' 'foxes' 'jump' 'lazy' 'over' 'quick' 'the']
Counts:
 [[1 0 1 0 0 0 1 1]
 [0 1 0 0 1 0 0 1]
 [0 1 1 1 0 1 0 1]]
Normalized:
 [[0.25       0.         0.25       0.         0.         0.
  0.25       0.25      ]
 [0.         0.33333333 0.         0.         0.33333333 0.
  0.         0.33333333]
 [0.         0.2        0.2        0.2        0.         0.2
  0.         0.2       ]]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (library defaults) on the same toy corpus.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(corpus)

# Densify the sparse result only for display; fine for a 3-document corpus.
tfidf_dense = X_tfidf.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

TF-IDF Matrix:
 [[0.5844829  0.         0.44451431 0.         0.         0.
  0.5844829  0.34520502]
 [0.         0.54783215 0.         0.         0.72033345 0.
  0.         0.42544054]
 [0.         0.40619178 0.40619178 0.53409337 0.         0.53409337
  0.         0.31544415]]


In [9]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [12]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' tokenizer data before word_tokenize is called below.
nltk.download('punkt_tab')

# Seed for Word2Vec so the printed vector and neighbours are repeatable.
W2V_SEED = 42

# Prepare data (Word2Vec requires a list of tokenized sentences)
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Train Word2Vec model
# vector_size: dimensionality of the word vectors
# window: maximum distance between current and predicted word
# min_count=1: keep every token; with three short sentences nothing repeats enough to filter
# seed + workers=1: gensim documents that multi-threaded training introduces
#   ordering jitter, so a fixed seed alone is not reproducible with workers > 1.
#   NOTE(review): reproducibility across interpreter launches may additionally
#   require setting PYTHONHASHSEED -- confirm against the gensim docs.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=W2V_SEED,
)

# Get vector for a specific word
vector = model.wv['foxes']
print("Vector for 'foxes':\n", vector)

# Find most similar words
# NOTE(review): with a corpus this small the similarities are close to the
# random initialisation and are illustrative only.
similar = model.wv.most_similar('foxes')
print("Most similar to 'foxes':", similar)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Vector for 'foxes':
 [ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03
 -7