In [1]:
documents = [
    "I love machine learning",
    "Machine learning is fun",
    "I love NLP",
    "NLP is a part of machine learning"
]


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object
count_vectorizer = CountVectorizer()

# Fit and transform the documents
bow_counts = count_vectorizer.fit_transform(documents)

# Convert to array
bow_array = bow_counts.toarray()

# Vocabulary
print("Vocabulary:", count_vectorizer.get_feature_names_out())

print("\nBag of Words - Count Occurrence:") #counts how many times a word occur in document
print(bow_array)


Vocabulary: ['fun' 'is' 'learning' 'love' 'machine' 'nlp' 'of' 'part']

Bag of Words - Count Occurrence:
[[0 0 1 1 1 0 0 0]
 [1 1 1 0 1 0 0 0]
 [0 0 0 1 0 1 0 0]
 [0 1 1 0 1 1 1 1]]


In [3]:
from sklearn.preprocessing import normalize

# Normalize word counts: Converts raw counts into relative frequencies
bow_normalized = normalize(bow_array, norm='l1')

print("\nBag of Words - Normalized Count:")
print(bow_normalized)



Bag of Words - Normalized Count:
[[0.         0.         0.33333333 0.33333333 0.33333333 0.
  0.         0.        ]
 [0.25       0.25       0.25       0.         0.25       0.
  0.         0.        ]
 [0.         0.         0.         0.5        0.         0.5
  0.         0.        ]
 [0.         0.16666667 0.16666667 0.         0.16666667 0.16666667
  0.16666667 0.16666667]]


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("\nTF-IDF Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())



TF-IDF Vocabulary:
['fun' 'is' 'learning' 'love' 'machine' 'nlp' 'of' 'part']

TF-IDF Matrix:
[[0.         0.         0.53256952 0.65782931 0.53256952 0.
  0.         0.        ]
 [0.64065543 0.5051001  0.40892206 0.         0.40892206 0.
  0.         0.        ]
 [0.         0.         0.         0.70710678 0.         0.70710678
  0.         0.        ]
 [0.         0.39137817 0.31685436 0.         0.31685436 0.39137817
  0.49641358 0.49641358]]


In [6]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [8]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download tokenizer
nltk.download('punkt')

# Tokenize sentences
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Train Word2Vec model
word2vec_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Get vector for a word
print("\nWord2Vec embedding for word 'machine':")
print(word2vec_model.wv['machine'])



Word2Vec embedding for word 'machine':
[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
