In [1]:
!pip install nltk gensim scikit-learn


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
import nltk
# Download the 'punkt_tab' tokenizer data into the local nltk_data directory.
# This is the last expression of the cell, so the value returned by the call
# is what the cell displays as its result.
# NOTE(review): none of the cells visible here call an NLTK tokenizer (the
# Word2Vec cell splits on whitespace) — confirm this download is still needed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
!pip install scikit-learn gensim




In [7]:
# Toy corpus: one document per line of the block below. `splitlines()` turns
# it into a plain list of three strings (no trailing newline characters).
corpus = """\
NLTK is a powerful NLP library
I am learning text representation techniques
Bag of Words TF IDF and Word2Vec are important
""".splitlines()


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus, then encode every document as a row
# of raw term counts (one column per vocabulary term).
count_vectorizer = CountVectorizer().fit(corpus)
bow = count_vectorizer.transform(corpus)

# Print the dense count matrix, then the term that each column stands for.
# In the recorded output the vocabulary is lowercased and contains no
# single-character words ("a", "I").
vocabulary_terms = count_vectorizer.get_feature_names_out()
print("Bag-of-Words (Count Occurrence):", bow.toarray(), sep="\n")
print("Vocabulary:", vocabulary_terms)


Bag-of-Words (Count Occurrence):
[[0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0]
 [0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1]]
Vocabulary: ['am' 'and' 'are' 'bag' 'idf' 'important' 'is' 'learning' 'library' 'nlp'
 'nltk' 'of' 'powerful' 'representation' 'techniques' 'text' 'tf'
 'word2vec' 'words']


In [9]:
import numpy as np

# Turn raw counts into per-document relative frequencies: every row is divided
# by its own total, so the entries of a non-empty document sum to 1.
bow_array = bow.toarray().astype(float)
row_totals = bow_array.sum(axis=1, keepdims=True)

# A document with no in-vocabulary tokens has a row total of 0. A plain
# division would then produce NaN (and a RuntimeWarning), so the division is
# only performed where the total is non-zero; such rows stay all zeros.
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros_like(bow_array),
    where=row_totals != 0,
)

print("\nNormalized Bag-of-Words:")
print(np.round(normalized_bow, 3))



Normalized Bag-of-Words:
[[0.    0.    0.    0.    0.    0.    0.2   0.    0.2   0.2   0.2   0.
  0.2   0.    0.    0.    0.    0.    0.   ]
 [0.2   0.    0.    0.    0.    0.    0.    0.2   0.    0.    0.    0.
  0.    0.2   0.2   0.2   0.    0.    0.   ]
 [0.    0.111 0.111 0.111 0.111 0.111 0.    0.    0.    0.    0.    0.111
  0.    0.    0.    0.    0.111 0.111 0.111]]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus and encode the documents in a single step.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus)

# Densify and round to three decimals purely for display.
# In the recorded output all non-zero weights within a row are equal, which is
# consistent with every term occurring in exactly one document of this corpus.
tfidf_dense = tfidf.toarray().round(3)
print("\nTF-IDF Matrix:", tfidf_dense, sep="\n")
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())



TF-IDF Matrix:
[[0.    0.    0.    0.    0.    0.    0.447 0.    0.447 0.447 0.447 0.
  0.447 0.    0.    0.    0.    0.    0.   ]
 [0.447 0.    0.    0.    0.    0.    0.    0.447 0.    0.    0.    0.
  0.    0.447 0.447 0.447 0.    0.    0.   ]
 [0.    0.333 0.333 0.333 0.333 0.333 0.    0.    0.    0.    0.    0.333
  0.    0.    0.    0.    0.333 0.333 0.333]]
Vocabulary: ['am' 'and' 'are' 'bag' 'idf' 'important' 'is' 'learning' 'library' 'nlp'
 'nltk' 'of' 'powerful' 'representation' 'techniques' 'text' 'tf'
 'word2vec' 'words']


In [11]:
from gensim.models import Word2Vec

# Manual tokenization (simple & safe): lowercase each document and split it
# on whitespace, giving one list of tokens per sentence.
tokenized_corpus = [doc.lower().split() for doc in corpus]

# Word2Vec training is stochastic. A fixed seed combined with a single worker
# thread makes the learned vectors repeatable from run to run; with several
# workers the thread scheduling order alone can change the result.
# NOTE(review): repeatability across separate interpreter launches may also
# require a fixed PYTHONHASHSEED — see the gensim Word2Vec documentation.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,  # length of each word vector
    window=3,        # context words considered on each side of the target
    min_count=1,     # keep every word, even those seen only once
    seed=42,
    workers=1
)

# The corpus has only three sentences, so treat the vector and the similarity
# ranking below as an API illustration rather than meaningful semantics.
print("\nWord2Vec Embedding for 'nlp':")
print(word2vec_model.wv['nlp'])

print("\nMost similar words to 'nlp':")
print(word2vec_model.wv.most_similar('nlp'))



Word2Vec Embedding for 'nlp':
[-0.01915722  0.01788826  0.00832905  0.01847278  0.01328937  0.00584818
  0.01961174 -0.00885091 -0.01360808  0.00845425  0.00745706 -0.01132975
  0.01941221 -0.00711909  0.01910025  0.00166836 -0.01267625 -0.00395654
 -0.01475527 -0.00595841  0.002083    0.01896731  0.01871257 -0.01319258
  0.00695181  0.00455113 -0.00498086 -0.01845914  0.00205392 -0.0163314
  0.01264296 -0.01160119  0.01107371  0.0196688  -0.00032035  0.00905833
 -0.00362015  0.014721    0.00787927 -0.01802202 -0.00479789  0.00725905
 -0.0001975  -0.00240403 -0.00211069 -0.00334339  0.00121038  0.00833191
 -0.00850722 -0.00766713]

Most similar words to 'nlp':
[('are', 0.22979341447353363), ('text', 0.2258988618850708), ('is', 0.16492752730846405), ('idf', 0.1274467259645462), ('word2vec', 0.10518769174814224), ('library', 0.09495434910058975), ('important', 0.05936971306800842), ('powerful', 0.05487537756562233), ('representation', 0.051359690725803375), ('nltk', 0.050663236528635025