Apply the bag-of-words approach (raw occurrence counts and normalized occurrence counts) and TF-IDF to the data.
Then create word embeddings using Word2Vec.

In [10]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [11]:
# Fetch NLTK's "punkt" tokenizer data so that word_tokenize (imported above) can run.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource instead —
# confirm against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Vishal
[nltk_data]     Pattar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Toy corpus: five short English sentences shared by every vectorizer and
# embedding cell in this notebook.
# NOTE(review): "captial" in the second sentence looks like a typo for "capital";
# as written it becomes its own vocabulary entry, separate from "capital".
# "health" in the third sentence is presumably meant to be "healthy".
# Both are left untouched here because the recorded outputs depend on them —
# confirm before correcting.
corpus = [
    "New Delhi is the capital city of India.",
    "Mumbai is the Financial captial of India.",
    "Early to bed, early to rise, makes a man health, wealthy and wise.",
    "Failure is the stepping stone to success or every successful person once failed.",
    "Be Honest to yourself, the world is yours",
]

In [13]:
# Bag-of-words with raw counts: learn the vocabulary from the corpus and encode
# each sentence as one row of per-term occurrence counts.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(corpus)

# Report the learned vocabulary and the dense count matrix, each preceded by a
# 100-character rule.
print("--- Bag of Word (Count) ---")
for _label, _values in (
    ("Vocabulury:\n", count_vectorizer.get_feature_names_out()),
    ("BoW Matrix (Count):\n", bow_matrix.toarray()),
):
    print("-" * 100)
    print(_label, _values)

--- Bag of Word (Count) ---
----------------------------------------------------------------------------------------------------
Vocabulury:
 ['and' 'be' 'bed' 'capital' 'captial' 'city' 'delhi' 'early' 'every'
 'failed' 'failure' 'financial' 'health' 'honest' 'india' 'is' 'makes'
 'man' 'mumbai' 'new' 'of' 'once' 'or' 'person' 'rise' 'stepping' 'stone'
 'success' 'successful' 'the' 'to' 'wealthy' 'wise' 'world' 'yours'
 'yourself']
----------------------------------------------------------------------------------------------------
BoW Matrix (Count):
 [[0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 2 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 2 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1]]


In [14]:
# Normalized bag-of-words: convert each sentence's raw counts into relative
# term frequencies, so every non-empty row sums to 1.
normalized_bow = bow_matrix.toarray().astype(float)
row_totals = normalized_bow.sum(axis=1, keepdims=True)
# A document with no in-vocabulary tokens has a row total of 0; dividing by it
# would fill that row with NaN and emit a RuntimeWarning. Divide by 1 instead so
# such a row simply stays all-zero.
row_totals[row_totals == 0] = 1.0
normalized_bow /= row_totals

print("--- Bag of Word (Normalized Count) ---")
print("-"*100)
print("BoW Matrix (Normalized Count):\n", normalized_bow)

--- Bag of Word (Normalized Count) ---
----------------------------------------------------------------------------------------------------
BoW Matrix (Normalized Count):
 [[0.         0.         0.         0.125      0.         0.125
  0.125      0.         0.         0.         0.         0.
  0.         0.         0.125      0.125      0.         0.
  0.         0.125      0.125      0.         0.         0.
  0.         0.         0.         0.         0.         0.125
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.14285714 0.
  0.         0.         0.         0.         0.         0.14285714
  0.         0.         0.14285714 0.14285714 0.         0.
  0.14285714 0.         0.14285714 0.         0.         0.
  0.         0.         0.         0.         0.         0.14285714
  0.         0.         0.         0.         0.         0.        ]
 [0.08333333 0.         0.08333333 0.         0.         0.
  0.    

In [15]:
# TF-IDF: learn the vocabulary from the corpus and encode each sentence as one
# row of TF-IDF weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Report the learned vocabulary and the dense TF-IDF matrix, each preceded by a
# 100-character rule.
print("--- TF-IDF ---")
for _label, _values in (
    ("Vocabulury:\n", tfidf_vectorizer.get_feature_names_out()),
    ("TF-IDF Matrix:\n", tfidf_matrix.toarray()),
):
    print("-" * 100)
    print(_label, _values)

--- TF-IDF ---
----------------------------------------------------------------------------------------------------
Vocabulury:
 ['and' 'be' 'bed' 'capital' 'captial' 'city' 'delhi' 'early' 'every'
 'failed' 'failure' 'financial' 'health' 'honest' 'india' 'is' 'makes'
 'man' 'mumbai' 'new' 'of' 'once' 'or' 'person' 'rise' 'stepping' 'stone'
 'success' 'successful' 'the' 'to' 'wealthy' 'wise' 'world' 'yours'
 'yourself']
----------------------------------------------------------------------------------------------------
TF-IDF Matrix:
 [[0.         0.         0.         0.41042134 0.         0.41042134
  0.41042134 0.         0.         0.         0.         0.
  0.         0.         0.33112535 0.23122423 0.         0.
  0.         0.41042134 0.33112535 0.         0.         0.
  0.         0.         0.         0.         0.         0.23122423
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.45007472 0.
  0.         0

In [16]:
# Lowercase every sentence and tokenize it with NLTK's word_tokenize, giving one
# token list per sentence.
tokenized_corpus = [word_tokenize(text) for text in map(str.lower, corpus)]

In [17]:
# Train a Word2Vec model on the tokenized sentences.
#   window=5         context window size passed to gensim
#   min_count=1      keep every token, even those that occur only once
#   vector_size=100  dimensionality of each word vector
#   seed=1           gensim's default seed, spelled out so the choice is visible
#   workers=1        single training thread: gensim documents that multi-threaded
#                    training is not deterministic, and a five-sentence corpus
#                    gains nothing from extra threads
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches — confirm whether that matters for this setup.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    window=5,
    min_count=1,
    vector_size=100,
    seed=1,
    workers=1,
)

In [18]:
# Look up a single token in the trained model: report a message when the token
# is not in the vocabulary, otherwise print its vector followed by its most
# similar tokens.
word = "capital"
if word not in w2v_model.wv:
    print("Word doesn't exists")
else:
    print(w2v_model.wv[word])
    print(w2v_model.wv.most_similar(word))

[ 1.30016683e-03 -9.80430283e-03  4.58776252e-03 -5.38222783e-04
  6.33209571e-03  1.78347470e-03 -3.12979822e-03  7.75997294e-03
  1.55466562e-03  5.52093989e-05 -4.61295387e-03 -8.45352374e-03
 -7.76683213e-03  8.67050979e-03 -8.92496016e-03  9.03471559e-03
 -9.28101782e-03 -2.76756298e-04 -1.90704700e-03 -8.93114600e-03
  8.63005966e-03  6.77781366e-03  3.01943906e-03  4.83345287e-03
  1.12190246e-04  9.42468084e-03  7.02128746e-03 -9.85372625e-03
 -4.43322072e-03 -1.29011157e-03  3.04772262e-03 -4.32395237e-03
  1.44916656e-03 -7.84589909e-03  2.77807354e-03  4.70269192e-03
  4.93731257e-03 -3.17570218e-03 -8.42704065e-03 -9.22061782e-03
 -7.22899451e-04 -7.32746487e-03 -6.81496272e-03  6.12000562e-03
  7.17230327e-03  2.11741915e-03 -7.89940078e-03 -5.69898821e-03
  8.05184525e-03  3.92084382e-03 -5.24047017e-03 -7.39190448e-03
  7.71554711e-04  3.46375466e-03  2.07919348e-03  3.10080405e-03
 -5.62050007e-03 -9.88948625e-03 -7.02083716e-03  2.30308768e-04
  4.61867917e-03  4.52630