## Text Representation in NLP

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Vocabulary: the words to encode
vocabulary = ["apple", "mango", "orange", "banana", "grape"]

# OneHotEncoder expects 2-D input (n_samples, n_features), so turn the flat
# word list into a single-column array with one word per row.
data = np.asarray(vocabulary)[:, np.newaxis]

# One-Hot Encoding
# sparse_output=False yields a dense array that prints directly.
# Rows follow `vocabulary` order, but columns follow the encoder's learned
# categories, which are sorted (apple, banana, grape, mango, orange).
encoder = OneHotEncoder(sparse_output=False)
one_hot = encoder.fit(data).transform(data)

print(f"Vocabulary: {vocabulary}")
print(f"One-Hot Encoded Vectors:\n {one_hot}")

Vocabulary: ['apple', 'mango', 'orange', 'banana', 'grape']
One-Hot Encoded Vectors:
 [[1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Two short example documents that make up the corpus
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog sleeps in the kernel",
]


# Create BoW Model
# Fitting learns the vocabulary from the corpus; the result holds one row per
# document and one column per vocabulary term, with raw term counts as values.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(documents)

# The i-th feature name labels the i-th column of the count matrix.
bow_feature_names = vectorizer.get_feature_names_out()
bow_dense = bow.toarray()  # densify the sparse matrix for display only
print(f"Vocabulary: {bow_feature_names}")
print(f"Bag of Words Representation:\n {bow_dense}")

Vocabulary: ['brown' 'dog' 'fox' 'in' 'jumps' 'kernel' 'lazy' 'over' 'quick' 'sleeps'
 'the']
Bag of Words Representation:
 [[1 1 1 0 1 0 1 1 1 0 2]
 [0 1 0 1 0 1 0 0 0 1 2]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Same two-document corpus as the bag-of-words example, redefined here so the
# cell can run on its own.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog sleeps in the kernel",
]

# Create TF-IDF Model
# Like the count-based model, but each count is reweighted so that terms that
# appear in every document (e.g. "the", "dog") carry less weight than terms
# unique to one document.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(documents)

# The i-th feature name labels the i-th column of the TF-IDF matrix.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf.toarray()  # densify the sparse matrix for display only
print(f"Vocabulary: {tfidf_feature_names}")
print(f"TF-IDF Representation:\n {tfidf_dense}")

Vocabulary: ['brown' 'dog' 'fox' 'in' 'jumps' 'kernel' 'lazy' 'over' 'quick' 'sleeps'
 'the']
TF-IDF Representation:
 [[0.342369   0.24359836 0.342369   0.         0.342369   0.
  0.342369   0.342369   0.342369   0.         0.48719673]
 [0.         0.30253071 0.         0.42519636 0.         0.42519636
  0.         0.         0.         0.42519636 0.60506143]]


In [9]:

from gensim.models import Word2Vec

# Toy corpus: each inner list is one already-tokenized sentence.
data = [
    ["the", "lion", "roars"],
    ["the", "sheep", "bleats"],
    ["the", "tiger", "growls"],
]

# Train Word2Vec Model
# Word2Vec training is stochastic (random initialisation and sampling), so the
# printed embedding would change on every run. Fixing `seed` and training with
# a single worker thread removes the randomness and the thread-scheduling
# jitter. For identical results across separate interpreter launches,
# PYTHONHASHSEED must also be fixed in the environment.
# vector_size=10 keeps the printed vector short; min_count=1 keeps every word,
# since each non-"the" word occurs only once in this tiny corpus.
word2vec_model = Word2Vec(
    data,
    vector_size=10,
    window=2,
    min_count=1,
    workers=1,
    seed=42,
)

# Get Embedding for a Word
print("Word Embedding for 'lion':\n", word2vec_model.wv['lion'])


Word Embedding for 'lion':
 [-0.01577653  0.00321372 -0.0414063  -0.07682689 -0.01508008  0.02469795
 -0.00888027  0.05533662 -0.02742977  0.02260065]


In [10]:
import gensim.downloader as api

# Load Pretrained GloVe Model
# NOTE: gensim's downloader fetches the model on first use and caches it
# locally, so the first run of this cell needs network access.
glove_model = api.load("glove-wiki-gigaword-50")  # 50-dimensional embeddings

# Get Embedding for a Word
print("Word Embedding for 'king':\n", glove_model['king'])

# Find Similar Words
# Run the nearest-neighbour query once and reuse the result. topn=10 matches
# most_similar's default, and results are ordered by decreasing similarity,
# so the first five entries are exactly the top-5 neighbours.
nearest_to_king = glove_model.most_similar('king', topn=10)
print("Words similar to 'king':\n", nearest_to_king)

similar_words = nearest_to_king[:5]
print("\nTop 5 words similar to 'king':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Word Embedding for 'king':
 [ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]
Words similar to 'king':
 [('prince', 0.8236179351806641), ('queen', 0.7839043140411377), ('ii', 0.7746230363845825), ('emperor', 0.7736247777938843), ('son', 0.766719400882721), ('uncle', 0.7627150416374207), ('kingdom', 0.7542160749435425), ('throne', 0.7539913654327393), ('brother', 0.7492411136627197), ('ruler', 0.7434253692626953)]

Top 5 words similar to 'king':
prince: 0.8236
queen: 0.7839
ii: 0.7746
emperor: 0.7736
son: 0.7667
