# NLP Assignment 2 - Applying the bag-of-words approach and creating word embeddings for text data

In [1]:
!pip install gensim nltk scikit-learn

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
import numpy as np
import nltk
# Fetch the NLTK 'punkt' tokenizer data. As the last expression in the cell,
# the call's return value is displayed as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Small example corpus: three English sentences reused by the vectorizer,
# tokenization, and sentence-embedding cells.
sentences = [
    "This is an example of natural language processing",
    "Tokenization is the process of breaking down words into different tokens",
    "Tokenization is essential for creating embeddings and understanding the context for the computer"
]

# 1. Count Vectorizer (Bag Of Words Approach)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus, then encode every sentence as a row
# of raw term counts (one column per vocabulary term).
count_vectorizer = CountVectorizer().fit(sentences)
bow_counts = count_vectorizer.transform(sentences)

# Heading and learned terms, one per line.
print("Vocabulary:", count_vectorizer.get_feature_names_out(), sep="\n")

# Heading and the dense document-term count matrix.
print("\nBag of Words (Count Matrix):", bow_counts.toarray(), sep="\n")

Vocabulary:
['an' 'and' 'breaking' 'computer' 'context' 'creating' 'different' 'down'
 'embeddings' 'essential' 'example' 'for' 'into' 'is' 'language' 'natural'
 'of' 'process' 'processing' 'the' 'this' 'tokenization' 'tokens'
 'understanding' 'words']

Bag of Words (Count Matrix):
[[1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0]
 [0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 1 0 1]
 [0 1 0 1 1 1 0 0 1 1 0 2 0 1 0 0 0 0 0 2 0 1 0 1 0]]


# 2. Normalized Bag of Words Matrix

In [6]:
# Densify the counts as floats so the division below yields fractions.
bow_array = bow_counts.toarray().astype(float)

# Per-document token totals, kept as a column so they broadcast across rows.
row_totals = bow_array.sum(axis=1, keepdims=True)
# A document with no in-vocabulary tokens has a total of 0; dividing by 1
# instead leaves its row as all zeros rather than NaN.
row_totals[row_totals == 0] = 1.0

# Each row now holds relative term frequencies.
normalized_bow = bow_array / row_totals

print("Normalized Bag of Words:")
print(normalized_bow)

Normalized Bag of Words:
[[0.125      0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.125      0.
  0.         0.125      0.125      0.125      0.125      0.
  0.125      0.         0.125      0.         0.         0.
  0.        ]
 [0.         0.         0.09090909 0.         0.         0.
  0.09090909 0.09090909 0.         0.         0.         0.
  0.09090909 0.09090909 0.         0.         0.09090909 0.09090909
  0.         0.09090909 0.         0.09090909 0.09090909 0.
  0.09090909]
 [0.         0.07692308 0.         0.07692308 0.07692308 0.07692308
  0.         0.         0.07692308 0.07692308 0.         0.15384615
  0.         0.07692308 0.         0.         0.         0.
  0.         0.15384615 0.         0.07692308 0.         0.07692308
  0.        ]]


# 3. TF-IDF Vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus, then encode every
# sentence as a row of TF-IDF weights.
tfidf_vectorizer = TfidfVectorizer().fit(sentences)
tfidf_matrix = tfidf_vectorizer.transform(sentences)

# Heading and learned terms, one per line.
print("TF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out(), sep="\n")

# Heading and the dense TF-IDF matrix.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

TF-IDF Vocabulary:
['an' 'and' 'breaking' 'computer' 'context' 'creating' 'different' 'down'
 'embeddings' 'essential' 'example' 'for' 'into' 'is' 'language' 'natural'
 'of' 'process' 'processing' 'the' 'this' 'tokenization' 'tokens'
 'understanding' 'words']

TF-IDF Matrix:
[[0.37994462 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37994462 0.
  0.         0.22440141 0.37994462 0.37994462 0.28895767 0.
  0.37994462 0.         0.37994462 0.         0.         0.
  0.        ]
 [0.         0.         0.33178811 0.         0.         0.
  0.33178811 0.33178811 0.         0.         0.         0.
  0.33178811 0.1959594  0.         0.         0.25233341 0.33178811
  0.         0.25233341 0.         0.25233341 0.33178811 0.
  0.33178811]
 [0.         0.2649918  0.         0.2649918  0.2649918  0.2649918
  0.         0.         0.2649918  0.2649918  0.         0.52998359
  0.         0.15650842 0.         0.         0.         0.
  0.         

# 4. Tokenization using NLTK (punkt_tab)

In [9]:
from nltk.tokenize import word_tokenize
import nltk
# Download the 'punkt_tab' tokenizer data.
nltk.download('punkt_tab')

# Lower-case each sentence, then split it into word tokens.
tokenized_sentences = list(
    map(word_tokenize, (text.lower() for text in sentences))
)

print(tokenized_sentences)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[['this', 'is', 'an', 'example', 'of', 'natural', 'language', 'processing'], ['tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'words', 'into', 'different', 'tokens'], ['tokenization', 'is', 'essential', 'for', 'creating', 'embeddings', 'and', 'understanding', 'the', 'context', 'for', 'the', 'computer']]


# 5. Training a Word2Vec Model

In [10]:
from gensim.models import Word2Vec

# Train on a single worker thread with an explicit seed so the learned vectors
# are repeatable from run to run; with several workers, thread scheduling
# changes the order of updates. (For reproducibility across interpreter
# launches, PYTHONHASHSEED must also be fixed.)
w2v_model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # maximum distance between a word and its context words
    min_count=1,      # keep every word, even those seen only once
    seed=1,           # gensim's default seed, made explicit
    workers=1,
)

In [11]:
# Look up the trained vector for 'language' and print it under its heading.
print("Embedding for word 'language':", w2v_model.wv['language'], sep="\n")

Embedding for word 'language':
[ 0.00257044  0.00084246 -0.00254006  0.00936029  0.00275908  0.00409472
 -0.0011834   0.0009081   0.00662517 -0.00072785  0.00334232 -0.00067519
  0.00524809  0.00364064  0.00257861 -0.00530791 -0.00471027  0.00430493
 -0.00590659 -0.00018213 -0.00063406  0.00349599 -0.00844565  0.008818
 -0.00144616 -0.00533231  0.0040541  -0.00193368 -0.00776703 -0.00449852
 -0.00038823 -0.00894646  0.00057242  0.00244541 -0.00322876  0.00257358
  0.00248184  0.00998698  0.00143067  0.00201974  0.00277918 -0.0020759
 -0.00870117  0.00802288 -0.00197471 -0.00969043 -0.00655291 -0.00394493
  0.00395548  0.00504247  0.00608555 -0.00676943  0.00068828 -0.00277378
 -0.00520646  0.0069785   0.00394911 -0.0031087  -0.00827575 -0.00514036
 -0.00065048  0.00781425  0.00604725 -0.00845304 -0.00956594  0.00713241
 -0.00233162 -0.00369382  0.00574952 -0.00584621  0.0050972  -0.0002393
 -0.0068743  -0.00033539  0.0063573   0.00929013  0.00222292  0.00504594
 -0.0049726  -0.0007973 

# 6. Sentence Embeddings

In [13]:
def sentence_embedding(sentence: str, model) -> np.ndarray:
    """Average the word vectors of a sentence into one fixed-size vector.

    The sentence is lower-cased and tokenized with ``word_tokenize``; tokens
    that are not in the model's vocabulary are skipped.

    Args:
        sentence: Raw text to embed.
        model: Trained model exposing its word vectors as ``model.wv``
            (supports ``in``, indexing by word, and ``vector_size``).

    Returns:
        A 1-D array of length ``model.wv.vector_size``: the mean of the
        vectors of the known tokens, or all zeros when the sentence has no
        token in the vocabulary (including the empty string).
    """
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # np.mean of an empty list would give a NaN scalar plus a
        # RuntimeWarning; return a neutral vector of the right shape instead.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed the first example sentence and print the vector under its heading.
print("Sentence Embedding:", sentence_embedding(sentences[0], w2v_model), sep="\n")

Sentence Embedding:
[-1.7456166e-03  1.8554007e-03  2.5204441e-03  2.9984203e-03
 -2.6371318e-03  3.5544392e-05  5.1276397e-04  1.4629253e-03
 -5.5768737e-04 -2.1241603e-03 -1.0773984e-03 -1.5179085e-04
  1.6350774e-03  2.7995061e-03  1.1911661e-03  1.2271837e-03
  2.2056834e-03  1.1445445e-03 -6.3199922e-04 -1.4179916e-03
  7.3126139e-05 -3.3079641e-04 -1.9750257e-03  1.3939199e-03
 -1.5637996e-03  2.6480285e-03 -1.7499253e-03  3.4582103e-04
  9.6437754e-04  6.4160209e-04  4.0156916e-03 -1.1038890e-03
 -5.6187366e-04 -3.4021945e-03  1.2099890e-03 -1.0483969e-03
  2.7857623e-03 -6.0807954e-04 -4.9243827e-04  2.8416496e-03
  2.8031536e-03 -8.3858310e-04 -1.5831259e-03  4.4402189e-04
 -3.4977711e-04 -4.0636589e-03  8.0804812e-04 -2.3284815e-03
  1.0986961e-03 -8.3142734e-04  4.2281114e-03 -5.8895443e-04
 -1.0770948e-03  1.2687740e-03 -9.7300851e-04  4.0142341e-03
  2.2083516e-03  9.7076758e-04  4.7949149e-04 -9.7748148e-04
 -1.5888590e-03  1.1717086e-04 -4.1859673e-04  1.3065455e-03
 -1.