In [2]:
# Ignore  the warnings
import warnings

# Suppress every warning for the rest of the kernel session.
# Filters are matched newest-first, so a single "ignore" filter is all that is
# needed; an "always" filter registered before it would never be consulted.
warnings.filterwarnings("ignore")

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Plot styling: apply matplotlib's "fivethirtyeight" style sheet first, then
# apply seaborn's "whitegrid" theme (with seaborn colour codes) after it.
plt.style.use("fivethirtyeight")
sns.set_theme(color_codes=True, style="whitegrid")

import nltk

In [3]:
# Fetch only the NLTK resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls a "Restart & Run All".
# "stopwords" backs the stop-word list; "punkt" / "punkt_tab" back word_tokenize
# (newer NLTK releases load "punkt_tab", older ones load "punkt").
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
# stop-words
from nltk.corpus import stopwords

# English stop-words as a set (the corpus reader returns a list).
stop_words = set(stopwords.words("english"))


# tokenizing
from nltk import word_tokenize, sent_tokenize

import tensorflow as tf

2024-10-04 02:39:42.309672: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-04 02:39:42.309748: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-04 02:39:42.341172: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-04 02:39:42.425184: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input
from tensorflow.keras.models import Model

In [7]:
# Toy corpus: three short sentences that share most of their words.
corp = [
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
]

# Keep the individual sentences addressable by name as well.
sample_text_1, sample_text_2, sample_text_3 = corp

# Number of documents in the corpus.
no_docs = len(corp)

In [8]:
# INTEGER ENCODING ALL THE DOCUMENTS

# one_hot() hashes every word into an integer in [1, vocab_size - 1]. It does NOT
# guarantee a unique integer per word: distinct words can collide on the same
# index, and a larger vocab_size only makes such collisions less likely.
# NOTE(review): the hash is Python's built-in hash(), so the integers may differ
# between kernel sessions — confirm against the Keras hashing_trick docs.
vocab_size = 50
encode_corp = []

for i, doc in enumerate(corp):
    # Encode once and reuse the result for both storing and printing.
    encoded_doc = one_hot(doc, vocab_size)
    encode_corp.append(encoded_doc)
    print("The encoding for document", i + 1, " is : ", encoded_doc)

The encoding for document 1  is :  [1, 48, 1, 12, 26, 43]
The encoding for document 2  is :  [13, 4, 12, 26, 43, 15, 1, 12, 45]
The encoding for document 3  is :  [45, 21, 48, 20, 20, 43, 43, 15, 4, 45, 43, 20]


In [11]:
# PADDING THE DOCS (to make every doc the same length)

# The Keras Embedding layer requires all individual documents to be of the same length.
# Hence we will pad the shorter documents with 0 for now.
# Therefore in the Keras Embedding layer the 'input_length' will be equal to the length
# (i.e. number of words) of the document with the maximum number of words.
# To pad the shorter documents I am using the pad_sequences function from the Keras library.

# Show the words of every document (NLTK tokenisation, for illustration only).
for doc in corp:
    tokens = nltk.word_tokenize(doc)
    print(tokens)

# Take the padding length from the sequences that will actually be padded.
# one_hot() and NLTK tokenise differently (e.g. around punctuation), so counting
# NLTK tokens could disagree with the encoded lengths on other corpora.
# default=0 keeps an empty corpus from raising.
maxlen = max((len(encoded_doc) for encoded_doc in encode_corp), default=0)

print("The Maximum number of words in any document is : ", maxlen)

['bitty', 'bought', 'a', 'bit', 'of', 'butter']
['but', 'the', 'bit', 'of', 'butter', 'was', 'a', 'bit', 'bitter']
['so', 'she', 'bought', 'some', 'better', 'butter', 'to', 'make', 'the', 'bitter', 'butter', 'better']
The Maximum number of words in any document is :  12


In [15]:
# All documents must have the same length before they can be embedded, so every
# encoded document is right-padded ("post") with zeros up to maxlen entries.
pad_corp = pad_sequences(encode_corp, maxlen=maxlen, padding="post", value=0.0)
print("No of padded documents: ", len(pad_corp))
for doc_no, padded_doc in enumerate(pad_corp, start=1):
    print("The padded encoding for document", doc_no, " is : ", padded_doc)

No of padded documents:  3
The padded encoding for document 1  is :  [ 1 48  1 12 26 43  0  0  0  0  0  0]
The padded encoding for document 2  is :  [13  4 12 26 43 15  1 12 45  0  0  0]
The padded encoding for document 3  is :  [45 21 48 20 20 43 43 15  4 45 43 20]


In [16]:
# Now all the documents are of same length (after padding). And so now we are ready to create and use the embeddings.
# NOTE(review): `inputs` is not referenced by any later cell visible here; the model
# below is built from `word_input` instead. shape=(no_docs, maxlen) also puts the
# document count inside the per-sample shape (Keras adds the batch axis itself) —
# confirm whether this tensor is still needed.
inputs = Input(shape=(no_docs, maxlen), dtype="float32")

In [18]:
# Shape of the input: each (padded) document holds `maxlen` word indices
# (12 for this corpus). The entries are integer indices into the embedding
# table, so the input is declared as int32 rather than float32.
word_input = Input(shape=(maxlen,), dtype="int32")

# Look up one trainable 8-dimensional vector per word index -> (batch, maxlen, 8).
word_embedding = Embedding(input_dim=vocab_size, output_dim=8, input_length=maxlen)(
    word_input
)

# Concatenate the word vectors of a document into a single row -> (batch, maxlen * 8).
word_vec = Flatten()(word_embedding)

# Model mapping padded index sequences to their flattened embeddings.
embed_model = Model([word_input], word_vec)

2024-10-04 02:58:22.655492: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 02:58:22.799111: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 02:58:22.799359: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 02:58:22.801139: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-04 02:58:22.801385: I external/local_xla/xla/stream_executor

PARAMETERS OF THE EMBEDDING LAYER ---

'input_dim' = the vocabulary size that we choose, i.e. the number of distinct word indices the layer can look up. Every index in the input must be smaller than this value.

'output_dim' = the number of dimensions we wish to embed into. Each word will be represented by a vector with this many dimensions.

'input_length' = the length of the longest document, which is stored in the maxlen variable in our case.

In [19]:
# Attach an optimizer, a loss and a metric to the model. In the cells shown here
# the model is only used through predict(), so these settings are not exercised.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
embed_model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=adam_optimizer,
)

In [20]:
# Inspect the symbolic output of the Embedding layer: first its type, then its repr.
for shown in (type(word_embedding), word_embedding):
    print(shown)

<class 'keras.src.engine.keras_tensor.KerasTensor'>
KerasTensor(type_spec=TensorSpec(shape=(None, 12, 8), dtype=tf.float32, name=None), name='embedding/embedding_lookup/Identity:0', description="created by layer 'embedding'")


In [21]:
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 12)]              0         
                                                                 
 embedding (Embedding)       (None, 12, 8)             400       
                                                                 
 flatten (Flatten)           (None, 96)                0         
                                                                 
Total params: 400 (1.56 KB)
Trainable params: 400 (1.56 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [22]:
# Run the padded documents through the model to obtain their flattened embeddings.
embeddings = embed_model.predict(x=pad_corp)



In [23]:
# One flattened row per document: report the array's shape, then its values.
flat_shape = embeddings.shape
print("Shape of embeddings : ", flat_shape)
print(embeddings)

Shape of embeddings :  (3, 96)
[[ 0.03554154 -0.03450155  0.00236611  0.02651289  0.0381381   0.02427639
   0.03229351 -0.01415057 -0.01682829  0.00953909  0.04553615  0.02884376
  -0.00994335  0.04756803 -0.0150812  -0.0140302   0.03554154 -0.03450155
   0.00236611  0.02651289  0.0381381   0.02427639  0.03229351 -0.01415057
   0.04744269  0.01025765 -0.00566924 -0.00685807 -0.02889766  0.03916797
   0.02353359 -0.03656213 -0.02294981 -0.00203184 -0.00591069 -0.02385206
   0.03165257  0.02598954 -0.00122378 -0.02945955  0.00985926 -0.03045031
  -0.00322922 -0.02833431  0.00508649  0.0366048   0.04135503  0.00198681
   0.01818998  0.00220318  0.03916449  0.01319063  0.04604201  0.02142968
   0.01267872 -0.03109838  0.01818998  0.00220318  0.03916449  0.01319063
   0.04604201  0.02142968  0.01267872 -0.03109838  0.01818998  0.00220318
   0.03916449  0.01319063  0.04604201  0.02142968  0.01267872 -0.03109838
   0.01818998  0.00220318  0.03916449  0.01319063  0.04604201  0.02142968
   0.01

In [25]:
# Undo the Flatten step: split each document's row back into one vector per word.
# The vector width is read from the embedding tensor instead of being hard-coded,
# so it stays correct if output_dim of the Embedding layer is changed.
embedding_dim = word_embedding.shape[-1]
embeddings = embeddings.reshape(-1, maxlen, embedding_dim)
print("Shape of embeddings : ", embeddings.shape)
print(embeddings)

# The resulting shape is (3,12,8).

# 3---> number of documents

# 12---> each document is made of 12 words, which was the maximum length of any document.

# & 8---> each word is 8-dimensional.

Shape of embeddings :  (3, 12, 8)
[[[ 0.03554154 -0.03450155  0.00236611  0.02651289  0.0381381
    0.02427639  0.03229351 -0.01415057]
  [-0.01682829  0.00953909  0.04553615  0.02884376 -0.00994335
    0.04756803 -0.0150812  -0.0140302 ]
  [ 0.03554154 -0.03450155  0.00236611  0.02651289  0.0381381
    0.02427639  0.03229351 -0.01415057]
  [ 0.04744269  0.01025765 -0.00566924 -0.00685807 -0.02889766
    0.03916797  0.02353359 -0.03656213]
  [-0.02294981 -0.00203184 -0.00591069 -0.02385206  0.03165257
    0.02598954 -0.00122378 -0.02945955]
  [ 0.00985926 -0.03045031 -0.00322922 -0.02833431  0.00508649
    0.0366048   0.04135503  0.00198681]
  [ 0.01818998  0.00220318  0.03916449  0.01319063  0.04604201
    0.02142968  0.01267872 -0.03109838]
  [ 0.01818998  0.00220318  0.03916449  0.01319063  0.04604201
    0.02142968  0.01267872 -0.03109838]
  [ 0.01818998  0.00220318  0.03916449  0.01319063  0.04604201
    0.02142968  0.01267872 -0.03109838]
  [ 0.01818998  0.00220318  0.03916449  0

In [26]:
# Walk the (documents, words, vector) array and print every word vector,
# labelled with its 1-based word and document position.
for doc_no, doc_vectors in enumerate(embeddings, start=1):
    for word_no, word_vector in enumerate(doc_vectors, start=1):
        print(
            "The encoding for ",
            word_no,
            "th word",
            "in",
            doc_no,
            "th document is : \n\n",
            word_vector,
        )

# This makes it easier to see that we have 3 (size of corp) documents, each consisting
# of 12 (maxlen) words, with each word mapped to an 8-dimensional vector.

The encoding for  1 th word in 1 th document is : 

 [ 0.03554154 -0.03450155  0.00236611  0.02651289  0.0381381   0.02427639
  0.03229351 -0.01415057]
The encoding for  2 th word in 1 th document is : 

 [-0.01682829  0.00953909  0.04553615  0.02884376 -0.00994335  0.04756803
 -0.0150812  -0.0140302 ]
The encoding for  3 th word in 1 th document is : 

 [ 0.03554154 -0.03450155  0.00236611  0.02651289  0.0381381   0.02427639
  0.03229351 -0.01415057]
The encoding for  4 th word in 1 th document is : 

 [ 0.04744269  0.01025765 -0.00566924 -0.00685807 -0.02889766  0.03916797
  0.02353359 -0.03656213]
The encoding for  5 th word in 1 th document is : 

 [-0.02294981 -0.00203184 -0.00591069 -0.02385206  0.03165257  0.02598954
 -0.00122378 -0.02945955]
The encoding for  6 th word in 1 th document is : 

 [ 0.00985926 -0.03045031 -0.00322922 -0.02833431  0.00508649  0.0366048
  0.04135503  0.00198681]
The encoding for  7 th word in 1 th document is : 

 [ 0.01818998  0.00220318  0.03916449