<a href="https://colab.research.google.com/github/SAHIL9581/LIVE-AI-CLASSES/blob/main/wordembedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
def get_dict(words):
    """
    Build the two lookup tables that map a vocabulary to integer ids and back.

    Ids are assigned by position in the sorted collection of distinct tokens,
    so the same corpus always produces the same mapping.

    Args:
        words (list of str): Tokens of the corpus; repeats are allowed.

    Returns:
        tuple: (word2Ind, Ind2word), where word2Ind maps token -> id and
        Ind2word maps id -> token.
    """
    vocabulary = list(set(words))
    vocabulary.sort()  # fixed order, independent of set iteration order

    word2Ind = {}
    Ind2word = {}
    for position, token in enumerate(vocabulary):
        word2Ind[token] = position
        Ind2word[position] = token

    return word2Ind, Ind2word

# Example usage: a five-token list in which "hello" occurs twice
words = ["hello", "world", "hello", "machine", "learning"]
word2Ind, Ind2word = get_dict(words)

# Show both mappings; the repeated token receives a single index
print("word2Ind:", word2Ind)
print("Ind2word:", Ind2word)


word2Ind: {'hello': 0, 'learning': 1, 'machine': 2, 'world': 3}
Ind2word: {0: 'hello', 1: 'learning', 2: 'machine', 3: 'world'}


In [3]:
# Define the tokenized version of the corpus
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

# Define V, the size of the vocabulary. The corpus above has 5 distinct tokens,
# and every weight/bias literal below is written out for exactly that size.
V = 5

# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus
word2Ind, Ind2word = get_dict(words)


# Define first matrix of weights (3 rows x 5 columns)
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights (5 rows x 3 columns)
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases (3 x 1 column vector)
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases (5 x 1 column vector)
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [4]:
# Display W1 (the 3x5 first weight matrix) as the cell's output
W1

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

In [5]:

# Print the word stored under each index 0..V-1 of 'Ind2word'
for i in range(V):
    print(Ind2word[i])

am
because
happy
i
learning


In [6]:
# Loop through each word of the vocabulary (the keys of 'word2Ind')
for word in word2Ind:
    # Column j of W1 is read as the embedding of the word whose index is j
    word_embedding_vector = W1[:, word2Ind[word]]
    # Print word alongside word embedding vector
    print(f'{word}: {word_embedding_vector}')

am: [0.41687358 0.32735501 0.26637602]
because: [ 0.08854191  0.22795148 -0.23846886]
happy: [-0.23495225 -0.23951958 -0.37770863]
i: [ 0.28320538  0.4117634  -0.11399446]
learning: [ 0.41800106 -0.23924344  0.34008124]


In [7]:
# Compute W3 as the element-wise average of W1 and the transpose of W2.
# W2 is 5x3, so W2.T is 3x5 and lines up with W1.
W3 = (W1+W2.T)/2

# Display W3
W3

array([[ 0.09752647,  0.08665397, -0.02389858,  0.1768788 ,  0.3764029 ],
       [-0.05136565,  0.15459171, -0.15029611,  0.19580601, -0.31673866],
       [ 0.19974284, -0.03063173, -0.27839106,  0.12353994, -0.04975536]])

In [8]:
# Loop through each word of the vocabulary (the keys of 'word2Ind')
for word in word2Ind:
    # Column j of the averaged matrix W3 is read as the embedding of the word whose index is j
    word_embedding_vector = W3[:, word2Ind[word]]
    # Print word alongside word embedding vector
    print(f'{word}: {word_embedding_vector}')

am: [ 0.09752647 -0.05136565  0.19974284]
because: [ 0.08665397  0.15459171 -0.03063173]
happy: [-0.02389858 -0.15029611 -0.27839106]
i: [0.1768788  0.19580601 0.12353994]
learning: [ 0.3764029  -0.31673866 -0.04975536]


In [10]:
# Define the size of the word embedding vectors (also the hidden-layer size) and save it in 'N'
N = 3

# Define V, the size of the vocabulary (same value as assigned earlier in this notebook)
V = 5


In [11]:
# The literals in this cell repeat the values given to W1, W2, b1 and b2 earlier
# in this notebook, so the cell resets the parameters to their initial state.

# Define first matrix of weights (N x V = 3 x 5)
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights (V x N = 5 x 3)
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases (N x 1 = 3 x 1)
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases (V x 1 = 5 x 1)
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [12]:
# Report V and N, then each parameter's actual shape next to its expected symbolic shape
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)


In [13]:
# Define the tokenized version of the corpus (7 tokens, 5 of them distinct)
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus;
# the helper functions defined below look words up through 'word2Ind'
word2Ind, Ind2word = get_dict(words)

# Define the 'get_windows' function as seen in a previous notebook
def get_windows(words, C):
    """
    Yield (context_words, center_word) pairs for a sliding window over 'words'.

    Every position that has at least C tokens on each side is used as a center;
    its context is the C tokens before it followed by the C tokens after it.

    Args:
        words (list of str): Tokenized corpus.
        C (int): Number of context tokens taken on each side of the center.

    Yields:
        tuple: (list of context tokens, center token).
    """
    for center in range(C, len(words) - C):
        left_context = words[center - C:center]
        right_context = words[center + 1:center + C + 1]
        yield left_context + right_context, words[center]

# Define the 'word_to_one_hot_vector' function as seen in a previous notebook
def word_to_one_hot_vector(word, word2Ind, V):
    """
    Encode 'word' as a one-hot vector of length V.

    Args:
        word (str): Word to encode; must be a key of 'word2Ind'.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Length of the returned vector (vocabulary size).

    Returns:
        numpy.ndarray: 1-D float array of zeros with a single 1 at the word's index.
    """
    encoding = np.zeros(V)
    position = word2Ind[word]
    encoding[position] = 1
    return encoding

# Define the 'context_words_to_vector' function as seen in a previous notebook
def context_words_to_vector(context_words, word2Ind, V):
    """
    Average the one-hot vectors of the given context words.

    Args:
        context_words (list of str): Words surrounding a center word.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Vocabulary size, i.e. length of each one-hot vector.

    Returns:
        numpy.ndarray: Element-wise mean of the context words' one-hot vectors.
    """
    one_hot_rows = []
    for context_word in context_words:
        one_hot_rows.append(word_to_one_hot_vector(context_word, word2Ind, V))
    # Averaging over axis 0 collapses the list of vectors into a single vector
    return np.mean(one_hot_rows, axis=0)

# Define the generator function 'get_training_example' as seen in a previous notebook
def get_training_example(words, C, word2Ind, V):
    """
    Generate (context vector, center one-hot vector) training pairs.

    Args:
        words (list of str): Tokenized corpus.
        C (int): Number of context words on each side of the center word.
        word2Ind (dict): Mapping from word to its integer index.
        V (int): Vocabulary size.

    Yields:
        tuple: (averaged one-hot vector of the context words,
        one-hot vector of the center word), one pair per window.
    """
    for context, center in get_windows(words, C):
        x_vector = context_words_to_vector(context, word2Ind, V)
        y_vector = word_to_one_hot_vector(center, word2Ind, V)
        yield x_vector, y_vector

In [14]:
# Save the generator object in 'training_examples'; C = 2 context words on each
# side of the center word. Nothing is computed until next() is called on it.
training_examples = get_training_example(words, 2, word2Ind, V)

In [15]:
# Pull the first (context vector, center one-hot vector) pair from the generator
x_array, y_array = next(training_examples)

In [16]:
# Display the context words vector (mean of the context words' one-hot vectors)
x_array

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [17]:
# Display the one-hot vector of the center word
y_array

array([0., 0., 1., 0., 0.])

In [18]:
# Build a (V, 1) column-vector copy of the context vector.
# reshape() is used instead of assigning to the '.shape' attribute, which NumPy
# discourages; 'x_array' itself keeps its 1-D shape because a copy is reshaped.
x = x_array.copy().reshape(V, 1)

# Print it
print(f'x:\n{x}\n')

# Same treatment for the one-hot vector of the center word
y = y_array.copy().reshape(V, 1)

# Print it
print(f'y:\n{y}')

x:
[[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]]

y:
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


In [22]:


# Define the 'relu' function
def relu(z):
    """
    Rectified linear unit, applied element-wise.

    Args:
        z: Scalar or array-like of pre-activation values.

    Returns:
        The element-wise maximum of 0 and 'z'.
    """
    rectified = np.maximum(0, z)
    return rectified

# Define the 'softmax' function
def softmax(z):
    """
    Numerically stable softmax.

    For 0-D and 1-D input the normalisation runs over all entries. For input
    with two or more dimensions it runs along axis 0, so every column of a
    (V, m) matrix becomes its own probability distribution; a (V, 1) column
    vector therefore gives the same result as normalising over all entries.

    Args:
        z: Scalar or array-like of scores.

    Returns:
        numpy.ndarray with the shape of 'z' whose entries are non-negative and
        sum to 1 over the normalised axis.
    """
    z = np.asarray(z, dtype=float)
    # Only matrices get a per-column normalisation; vectors and scalars use all entries
    axis = 0 if z.ndim > 1 else None
    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing
    exp_z = np.exp(z - np.max(z, axis=axis, keepdims=True))
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)


In [25]:
# Compute z1 (values of first hidden layer before applying the ReLU function).
# W1 is NxV, x is Vx1 and b1 is Nx1, so z1 is an Nx1 column vector.
z1 = np.dot(W1, x) + b1

print("z1:", z1)

z1: [[ 0.36483875]
 [ 0.63710329]
 [-0.3236647 ]]


In [24]:
# Compute h (z1 after applying ReLU function): negative entries of z1 become 0
h = relu(z1)

# Display h
h

array([[0.36483875],
       [0.63710329],
       [0.        ]])

In [29]:
# Compute z2 (values of the output layer before applying softmax).
# W2 is VxN, h is Nx1 and b2 is Vx1, so z2 is a Vx1 column vector.
z2 = np.dot(W2, h) + b2
print("z2:", z2)

z2: [[-0.31973737]
 [-0.28125477]
 [-0.09838369]
 [-0.33512159]
 [-0.19919612]]


In [30]:
# Compute y_hat (z2 after applying softmax): the predicted distribution over the vocabulary
y_hat = softmax(z2)

# Print y_hat
print("y_hat:", y_hat)

y_hat: [[0.18519074]
 [0.19245626]
 [0.23107446]
 [0.18236353]
 [0.20891502]]


In [32]:
# Compute prediction: position of the largest entry of y_hat
prediction = np.argmax(y_hat)

# Print prediction
print("Predicted target:", prediction)

Predicted target: 2


In [33]:
# Print the true one-hot target so it can be compared with the predicted index
print("Target value (y):", y)

Target value (y): [[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


In [36]:
def cross_entropy_loss(y_predicted, y_actual):
    """
    Cross-entropy between a predicted distribution and the true distribution.

    Args:
        y_predicted: Array of predicted probabilities.
        y_actual: Array of true probabilities, broadcastable against 'y_predicted'.

    Returns:
        The scalar -sum(y_actual * log(y_predicted)), with the predictions first
        clipped to [1e-10, 1.0] so that log(0) is never evaluated.
    """
    floor = 1e-10
    safe_predictions = np.clip(y_predicted, floor, 1.0)
    log_predictions = np.log(safe_predictions)
    return -np.sum(y_actual * log_predictions)

In [37]:
# Display the cross-entropy loss between the prediction y_hat and the target y
cross_entropy_loss(y_hat, y)

1.4650152923611108

In [38]:
# Compute vector with partial derivatives of loss function with respect to b2 (Vx1)
grad_b2 = y_hat - y  # y_hat is the predicted probability distribution, y is the true one-hot encoded label

# Print the gradient vector
print("grad_b2:", grad_b2)

grad_b2: [[ 0.18519074]
 [ 0.19245626]
 [-0.76892554]
 [ 0.18236353]
 [ 0.20891502]]


In [43]:
# Compute grad_W2 using the outer product of (y_hat - y) and h (the activations of the hidden layer).
# np.outer flattens both column vectors, so the result is a VxN matrix like W2.
grad_W2 = np.outer(y_hat - y, h)

# Print the gradient matrix
print("grad_W2:\n", grad_W2)

grad_W2:
 [[ 0.06756476  0.11798563  0.        ]
 [ 0.0702155   0.12261452  0.        ]
 [-0.28053384 -0.48988499 -0.        ]
 [ 0.06653328  0.1161844   0.        ]
 [ 0.07622029  0.13310045  0.        ]]


In [39]:
# Compute vector with partial derivatives of loss function with respect to b1 (Nx1).
# NOTE(review): this applies ReLU to the back-propagated signal W2.T @ (y_hat - y).
# The usual derivative of h = relu(z1) would instead mask that signal with (z1 > 0);
# confirm which of the two is intended before relying on these values.
grad_b1 = relu(np.dot(W2.T, y_hat - y))

# Print vector
print("grad_b1:", grad_b1)

grad_b1: [[0.        ]
 [0.        ]
 [0.17045858]]


In [40]:
# Compute matrix with partial derivatives of loss function with respect to W1 (NxV):
# an Nx1 column multiplied by the 1xV row x.T.
# NOTE(review): the Nx1 factor is relu(W2.T @ (y_hat - y)); the usual derivative of
# h = relu(z1) would mask W2.T @ (y_hat - y) with (z1 > 0) instead - confirm intent.
grad_W1 = np.dot(relu(np.dot(W2.T, y_hat - y)), x.T)

# Print matrix
print("grad_W1:\n", grad_W1)

grad_W1:
 [[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.04261464 0.04261464 0.         0.08522929 0.        ]]


In [44]:
# Report V and N, then each gradient's actual shape next to the symbolic shape
# of the parameter it belongs to
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of grad_W1: {grad_W1.shape} (NxV)')
print(f'size of grad_b1: {grad_b1.shape} (Nx1)')
print(f'size of grad_W2: {grad_W2.shape} (VxN)')
print(f'size of grad_b2: {grad_b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of grad_W1: (3, 5) (NxV)
size of grad_b1: (3, 1) (Nx1)
size of grad_W2: (5, 3) (VxN)
size of grad_b2: (5, 1) (Vx1)


In [45]:
# Define alpha (learning rate).
# NOTE(review): the parameter-update cells that follow assign and use their own
# 'eta' value rather than 'alpha' - confirm which one is meant to drive the update.
alpha = 0.03

In [50]:
# Compute updated W1 with one gradient-descent step.
# The learning rate is the same one used for the W2 / b1 / b2 update, so the whole
# step is taken with a single rate; it is also the rate that reproduces the
# 'new value of W1' printed below.
eta = 0.01 # example
W1_new = W1 - eta * grad_W1

In [48]:
# Show W1 before and after the gradient step, separated by a blank line
print('old value of W1:')
print(W1)
print()
print('new value of W1:')
print(W1_new)

old value of W1:
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26637602 -0.23846886 -0.37770863 -0.11399446  0.34008124]]

new value of W1:
[[ 0.41687358  0.08854191 -0.23495225  0.28320538  0.41800106]
 [ 0.32735501  0.22795148 -0.23951958  0.4117634  -0.23924344]
 [ 0.26594987 -0.23889501 -0.37770863 -0.11484675  0.34008124]]


In [51]:
# Update W2, b1, and b2: each new value is the old value minus eta times its gradient.
# The results go into *_new names, so the original parameters are left untouched.
eta = 0.01 # example
W2_new = W2 - eta * grad_W2
b1_new = b1 - eta * grad_b1
b2_new = b2 - eta * grad_b2


# Print the three updated parameters, each under its own label
print('W2_new')
print(W2_new)
print()
print('b1_new')
print(b1_new)
print()
print('b2_new')
print(b2_new)

W2_new
[[-0.22249629 -0.43126617  0.13310965]
 [ 0.08406387  0.08000579  0.1772054 ]
 [ 0.18996044 -0.05617378 -0.1790735 ]
 [ 0.06988689 -0.02131322  0.36107434]
 [ 0.33404254 -0.39556489 -0.43959196]]

b1_new
[[ 0.09688219]
 [ 0.29239497]
 [-0.27534885]]

b2_new
[[ 0.03334889]
 [-0.3658584 ]
 [-0.12006629]
 [-0.3498469 ]
 [-0.0722673 ]]


In [52]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [53]:
# 1. Implement Tokenization
text = "This is a simple example of text tokenization. Tokenization is important for NLP."
# Build the word -> integer index vocabulary from this single sentence
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index
# Encode the sentence as a list of word indices
sequences = tokenizer.texts_to_sequences([text])
# Force the sequence to length 10. The sentence has 13 tokens, and the printed
# result below shows that the first three are the ones dropped.
padded_sequences = pad_sequences(sequences, maxlen=10)

In [54]:
# Show the vocabulary, the encoded sentence, and its fixed-length version
print("Word Index:", word_index)
print("Sequences:", sequences)
print("Padded Sequences:", padded_sequences)

Word Index: {'is': 1, 'tokenization': 2, 'this': 3, 'a': 4, 'simple': 5, 'example': 6, 'of': 7, 'text': 8, 'important': 9, 'for': 10, 'nlp': 11}
Sequences: [[3, 1, 4, 5, 6, 7, 8, 2, 2, 1, 9, 10, 11]]
Padded Sequences: [[ 5  6  7  8  2  2  1  9 10 11]]


In [60]:
# 2. Define Word Embeddings
# The ids in 'word_index' start at 1 (see the printed word index), so one extra
# slot is added for id 0 - presumably the padding value; TODO confirm.
vocab_size = len(word_index) + 1

In [61]:
# Length of the vector that each word id is mapped to by the Embedding layer
embedding_dim = 16 #dimension

In [62]:
# Embedding lookup table: one trainable vector of size 'embedding_dim' per vocabulary id.
# 'input_length' is intentionally not passed: Keras 3 deprecates that argument and the
# layer takes the sequence length from whatever input it is called on.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [63]:
# Look up the embedding of every id in the padded sequence.
# The printed shape below is (1, 10, 16): 1 sequence, 10 positions, 16 values per position.
embedded_sequences = embedding_layer(padded_sequences)

print("Embedded Sequences shape:", embedded_sequences.shape)
print("Embedded Sequences:", embedded_sequences)

Embedded Sequences shape: (1, 10, 16)
Embedded Sequences: tf.Tensor(
[[[-3.2426216e-02  3.4641016e-02  4.9418453e-02 -5.5081137e-03
   -3.7299536e-02  3.0639920e-02 -4.8702504e-02  2.5892463e-02
    3.4027744e-02 -4.1327596e-02 -2.0136142e-02  3.9389480e-02
    2.1365371e-02 -2.1932734e-02 -4.3475568e-02  4.9599554e-02]
  [ 1.5686762e-02  4.2545583e-02 -2.8761148e-02 -2.0409977e-02
    2.3830008e-02  4.9624894e-02 -3.1038094e-02 -1.9953383e-02
   -4.4345092e-02 -1.9153321e-02  4.2004954e-02  3.3015694e-02
   -1.5031528e-02  4.3350127e-02 -4.8912689e-04  3.0350748e-02]
  [ 3.5907414e-02  4.0293224e-03  2.0123351e-02  4.2438354e-02
   -1.9048179e-02  1.9976329e-02 -1.1644147e-02 -2.1293139e-02
    2.7555499e-02  3.2838691e-02  1.2189411e-02  9.6194819e-04
   -2.5600994e-02  2.9320780e-02  2.6475731e-02 -2.7577365e-02]
  [ 3.4840349e-02 -4.1834425e-02  3.9689969e-02 -7.9488046e-03
    3.3611406e-02 -2.6012436e-03 -3.0045286e-03  2.8687824e-02
    9.9061802e-04  4.4446792e-02  4.5256410e-0

In [64]:
#3. Build the CNN Model
# Embedding -> 1-D convolution -> global max pooling -> small dense head.
# 'input_length' is intentionally not passed to Embedding: Keras 3 deprecates that
# argument, and the model picks up the sequence length from the data it is given.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(128, 5, activation='relu'),  # 128 filters, each spanning 5 consecutive positions
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.5), # Add dropout for regularization
    Dense(1, activation='sigmoid') # Binary classification example
])

# Sigmoid output paired with binary cross-entropy; accuracy is reported as a metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [67]:
# Two-sentence toy training set with one label per sentence
train_texts = ["good movie", "bad movie"]
train_labels = [1, 0]

# NOTE(review): a second tokenizer is fitted here, so its ids are unrelated to
# 'word_index' above, while the model's Embedding was sized from the earlier
# 'vocab_size' - confirm that size covers every id this tokenizer can produce.
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(train_texts)
sequences_train = tokenizer_train.texts_to_sequences(train_texts)
padded_train = pad_sequences(sequences_train, maxlen=10)

# Convert train_labels to a NumPy array
train_labels = np.array(train_labels)

# Train for 5 epochs without progress output
model.fit(padded_train, train_labels, epochs=5, verbose=0)

# Evaluate on the same two sentences the model was trained on
loss, accuracy = model.evaluate(padded_train, train_labels, verbose=0)
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Score an unseen sentence using the training tokenizer.
# NOTE(review): 'very' does not occur in train_texts; presumably the tokenizer
# drops it because no oov_token was configured - confirm.
new_texts = ["very good movie"]
sequences_new = tokenizer_train.texts_to_sequences(new_texts)
padded_new = pad_sequences(sequences_new, maxlen=10)
predictions = model.predict(padded_new, verbose=0)
print("Predictions:", predictions)

Loss: 0.6849145889282227, Accuracy: 0.5
Predictions: [[0.48656967]]
