<a href="https://colab.research.google.com/github/Twinkle-gawri/Word2Vec/blob/main/Extrinsic_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EXTRINSIC EVALUATION
Evaluating by using it inside a real-world task and checking performance.

Real tasks could be:
* Sentiment analysis
* Text classification
* Machine translation
* Question answering
* Information retrieval

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential,layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import imdb
import numpy as np

In [2]:
!pip install gensim



In [3]:
from gensim import downloader

In [9]:
wv=downloader.load('word2vec-google-news-300')

In [8]:
(x_train,y_train),(x_test,y_test)=imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [11]:
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [4]:
word2index=imdb.get_word_index() # converts words -> indices

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:
index2word={index:word for word,index in word2index.items()} # converts indices -> words

In [6]:
def decode_review(encoded_review):
  return ' '.join([index2word.get(i,"?") for i in encoded_review])   # Uses index2word.get(i, "?") to convert each index back to its corresponding word.
                                # If the index i isn’t found in the dictionary (e.g., it's an unknown word), it returns "?".

In [9]:
x_train_text = [decode_review(encoded_review) for encoded_review in x_train] # converting the lists present into x_train into human readable form
x_test_text = [decode_review(encoded_review) for encoded_review in x_test]

In [16]:
def sentence_embeddings(sentence):
  words=sentence.split()
  vectors=[wv[word] for word in words if word in wv.key_to_index]  # wv.key_to_index is a dictionary of all the words in the embedding model.
  return np.mean(vectors, axis=0) if vectors else np.zeros(300) # computes the mean of all word vectors in the sentence - averaging across each dimension of the word vectors

In [17]:
train_embeddings=np.array([sentence_embeddings(sentence) for sentence in x_train_text])

test_embeddings=np.array([sentence_embeddings(sentence) for sentence in x_test_text])

In [18]:
"""
Takes in pairs of data and labels.
Creates a dataset where each item looks like (embedding, label).
"""
train_dataset=tf.data.Dataset.from_tensor_slices((train_embeddings,y_train)).batch(32)
test_dataset=tf.data.Dataset.from_tensor_slices((test_embeddings,y_test)).batch(32)

In [19]:
model=Sequential(
    [
        layers.Input(shape=(300,)),
        Dense(256,activation='relu'),
        Dense(128,activation='relu'),
        Dense(1,activation='sigmoid')
    ]
)

In [16]:
model.compile(loss=keras.losses.BinaryCrossentropy(3e-4),
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [17]:
model.fit(train_dataset,validation_data=test_dataset,epochs=20,verbose=1)

Epoch 1/20


  output, from_logits = _get_logits(


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5539 - loss: 0.6757 - val_accuracy: 0.6807 - val_loss: 0.5992
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6788 - loss: 0.5980 - val_accuracy: 0.7017 - val_loss: 0.5769
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7002 - loss: 0.5781 - val_accuracy: 0.7052 - val_loss: 0.5687
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7035 - loss: 0.5709 - val_accuracy: 0.7102 - val_loss: 0.5639
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7053 - loss: 0.5686 - val_accuracy: 0.7152 - val_loss: 0.5609
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7046 - loss: 0.5662 - val_accuracy: 0.7143 - val_loss: 0.5604
Epoch 7/20
[1m782/782[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7e09e48fd990>

# Using TensorFlow Hub Universal Sentence Encoder
It converts entire sentences (or paragraphs) into embeddings that capture the meaning of the sentence.

In [1]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [2]:
import numpy as np

In [21]:
import tensorflow_hub as hub
import numpy as np

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Batch size for processing the data
batch_size = 32

def embed_in_batches(texts, batch_size):
  """converts a big list of sentences into sentence embeddings, but processes them in smaller batches (instead of all at once)train_embeddings_1.shape."""
  all_embeddings = []
  for i in range(0, len(texts), batch_size):
    batch_embeddings = embed(texts[i : i + batch_size])
    all_embeddings.extend(batch_embeddings)
  return np.array(all_embeddings)

# Embed the training and test data in batches
train_embeddings_1 = embed_in_batches(x_train_text, batch_size)
test_embeddings_1 = embed_in_batches(x_test_text, batch_size)

In [22]:
train_embeddings_1.shape

(25000, 512)

In [23]:
model1=Sequential(
    [
        layers.Input(shape=(512,)),
        Dense(256,activation='relu'),
        Dense(128,activation='relu'),
        Dense(1,activation='sigmoid')
    ]
)

In [24]:
model1.compile(loss=keras.losses.BinaryCrossentropy(3e-4),
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [25]:
model1.fit(train_embeddings_1,y_train,validation_data=(test_embeddings_1,y_test),batch_size=32,epochs=20,verbose=1)

Epoch 1/20


  output, from_logits = _get_logits(


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.6058 - loss: 0.6519 - val_accuracy: 0.6582 - val_loss: 0.6145
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.6599 - loss: 0.6106 - val_accuracy: 0.6627 - val_loss: 0.6115
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.6718 - loss: 0.5993 - val_accuracy: 0.6540 - val_loss: 0.6242
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.6898 - loss: 0.5821 - val_accuracy: 0.6674 - val_loss: 0.6111
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.7050 - loss: 0.5611 - val_accuracy: 0.6557 - val_loss: 0.6243
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7347 - loss: 0.5270 - val_accuracy: 0.6592 - val_loss: 0.6431
Epoch 7/20
[1m782/782[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x79cc35aef290>

# Using TensorFlow Hub Embeddings

In [11]:
import tensorflow_hub as hub
embedding = hub.load("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1")

In [12]:
"""
This function is trying to create a sentence embedding by:
  Splitting the sentence into words,
  Getting an embedding for each word individually (using a model called embedding),
  Then averaging all those word vectors together to form one vector for the sentence.

When you call embedding([word]), you get a 2D vector (shape (1, 300) or (1, 512))
When you do [0], you convert it into a 1D vector (shape (300,) or (512,)).
Model training expects 1D embeddings (not 2D batches) when building a dataset of sentence/word embeddings.
"""

def sentence_embeddings1(sentence):
  words=sentence.split()
  vectors=[embedding([word])[0] for word in words if embedding([word])[0] is not None]
  return np.mean(vectors, axis=0) if vectors else np.zeros(300)

In [None]:
train_embeddings_2 = np.array([sentence_embeddings1(sentence) for sentence in x_train_text])
test_embeddings_2 = np.array([sentence_embeddings1(sentence) for sentence in x_test_text])

In [None]:
model2=Sequential(
    [
        layers.Input(shape=(20,)),
        Dense(256,activation='relu'),
        Dense(128,activation='relu'),
        Dense(1,activation='sigmoid')
    ]
)

In [None]:
model2.compile(loss=keras.losses.BinaryCrossentropy(3e-4),
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [None]:
model2.fit(train_embeddings_2,y_train,validation_data=(test_embeddings_2,y_test),batch_size=32,epochs=20,verbose=1)