### Import the dataset

In [None]:
import numpy as np
import pandas as pd

# Fetch the zipped training CSV from archive.org and parse it into a DataFrame.
df = pd.read_csv(
    'https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip',
    compression='zip',   # the URL points at a zip archive wrapping the CSV
    low_memory=False,
)

# Display (rows, columns) as the cell output.
df.shape

### First few rows of the dataset

In [None]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

### Remove 'qid' from the dataset

In [None]:
# Drop the 'qid' column; only 'question_text' and 'target' are used below.
# errors='ignore' keeps this cell idempotent: because `df` is reassigned,
# re-running the cell would otherwise raise KeyError once 'qid' is gone.
df = df.drop(columns='qid', errors='ignore')

# Display the new (rows, columns) as the cell output.
df.shape

### Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Pull the text and label columns out of the frame as NumPy arrays.
question_texts = df['question_text'].to_numpy()
target_values = df['target'].to_numpy()

# Stage 1: keep 80% for training and hold out the remaining 20%.
(train_sentences, temp_sentences,
 train_labels, temp_labels) = train_test_split(
    question_texts, target_values, test_size=0.2, random_state=42)

# Stage 2: halve the hold-out, giving 10% validation and 10% test overall.
(val_sentences, test_sentences,
 val_labels, test_labels) = train_test_split(
    temp_sentences, temp_labels, test_size=0.5, random_state=42)

# Report how many examples ended up in each split.
print(f"Training set size: {len(train_sentences)}")
print(f"Validation set size: {len(val_sentences)}")
print(f"Test set size: {len(test_sentences)}")

### Tokenization + Padding

In [9]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer and padding settings (reused by later cells).
vocab_size = 10000        # passed to the Tokenizer as num_words
max_length = 32           # every padded sequence has exactly this many entries
oov_tok = "<OOV>"         # placeholder token for out-of-vocabulary words
padding_type = 'post'
trunc_type = 'post'

# Fit the tokenizer on the training text only, then keep its word -> index map.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Keyword arguments shared by every pad_sequences call in this cell.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Training split: text -> integer sequences -> fixed-length array.
training_sequences = tokenizer.texts_to_sequences(train_sentences)
training_padded = pad_sequences(training_sequences, **pad_options)

# Validation split, encoded with the same fitted tokenizer.
val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, **pad_options)

# Label arrays handed to model.fit (np.array makes a copy of each).
training_labels = np.array(train_labels)
val_labels = np.array(val_labels)

### Example of tokenization

In [None]:
# Encode one raw question with the fitted tokenizer. texts_to_sequences takes
# a list of texts, so the single question is wrapped in a list.
sample_question = "Do you have an adopted dog, how would you encourage people to adopt and not shop?"
example_sequence = tokenizer.texts_to_sequences([sample_question])
example_sequence

### Example of padding

In [None]:
# Pad/truncate the example sequence with the same settings used for the
# training and validation data.
example_padded = pad_sequences(
    example_sequence,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
example_padded

### Create the model

In [None]:
import tensorflow as tf

# Output dimensions of the Embedding layer
embedding_dim = 16

# These repeat the values used in the tokenization cell; keep them in sync,
# since the Embedding needs one row per token index the tokenizer can emit.
vocab_size = 10000
max_length = 32

# Build the model. The input shape is declared with an explicit Input instead
# of the Embedding `input_length` argument (deprecated in current Keras), so
# the model is built on construction and needs no separate model.build call.
# Sequential.layers does not list the Input, so model.layers[0] is still the
# Embedding layer.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: one probability per input for the binary target
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

### Run the following cell to see how the GlobalAveragePooling1D() layer works

In [None]:
# Toy input of shape (1, 3, 2): one sample holding three 2-element rows.
sample_array = np.array([[[20,4],[3,9],[10,10]]])

# Stand-alone pooling layer used only for this demonstration.
gap1d_layer = tf.keras.layers.GlobalAveragePooling1D()

# Show what goes in ...
print(f'shape of sample_array = {sample_array.shape}')
print(f'sample array: {sample_array}')

# ... feed the sample array through the layer ...
output = gap1d_layer(sample_array)

# ... and what comes out.
print(f'output shape of gap1d_layer: {output.shape}')
print(f'output array of gap1d_layer: {output.numpy()}')

### Compile and train the model

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 30 epochs, evaluating on the validation split after each epoch.
# The returned History object is kept for the plotting cell.
history = model.fit(
    training_padded,
    training_labels,
    epochs=30,
    validation_data=(val_padded, val_labels),
    verbose=2,
)

### Plot the model's accuracy and loss for each epoch

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value sequences (e.g. the value returned by ``model.fit``).
    string: metric name; both ``string`` and ``'val_' + string`` must be
      keys of ``history.history``.
  """
  val_key = 'val_'+string
  curves = history.history

  # Training curve first, validation curve second, to match the legend order.
  plt.plot(curves[string])
  plt.plot(curves[val_key])

  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

# Draw one figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

### Get the index-word dictionary

In [None]:
# The embedding is the first entry of model.layers; its first weight array is
# the embedding matrix.
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]

# Index -> word mapping (the inverse of tokenizer.word_index).
reverse_word_index = tokenizer.index_word

# Expected shape: (vocab_size, embedding_dim)
print(embedding_weights.shape)

In [20]:
import io

# Export the learned embeddings as two line-aligned TSV files:
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the word that the vector on the same line belongs to

# Stop at whichever runs out first: the configured vocabulary size, the rows of
# the embedding matrix, or the words the tokenizer actually saw (its indices
# run from 1 to len(reverse_word_index)). Without this bound, a corpus with
# fewer than vocab_size distinct words raises KeyError part-way through.
last_index = min(vocab_size, len(embedding_weights), len(reverse_word_index) + 1)

# The context manager closes both files even if the loop raises.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:

  # Start counting at `1` because `0` is just for the padding
  for word_num in range(1, last_index):

    # Word and embedding row associated with the current index
    word_name = reverse_word_index[word_num]
    word_embedding = embedding_weights[word_num]

    # Write the word name
    out_m.write(word_name + "\n")

    # Write the word embedding
    out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

### Download these files to your local machine

In [None]:
# Offer the exported TSV files as downloads when running in Colab. Outside
# Colab the import fails and the cell does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

### Get the token for any word

In [None]:
# Input a word
word = input("Enter a word: ")

# Normalise the input the way the tokenizer normalised the training text, so
# that e.g. "Dog" or " dog " can match an entry stored as "dog".
lookup_key = word.strip()
if tokenizer.lower:
  lookup_key = lookup_key.lower()

# Get the index of the word
word_idx = tokenizer.word_index.get(lookup_key)

# word_index lists every word seen while fitting, but texts_to_sequences only
# emits indices below num_words; anything at or above that limit is treated
# as out-of-vocabulary, so it is reported as such here too.
num_words = tokenizer.num_words
in_vocabulary = word_idx is not None and (num_words is None or word_idx < num_words)

if in_vocabulary:
  print(f"Token for '{word}': {word_idx}")
else:
  print(f"The word '{word}' is not in the vocabulary.")


### Get the word embedding for any word

In [None]:
# Get the embedding layer from the model
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Get the word index
word_index = tokenizer.word_index

# Input a word
word = input("Enter a word: ")

# Normalise the input the way the tokenizer normalised the training text, so
# that e.g. "Dog" or " dog " can match an entry stored as "dog".
lookup_key = word.strip()
if tokenizer.lower:
  lookup_key = lookup_key.lower()

# Get the index of the word
word_idx = word_index.get(lookup_key)

# word_index lists every word seen while fitting, which can be more than the
# number of rows in the embedding matrix. Indexing the matrix with such an
# index raises IndexError, so those words are reported as out-of-vocabulary.
if word_idx is not None and word_idx < len(embedding_weights):
  # Get the embedding vector for the word
  embedding_vector = embedding_weights[word_idx]
  print(f"Embedding vector for '{word}': {embedding_vector}")
else:
  print(f"The word '{word}' is not in the vocabulary.")
