### Importing labreries

In [59]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras import layers

### Data Preprocessing:

Read the TSV file.

In [60]:
file_path = "./data/newData.tsv"

df = pd.read_csv(file_path, sep='\t',usecols=["question_body", "accepted_answer"], header=0)

df.head()

Unnamed: 0,question_body,accepted_answer
0,"""<p>I have this Script that works and does wha...","""<p>Only rows that are visible in the datatabl..."
1,<p>I have below code:</p> <pre><code>*** Setti...,"""<p>If you extract the for loop from the test ..."
2,"""<p>Anyone know how I can add a smooth transit...",<p>You can add a smooth scroll effect with css...
3,<p>I want to do something similar to what I ca...,"""<p>Since the initial request to a Blazor Serv..."
4,<p>I want to set <code>Build Action</code> of ...,<pre><code>&lt;ItemGroup&gt; &lt;!-- This...


Preprocess the text data by cleaning, tokenizing, and padding sequences as necessary.

In [61]:
# Function to clean the text
def clean_text(text):
    # Remove HTML tags
    clean_text = re.sub(r'<.*?>', '', text)
    # Remove special characters
    clean_text = re.sub(r'[^a-zA-Z\s]', '', clean_text)
    # Convert to lowercase
    clean_text = clean_text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(clean_text)
    clean_words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    clean_text = ' '.join(clean_words)
    return clean_text


# Clean the question_body and accepted_answer columns
df['question_body'] = df['question_body'].apply(clean_text)
df['accepted_answer'] = df['accepted_answer'].apply(clean_text)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['question_body'] + df['accepted_answer'])
word_index = tokenizer.word_index

# Convert text to sequences and pad sequences
max_length = 100  # Set the maximum sequence length
question_sequences = tokenizer.texts_to_sequences(df['question_body'])
accepted_answer_sequences = tokenizer.texts_to_sequences(df['accepted_answer'])

# Pad sequences to ensure uniform length
question_sequences_padded = pad_sequences(question_sequences, maxlen=max_length, padding='post', truncating='post')
accepted_answer_sequences_padded = pad_sequences(accepted_answer_sequences, maxlen=max_length, padding='post', truncating='post')

In [62]:
df.head()

Unnamed: 0,question_body,accepted_answer
0,script works need apply first rows next page t...,rows visible datatable actually dom therefore ...
1,code settings library operatingsystem library ...,extract loop test template test case use templ...
2,anyone know add smooth transition effect tried,add smooth scroll effect css html scrollbehavi...
3,want something similar react react app renders...,since initial request blazor server app perfor...
4,want set build action two files embedded resou...,ltitemgroupgt lt line includes json files fold...


In [63]:
question_sequences_padded

array([[ 121,   71,   10, ...,    0,    0,    0],
       [   2,  652,  233, ...,    0,    0,    0],
       [ 499,   65,   33, ...,    0,    0,    0],
       ...,
       [ 130,   36,  116, ...,    0,    0,    0],
       [1988,  172,   27, ...,    0,    0,    0],
       [1926,  236,  813, ...,    0,    0,    0]])

In [64]:
accepted_answer_sequences_padded

array([[ 169, 1149, 1143, ...,    0,    0,    0],
       [ 470,  104,  108, ...,    0,    0,    0],
       [  33, 3525,  806, ...,    0,    0,    0],
       ...,
       [1216, 2801,  975, ...,    0,    0,    0],
       [  45, 1854,   20, ...,    0,    0,    0],
       [ 423,  774,  374, ...,    0,    0,    0]])

Split the data into training and validation sets.

In [65]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(question_sequences_padded, accepted_answer_sequences_padded, test_size=0.2, random_state=42)

print("Training shape:", X_train.shape)
print("Validation shape:", X_val.shape)


Training shape: (31529, 100)
Validation shape: (7883, 100)
Training set shape: (3152900, 1)
Validation set shape: (788300, 1)


In [66]:
X_train

array([[   49,  6656,  5643, ...,     0,     0,     0],
       [  177,   827,  1297, ...,     0,     0,     0],
       [  275,   228,    40, ...,     0,     0,     0],
       ...,
       [    6,     8,    22, ...,     0,     0,     0],
       [  133,   606,   950, ...,     0,     0,     0],
       [30026,   745,    51, ...,     0,     0,     0]])

### Model Architecture

Define the Transformer model architecture with encoder and decoder components.

In [68]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the scaled dot product attention.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable 
            to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [69]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        
        print("q shape before projection:", q.shape)
        print("k shape before projection:", k.shape)
        print("v shape before projection:", v.shape)
        print("mask shape before projection:", mask.shape)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        print("q shape after projection:", q.shape)
        print("k shape after projection:", k.shape)
        print("v shape after projection:", v.shape)
        print("mask shape after projection:", mask.shape)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [70]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

In [71]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates


In [72]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [73]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2

In [74]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x

In [75]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.dropout3 = layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

In [76]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [77]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights

### Training

Define custom loss function and learning rate scheduler.

In [78]:
# Define custom loss function
def custom_loss_function(y_true, y_pred):
    # Define your custom loss calculation here
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [79]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

Compile the model with appropriate optimizer and metrics.

In [80]:
d_model = 512
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [81]:
# Compile the model
# model.compile(optimizer=optimizer, loss=custom_loss_function, metrics=['accuracy'])

Train the model on the training data.

In [82]:
import time

In [92]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # Expand dimensions to add padding to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)  # Shape: (batch_size, 1, 1, seq_len_enc)

    # Decoder padding mask
    dec_padding_mask = create_padding_mask(inp)  # Shape: (batch_size, 1, 1, seq_len_enc)

    # Look-ahead mask for decoder self-attention
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])  # Shape: (1, seq_len_tar, seq_len_tar)
    dec_target_padding_mask = create_padding_mask(tar)  # Shape: (batch_size, 1, 1, seq_len_tar)

    # Combine both masks for decoder self-attention
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)  # Shape: (batch_size, 1, seq_len_tar, seq_len_tar)

    # Ensure the correct shape of masks
    enc_padding_mask = tf.expand_dims(enc_padding_mask[:, 0, :], axis=1)  # Shape: (batch_size, 1, 1, seq_len_enc)
    dec_padding_mask = tf.expand_dims(dec_padding_mask[:, 0, :], axis=1)  # Shape: (batch_size, 1, 1, seq_len_enc)
    combined_mask = tf.expand_dims(tf.expand_dims(combined_mask[:, 0, :, :], axis=1), axis=-1)  # Shape: (batch_size, 1, seq_len_tar, seq_len_tar, 1)

    return enc_padding_mask, combined_mask, dec_padding_mask





def train_transformer_model(model, train_dataset, val_dataset, num_epochs, optimizer, loss_function):
    for epoch in range(num_epochs):
        start = time.time()
        train_loss = 0
        val_loss = 0
        for (batch, (inp, tar)) in enumerate(train_dataset):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar)
            with tf.GradientTape() as tape:
                predictions, _ = model(inp, tar[:, :-1], 
                                        True, 
                                        enc_padding_mask, 
                                        combined_mask, 
                                        dec_padding_mask)
                loss = loss_function(tar[:, 1:], predictions)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            train_loss += loss.numpy()
        
        for (batch, (inp, tar)) in enumerate(val_dataset):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar)
            predictions, _ = model(inp, tar[:, :-1], 
                                    False, 
                                    enc_padding_mask, 
                                    combined_mask, 
                                    dec_padding_mask)
            loss = loss_function(tar[:, 1:], predictions)
            val_loss += loss.numpy()
        
        print(f'Epoch {epoch + 1}, Train Loss: {train_loss / len(train_dataset)}, Val Loss: {val_loss / len(val_dataset)}, Time: {time.time() - start} sec')

In [93]:
num_layers = 10
d_model = 128
dff = 512
num_heads = 8
input_vocab_size = target_vocab_size = len(word_index) + 1
dropout_rate = 0.1
batch_size = 64
num_epochs = 10

In [94]:
pe_input = pe_target = max_length
transformer_model = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

# Train the model
train_transformer_model(transformer_model, train_dataset, val_dataset, num_epochs, optimizer, loss_function)

q shape before projection: (64, 100, 128)
k shape before projection: (64, 100, 128)
v shape before projection: (64, 100, 128)
mask shape before projection: (64, 1, 1, 100)
q shape after projection: (64, 8, 100, 16)
k shape after projection: (64, 8, 100, 16)
v shape after projection: (64, 8, 100, 16)
mask shape after projection: (64, 1, 1, 100)
q shape before projection: (64, 100, 128)
k shape before projection: (64, 100, 128)
v shape before projection: (64, 100, 128)
mask shape before projection: (64, 1, 1, 100)
q shape after projection: (64, 8, 100, 16)
k shape after projection: (64, 8, 100, 16)
v shape after projection: (64, 8, 100, 16)
mask shape after projection: (64, 1, 1, 100)
q shape before projection: (64, 100, 128)
k shape before projection: (64, 100, 128)
v shape before projection: (64, 100, 128)
mask shape before projection: (64, 1, 1, 100)
q shape after projection: (64, 8, 100, 16)
k shape after projection: (64, 8, 100, 16)
v shape after projection: (64, 8, 100, 16)
mask sh

InvalidArgumentError: Exception encountered when calling layer 'multi_head_attention_250' (type MultiHeadAttention).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [64,8,99,99] vs. [64,1,100,100,1] [Op:AddV2] name: 

Call arguments received by layer 'multi_head_attention_250' (type MultiHeadAttention):
  • v=tf.Tensor(shape=(64, 99, 128), dtype=float32)
  • k=tf.Tensor(shape=(64, 99, 128), dtype=float32)
  • q=tf.Tensor(shape=(64, 99, 128), dtype=float32)
  • mask=tf.Tensor(shape=(64, 1, 100, 100, 1), dtype=float32)

### Evaluation and Prediction:

Implement functions for evaluating the model on the validation set.

Create a function for generating responses to input questions using the trained model.