In [1]:
!pip install tensorflow
import os
import numpy as np
import tensorflow as tf
import kagglehub

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Dense, LayerNormalization, Dropout,
    MultiHeadAttention, GlobalAveragePooling1D,
    Concatenate, Embedding
)
from tensorflow.keras.models import Model

# Fetch the bAbI QA dataset through kagglehub. `path` holds whatever
# dataset_download returns and is used further down as the root directory
# that gets searched for the individual task files.
path = kagglehub.dataset_download(
    "roblexnana/the-babi-tasks-for-nlp-qa-system"
)
print("Dataset downloaded at:", path)
def find_file(base_path, filename):
    """Return the path of the first file named *filename* under *base_path*.

    The directory tree is traversed with ``os.walk`` and the first directory
    whose file list contains *filename* wins.

    Raises:
        FileNotFoundError: if no directory under *base_path* contains the
            file; the exception argument is *filename*.
    """
    candidates = (
        os.path.join(directory, filename)
        for directory, _subdirs, names in os.walk(base_path)
        if filename in names
    )
    located = next(candidates, None)
    if located is None:
        raise FileNotFoundError(filename)
    return located

# Locate the train/test files for the "qa1 single supporting fact" task
# somewhere under the downloaded dataset directory. find_file raises
# FileNotFoundError if either file is absent.
train_file = find_file(path, "qa1_single-supporting-fact_train.txt")
test_file  = find_file(path, "qa1_single-supporting-fact_test.txt")

print("Train file:", train_file)
print("Test file :", test_file)

def parse_babi(file_path):
    """Parse a bAbI task file into parallel story/question/answer lists.

    Each non-blank line is ``"<id> <text>"``. A line whose id is ``"1"``
    starts a new story. A line whose text contains a tab is a question of the
    form ``"<question>\\t<answer>[\\t<supporting ids>]"``; any other line is a
    statement that is appended to the current story.

    For every question, the statements seen so far in the current story are
    joined with single spaces and recorded alongside the question and answer.

    Args:
        file_path: path of the UTF-8 encoded bAbI text file.

    Returns:
        A tuple ``(stories, questions, answers)`` of three equally long lists
        of strings.

    Raises:
        ValueError: if a non-blank line has no space separating the id from
            the text.
    """
    stories, questions, answers = [], [], []
    story = []

    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no information; previously they crashed
                # the id/text unpacking below.
                continue

            nid, sep, text = line.partition(" ")
            if not sep:
                raise ValueError(f"Malformed bAbI line (no text after id): {line!r}")

            if nid == "1":
                story = []

            if "\t" in text:
                # Only the first two fields are needed; the supporting-fact
                # ids (and any further fields) are optional and ignored.
                q, a, *_ = text.split("\t")
                stories.append(" ".join(story))
                questions.append(q)
                answers.append(a)
            else:
                story.append(text)

    return stories, questions, answers

# Parse both splits into parallel lists of story / question / answer strings.
train_stories, train_questions, train_answers = parse_babi(train_file)
test_stories, test_questions, test_answers = parse_babi(test_file)

print("Train samples:", len(train_stories))

# Build the vocabulary from every training text (stories, questions, answers).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*train_stories, *train_questions, *train_answers])

# One extra slot on top of the entries in word_index.
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size:", vocab_size)

# Sequence lengths are measured in whitespace-separated tokens of the
# training texts.
max_story_len = max(map(len, map(str.split, train_stories)))
max_question_len = max(map(len, map(str.split, train_questions)))

def vectorize(stories, questions, answers):
    """Convert raw text lists into padded id arrays plus answer ids.

    Uses the module-level ``tokenizer``, ``max_story_len`` and
    ``max_question_len``.

    Args:
        stories: list of story strings.
        questions: list of question strings.
        answers: list of answer strings; each must be a key of
            ``tokenizer.word_index`` (a missing key raises ``KeyError``).

    Returns:
        ``(story_ids, question_ids, answer_ids)`` where the first two are the
        results of ``pad_sequences`` with the respective ``maxlen`` and the
        last is a NumPy array of ``word_index`` values.
    """
    word_to_id = tokenizer.word_index
    answer_ids = np.array([word_to_id[word] for word in answers])

    story_ids = pad_sequences(
        tokenizer.texts_to_sequences(stories), maxlen=max_story_len
    )
    question_ids = pad_sequences(
        tokenizer.texts_to_sequences(questions), maxlen=max_question_len
    )

    return story_ids, question_ids, answer_ids

# Vectorize the training split: padded story ids, padded question ids and
# the answer word indices used as labels.
x_story, x_question, y = vectorize(
    train_stories, train_questions, train_answers
)

# Same transformation for the test split. vectorize looks answers up in the
# tokenizer fitted on the training texts, so a test answer missing from that
# vocabulary raises KeyError.
x_story_test, x_question_test, y_test = vectorize(
    test_stories, test_questions, test_answers
)



class PositionalEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        max_len: number of rows in the position embedding table; inputs must
            not be longer than this along axis 1.
        vocab_size: number of rows in the token embedding table.
        embed_dim: width of both embedding tables.
        **kwargs: standard ``Layer`` keyword arguments (``name``, ``dtype``,
            ``trainable``, ...), forwarded to the base class.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config can report the constructor arguments.
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = Embedding(vocab_size, embed_dim)
        self.pos_emb = Embedding(max_len, embed_dim)

    def call(self, x):
        """Embed token ids *x* and add the embedding of each position index.

        Axis 1 of *x* is used as the sequence length; positions run from 0 to
        that length minus one and their embeddings are added to every row of
        the batch by broadcasting.
        """
        positions = tf.range(start=0, limit=tf.shape(x)[1])
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        from a saved model config."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


def transformer_encoder(x, head_size, num_heads, ff_dim):
    """Apply one self-attention + feed-forward encoder block to *x*.

    The block is: multi-head self-attention, residual add, layer
    normalization, then a two-layer feed-forward network, residual add and a
    second layer normalization.

    Args:
        x: input tensor; its last dimension is preserved by the block.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: number of attention heads.
        ff_dim: width of the hidden (ReLU) feed-forward layer.

    Returns:
        The tensor produced by the second layer normalization.
    """
    self_attention = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=head_size
    )
    attended = self_attention(x, x)

    normed = LayerNormalization(epsilon=1e-6)(x + attended)

    hidden = Dense(ff_dim, activation="relu")(normed)
    # Project back to the input width so the residual add is shape-compatible.
    projected = Dense(normed.shape[-1])(hidden)

    return LayerNormalization(epsilon=1e-6)(normed + projected)



# Width shared by the token/position embeddings of both inputs.
embed_dim = 64

# Two integer-id inputs, sized to the padded story and question lengths.
story_input = Input(shape=(max_story_len,))
question_input = Input(shape=(max_question_len,))

# Each input gets its own PositionalEmbedding instance (separate weights).
story_embed = PositionalEmbedding(
    max_story_len, vocab_size, embed_dim
)(story_input)

question_embed = PositionalEmbedding(
    max_question_len, vocab_size, embed_dim
)(question_input)

# One encoder block per input; the two calls create independent layers.
story_encoded = transformer_encoder(
    story_embed, head_size=32, num_heads=2, ff_dim=64
)

question_encoded = transformer_encoder(
    question_embed, head_size=32, num_heads=2, ff_dim=64
)


# Cross-attention: queries come from the encoded question, keys and values
# from the encoded story, so the output has one vector per question position.
# NOTE(review): inputs are zero-padded by pad_sequences and no mask is passed
# to any attention or pooling layer here, so padded positions appear to take
# part in attention and averaging; confirm whether that is intended.
qa_attention = MultiHeadAttention(
    num_heads=2, key_dim=32
)(
    query=question_encoded,
    value=story_encoded,
    key=story_encoded
)

# Collapse the sequence axis of each branch by averaging.
story_vec = GlobalAveragePooling1D()(qa_attention)
question_vec = GlobalAveragePooling1D()(question_encoded)

merged = Concatenate()([story_vec, question_vec])

# Softmax over vocab_size classes; the labels are tokenizer word indices.
output = Dense(vocab_size, activation="softmax")(merged)

model = Model(
    inputs=[story_input, question_input],
    outputs=output
)
# Sparse categorical cross-entropy because the labels are integer class ids
# rather than one-hot vectors.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# Train on the vectorized training split; validation_split=0.1 reserves a
# tenth of it for validation metrics.
model.fit(
    [x_story, x_question],
    y,
    batch_size=32,
    epochs=30,
    validation_split=0.1
)

# Evaluate on the held-out test split; returns the loss followed by the
# compiled metric (accuracy).
loss, acc = model.evaluate(
    [x_story_test, x_question_test],
    y_test
)

print("Final Transformer QA Accuracy:", acc)

Using Colab cache for faster access to the 'the-babi-tasks-for-nlp-qa-system' dataset.
Dataset downloaded at: /kaggle/input/the-babi-tasks-for-nlp-qa-system
Train file: /kaggle/input/the-babi-tasks-for-nlp-qa-system/tasks_1-20_v1-2/shuffled-10k/qa1_single-supporting-fact_train.txt
Test file : /kaggle/input/the-babi-tasks-for-nlp-qa-system/tasks_1-20_v1-2/shuffled-10k/qa1_single-supporting-fact_test.txt
Train samples: 10000
Vocabulary size: 20


Epoch 1/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 46ms/step - accuracy: 0.3440 - loss: 1.6822 - val_accuracy: 0.5080 - val_loss: 1.1576
Epoch 2/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.5083 - loss: 1.1141 - val_accuracy: 0.5060 - val_loss: 1.1035
Epoch 3/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 40ms/step - accuracy: 0.5100 - loss: 1.0887 - val_accuracy: 0.5360 - val_loss: 1.0605
Epoch 4/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.5295 - loss: 1.0435 - val_accuracy: 0.5090 - val_loss: 1.0743
Epoch 5/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 45ms/step - accuracy: 0.5315 - loss: 1.0458 - val_accuracy: 0.4950 - val_loss: 1.0657
Epoch 6/30
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 40ms/step - accuracy: 0.5223 - loss: 1.0435 - val_accuracy: 0.4950 - val_loss: 1.0952
Epoch 7/30
[1m2