<a href="https://colab.research.google.com/github/RAHEYO/SQuAD-MRC/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation


In [1]:
# prompt: install tensorflow, pennylane, and all the related libraries that may be needed to use those two libraries.

!pip install autograd tensorflow jax jaxlib
!pip install pennylane
!pip install pennylane-lightning pennylane-lightning[gpu]
!pip install numpy


# prompt: import tensorflow and pennylane

import tensorflow as tf
import pennylane as qml
import numpy as np

Collecting pennylane
  Downloading PennyLane-0.34.0-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting rustworkx (from pennylane)
  Downloading rustworkx-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version>=2.7 (from pennylane)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting autoray>=0.6.1 (from pennylane)
  Downloading autoray-0.6.7-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting pennylane-lightning>=0.34 (from pennylane)
  Downloading PennyLane_Lightning-0.34.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Data Prep

In [None]:
SEED = 10

## Using the Dataset

In [2]:
import tensorflow_datasets as tfds

In [3]:
train = tfds.load('squad/v2.0', split='train', shuffle_files=True)
dev = tfds.load('squad/v2.0', split='validation', shuffle_files=True)

Downloading and preparing dataset 44.34 MiB (download: 44.34 MiB, generated: 148.54 MiB, total: 192.88 MiB) to /root/tensorflow_datasets/squad/v2.0/3.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/130319 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v2.0/3.0.0.incomplete7ZG3A0/squad-train.tfrecord*...:   0%|         …

Generating validation examples...:   0%|          | 0/11873 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/squad/v2.0/3.0.0.incomplete7ZG3A0/squad-validation.tfrecord*...:   0%|    …

Dataset squad downloaded and prepared to /root/tensorflow_datasets/squad/v2.0/3.0.0. Subsequent calls will reuse this data.


In [4]:
def getContext(ds):
  return ds.map(lambda x: x['context'])

def getQuest(ds):
  return ds.map(lambda x: x['question'])

def getY(ds):
  ds = ds.filter(lambda x: x['is_impossible'] == False)
  return ds.map(lambda x: [x['answers']['answer_start'], x['answers']['answer_start']+len(x['answers']['text'])])

trainTopics = ['Computer', 'IPod', 'Germans', 'Qing_dynasty', 'YouTube', 'Airport', 'United_States_Air_Force', 'Russian_language', 'Armenians', 'Matter']
devTopics = ['Computational_complexity_theory', 'Steam_engine', 'Normans']

context = train.apply(getContext)
quest = train.apply(getQuest)
trainY = train.apply(getY)

In [5]:
for x in list(quest.enumerate(start=0).as_numpy_iterator())[:10]:
  print(x)

(0, b'Who proved that the the star Galileo observed was fixed?')
(1, b'Where are the Maritime Forces Atlantic located?')
(2, b'Which note is occasionally called a largestick?')
(3, b'When did Valencian regain its offical status?')
(4, b'Does the Liberal Party of Australia consider itself socialist or anti-socialist?')
(5, b"In what year did Miami's government declare bankruptcy?")
(6, b'What does RItch SAvin-Williams propose in this paper?')
(7, b"If a translator didn't understand something in a text or thought it too boring to present to readers, what did they do with it?")
(8, b'What console, released in 2005, did Sony create?')
(9, b'When was the Balkan air force formed?')


## Vectorizing Datasets

### Counting Vocabs

In [6]:
contextIter = context.enumerate(start=0)
questIter = quest.enumerate(start=0)

def countSeq(data):
  allWords = []
  maxSeq = 0

  for entry in data:
    words = entry[1].decode().split(" ")

    if len(words) > maxSeq:
      maxSeq = len(words)

    allWords.extend(words)

  vocabs = len(set(allWords))

  return vocabs, maxSeq

contextVocabs, contextMaxSeq = countSeq(contextIter.as_numpy_iterator())
questVocabs, questMaxSeq = countSeq(questIter.as_numpy_iterator())

In [7]:
contextVocabs, contextMaxSeq, questVocabs, questMaxSeq

(185899, 653, 76475, 25601)

In [8]:
# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
context_vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=contextVocabs,
    output_mode='int',
    output_sequence_length=contextMaxSeq)

quest_vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=questVocabs,
    output_mode='int',
    output_sequence_length=questMaxSeq)

In [9]:
context_vectorize_layer.adapt(context)
quest_vectorize_layer.adapt(quest)

In [10]:
context_vectorize_layer.get_vocabulary()[:10]


['', '[UNK]', 'the', 'of', 'and', 'in', 'to', 'a', 'as', 'is']

In [11]:
# Vectorize the data in the dataset. Now all data are numerical vectors to be paired up!
contextVect = context.prefetch(tf.data.AUTOTUNE).map(context_vectorize_layer).unbatch()
questVect = quest.prefetch(tf.data.AUTOTUNE).map(quest_vectorize_layer).unbatch()

In [None]:
contextSeq = list(contextVect.as_numpy_iterator())
contextSeq

### Function to utilize Skip-gram Seq2Vect grouping for Training data formatting

In [None]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def skipGram(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
skipGram(contextSeq, window_size=2, num_ns=4, vocab_size=contextVocabs, seed=SEED)

# Model

## Classical

In [None]:

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev)
def circuit(params):
    # ...
