In [None]:
cd /content/drive/MyDrive


/content/drive/MyDrive


SyntaxError: ignored

In [None]:
# Fetch the PolicyQA repository; later cells cd into it and read data/train.json and data/dev.json.
!git clone https://github.com/wasiahmad/PolicyQA.git


Cloning into 'PolicyQA'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 28 (delta 9), reused 9 (delta 1), pack-reused 0[K
Unpacking objects: 100% (28/28), done.


In [None]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.9 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [None]:
cd PolicyQA/


/content/drive/MyDrive/PolicyQA


In [None]:
import json
import os
import re
import string
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


# ============================================= PREPARING DATASET ======================================================
class Sample:
    """One question/context pair and the fixed-length model inputs derived from it.

    Attributes set by ``__init__``:
        question, context: text exactly as passed in.
        start_char_idx: character offset of the answer inside the context, or None.
        answer_text: the answer string, or None for inference-only samples.
        all_answers: list of all reference answer strings, or None.
        skip: becomes True when ``preprocess`` decides the sample is unusable.
        start_token_idx, end_token_idx: answer span as indices into the context
            tokens; both stay -1 when no answer is supplied.

    Attributes filled in by ``preprocess`` (None until then, and for skipped samples):
        input_word_ids, input_type_ids, input_mask: lists of length ``max_seq_length``.
        context_token_to_char: per-token ``(start, end)`` character offsets of the
            whitespace-normalised context.

    ``preprocess`` reads the module-level ``tokenizer`` and ``max_seq_length``.
    """

    def __init__(self, question, context, start_char_idx=None, answer_text=None, all_answers=None):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False
        self.start_token_idx = -1
        self.end_token_idx = -1
        # Declared up front so a skipped sample has the attributes (as None) instead of lacking them.
        self.input_word_ids = None
        self.input_type_ids = None
        self.input_mask = None
        self.context_token_to_char = None

    def preprocess(self):
        """Tokenize the sample and build padded inputs; sets ``skip`` when it cannot be used.

        A sample is skipped when its answer span does not fit inside the context,
        when no context token overlaps the answer, or when context + question
        exceed ``max_seq_length`` tokens.
        """
        # Collapse every run of whitespace to one space; all offsets below refer to this form.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(question)
        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            start_char_idx = self.start_char_idx
            # NOTE(review): start_char_idx is applied to the normalised context as-is; if the raw
            # context contains irregular whitespace before the answer the span may be shifted.
            if start_char_idx is None or start_char_idx < 0:
                self.skip = True
                return
            end_char_idx = start_char_idx + len(answer)
            # end_char_idx is exclusive, so an answer that finishes on the last character
            # of the context (end_char_idx == len(context)) is still valid.
            if end_char_idx > len(context):
                self.skip = True
                return
            # A token belongs to the answer when its [start, end) character range overlaps
            # the answer's [start_char_idx, end_char_idx) range; zero-width tokens never do.
            ans_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context.offsets)
                if max(start, start_char_idx) < min(end, end_char_idx)
            ]
            if len(ans_token_idx) == 0:
                self.skip = True
                return
            self.start_token_idx = ans_token_idx[0]
            self.end_token_idx = ans_token_idx[-1]
        # Context tokens come first, then the question tokens minus their first id
        # (presumably the leading special token added by the tokenizer).
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        if padding_length > 0:
            # Right-pad all three sequences with zeros up to max_seq_length.
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            # Too long for the model input; no truncation is attempted.
            self.skip = True
            return
        self.input_word_ids = input_ids
        self.input_type_ids = token_type_ids
        self.input_mask = attention_mask
        self.context_token_to_char = tokenized_context.offsets


def create_squad_examples(raw_data):
    """Turn a SQuAD-layout dict into a list of preprocessed ``Sample`` objects.

    Walks ``raw_data["data"][*]["paragraphs"][*]["qas"][*]``. A QA entry that has an
    ``"answers"`` key yields a sample carrying the first answer's text and start
    offset plus the texts of all answers; an entry without that key yields a
    question/context-only sample. ``preprocess()`` is called on every sample and
    every sample is returned, including those that end up flagged ``skip``.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa_entry in paragraph["qas"]:
                query = qa_entry["question"]
                if "answers" not in qa_entry:
                    example = Sample(query, passage)
                else:
                    answers = qa_entry["answers"]
                    first_text = answers[0]["text"]
                    every_text = [answer["text"] for answer in answers]
                    first_start = answers[0]["answer_start"]
                    example = Sample(query, passage, first_start, first_text, every_text)
                example.preprocess()
                examples.append(example)
    return examples


def create_inputs_targets(squad_examples):
    """Stack the non-skipped samples into model inputs ``x`` and span targets ``y``.

    Returns:
        x: ``[input_word_ids, input_mask, input_type_ids]`` as numpy arrays.
        y: ``[start_token_idx, end_token_idx]`` as numpy arrays.
    Samples whose ``skip`` flag is set are left out of every array.
    """
    field_names = (
        "input_word_ids",
        "input_type_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [example for example in squad_examples if example.skip == False]
    columns = {
        name: np.array([getattr(example, name) for example in usable])
        for name in field_names
    }
    features = [columns["input_word_ids"], columns["input_mask"], columns["input_type_ids"]]
    targets = [columns["start_token_idx"], columns["end_token_idx"]]
    return features, targets



class ValidationCallback(keras.callbacks.Callback):
    """Prints exact-match and token-level F1 on the evaluation set after every epoch.

    Args:
        x_eval: model inputs for the evaluation samples (as built by ``create_inputs_targets``).
        y_eval: ``[start_token_idx, end_token_idx]`` targets; only its length is used.
        eval_examples: the ``Sample`` objects ``x_eval`` was built from (skipped ones
            included). When omitted, the module-level ``eval_squad_examples`` is used,
            which is what the original implementation always did.
    """

    # Built once instead of on every call / every character.
    _PUNCTUATION = frozenset(string.punctuation)
    _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    def normalize_text(self, text):
        """Lower-case, strip ASCII punctuation, drop the articles a/an/the, collapse whitespace."""
        text = text.lower()
        text = "".join(ch for ch in text if ch not in self._PUNCTUATION)
        text = self._ARTICLES.sub(" ", text)
        text = " ".join(text.split())
        return text

    def __init__(self, x_eval, y_eval, eval_examples=None):
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.eval_examples = eval_examples

    def compute_f1(self, pred_tokens, truth_tokens):
        """Token-level F1 between two token lists.

        Overlap is counted as a multiset (each token as often as it occurs in both
        lists), so a prediction identical to the truth scores 1 even when it
        contains repeated tokens.
        """
        # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
        if len(pred_tokens) == 0 or len(truth_tokens) == 0:
            return int(pred_tokens == truth_tokens)

        overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

        # if there are no common tokens then f1 = 0
        if overlap == 0:
            return 0

        prec = overlap / len(pred_tokens)
        rec = overlap / len(truth_tokens)

        return 2 * (prec * rec) / (prec + rec)

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation inputs and print exact-match and mean F1."""
        examples = self.eval_examples if self.eval_examples is not None else eval_squad_examples
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        f1_scores = []
        # x_eval only contains non-skipped samples, so predictions align with this filtered list.
        eval_examples_no_skip = [eg for eg in examples if not eg.skip]
        for squad_eg, start, end in zip(eval_examples_no_skip, pred_start, pred_end):
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                # Predicted start lies beyond the context tokens: no answer can be extracted.
                # Score it 0 so the F1 mean and the exact-match rate cover the same samples.
                f1_scores.append(0)
                continue
            # The offsets index the whitespace-normalised context (see Sample.preprocess),
            # so slice that form rather than the raw text.
            context = " ".join(str(squad_eg.context).split())
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = context[pred_char_start:pred_char_end]
            else:
                pred_ans = context[pred_char_start:]
            normalized_pred_ans = self.normalize_text(pred_ans)
            normalized_true_ans = [self.normalize_text(ans) for ans in (squad_eg.all_answers or [])]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
            pred_tokens = normalized_pred_ans.split()
            # Best F1 over all reference answers; 0 when the sample has no references.
            f1_scores.append(
                max((self.compute_f1(pred_tokens, ans.split()) for ans in normalized_true_ans), default=0)
            )
        total = len(self.y_eval[0])
        acc = count / total if total else 0.0
        mean_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}")
        print(f'\nepoch={epoch +1}, f1 score = {mean_f1}')

In [None]:
# Paths are relative to the PolicyQA checkout the notebook cd'ed into earlier.
train_path = 'data/train.json'
eval_path = 'data/dev.json'
# Decode explicitly as UTF-8 so loading does not depend on the platform's default encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)
with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)
# Fixed model input length; samples whose context + question exceed it are skipped in Sample.preprocess.
max_seq_length = 512

In [None]:
# Three fixed-length int32 inputs, passed to the hub layer in the order [word ids, mask, type ids].
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
# trainable=True so the BERT weights are fine-tuned together with the QA head.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

# Build the tokenizer from the vocabulary and casing flag shipped with the hub module, so the
# tokenizer always matches the checkpoint (previously the flag was read but `lowercase=True`
# was hard-coded).
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
do_lower_case = bool(bert_layer.resolved_object.do_lower_case.numpy())
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=do_lower_case)
train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
# This count includes samples flagged `skip`, which are not part of x_train / y_train.
print(f"{len(train_squad_examples)} training points created.")

17056 training points created.


In [None]:
# Sanity check of the two BERT outputs: pooled_output has one vector per sequence,
# sequence_output one vector per token position.
print(pooled_output.get_shape())
print(sequence_output.get_shape())

(None, 768)
(None, 512, 768)


In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive paths used in this notebook resolve.
# NOTE(review): on a fresh runtime this has to run before the first cell, which already cd's into the Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Preprocess the dev split with the same pipeline as the training split.
# eval_squad_examples keeps every sample (skipped ones included); x_eval / y_eval hold only the
# non-skipped ones, so the count printed below can be larger than len(y_eval[0]).
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")

3809 evaluation points created.


In [None]:

# Span-prediction head: a bias-free Dense(1) scores every token position once as a possible
# answer start and once as a possible answer end; Flatten turns (batch, seq, 1) into (batch, seq).
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)
# Softmax over the sequence axis yields a distribution over token positions.
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)
model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[start_probs, end_probs])
# The outputs are already probabilities, hence from_logits=False; targets are integer token indices.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` replaces the deprecated `lr` keyword (which triggered a warning on every run).
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
checkpoint_path = "/content/drive/MyDrive/PolicyQA/model_checkpoint/training_cp-{epoch:04d}.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Make sure the target directory exists before anything is written into it.
os.makedirs(checkpoint_dir, exist_ok=True)

# Callback that saves the full model (save_weights_only=False) at the end of every epoch.
# `save_freq='epoch'` is the supported spelling of the deprecated `period=1`.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
)

# Save the untrained weights as "epoch 0" using the `checkpoint_path` format.
# Note this file holds weights only, unlike the full-model files the callback writes.
model.save_weights(checkpoint_path.format(epoch=0))






In [None]:
# Resume from the full-model checkpoint file numbered 0002; this rebinds `model`, replacing the
# freshly built one. custom_objects tells Keras how to deserialize the TF-Hub layer.
model = keras.models.load_model('/content/drive/MyDrive/PolicyQA/model_checkpoint/training_cp-0002.h5',custom_objects={'KerasLayer':hub.KerasLayer})

In [None]:
# Train for one epoch with batch size 4; at the end of the epoch ValidationCallback prints
# exact-match / F1 on the dev set and cp_callback writes a checkpoint.
# NOTE(review): the stored output of this cell is a TypeError whose cause is not visible here — confirm the cell runs.
model.fit(x_train, y_train, epochs=1, batch_size=4, callbacks=[ValidationCallback(x_eval, y_eval),cp_callback])




TypeError: ignored

In [None]:
# Persist the current model as a single HDF5 file on Drive (separate from the per-epoch checkpoints).
model.save('/content/drive/MyDrive/PolicyQA/models/model_checkpoint.h5')

In [None]:
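# Hand-written sample in SQuAD layout; its questions carry no "answers", so it is for inference only.
# To run on a test split stored on disk instead, load it like this:
# test_path='data/test.json'
# with open(test_path) as f: raw_test_data = json.load(f)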
raw_test_data={"data": [{"title": "", 
          "paragraphs": 
          [{ "context": "The Adobe Privacy Policy describes the privacy practices of Adobe apps and websites. If you are a resident of North America, your relationship is with Adobe Inc and the laws of California and the United States apply. If you reside outside of North America, your relationship is with Adobe Systems Software Ireland Limited, which is the controller with regard to your personal information collected by Adobe and the laws of Ireland apply. Please note that in order to use our apps and websites, you authorise Adobe to transfer your personal information across national borders and to other countries where Adobe and its partners operate, including the United States. The privacy protections and rights of authorities to access your information in these countries may not be equivalent to those in your country. We will only transfer your personal information to these countries where permitted to do so by law and we will take steps intended to ensure that your personal information continues to receive appropriate protections. If the content or information that you store on Adobe apps or websites contains personal information of other individuals, you must be legally permitted to share the personal information with Adobe. We will obtain your permission before sending you news and promotional material about Adobe, accessing information stored on your device relating to your use and engagement with, websites and apps and crash reports, and analysing your content. You can withdraw your consent to such activities at any time. This policy explains when we process personal information for our legitimate interests. You can ask us to stop processing this information. We use your personal information to enable you to register with Adobe and to provide you with our websites and apps and other products or services that you request. We provide interactive features that engage with social media sites, such as Facebook. 
If you use these features, these sites will send us personal information about you. We use cookies and other technologies to track the use of our websites and apps.. ", 
              "qas": 
            [{"question": "What does this policy describe?", 
              "id": "43d0tj7wcdmhwadk"},                          
             {"question": "What if I am a resident outside of America?", 
              "id": "knyp7n1i9r35ci82"},           
             {"question": "Will they expose the data collected from me in social networking sites?", 
              "id": "6isrs6pl65f7ueuf"},              
              {"question": "Are my websites and apps tracked by cookies?", 
               "id": "xslxbpslfpt535le"}, 
             {"question": "Do you follow confidentiality obligations?", 
              "id": "qnabo06neuot52m1"
             }
            ]}]}]}

In [None]:
# Build model inputs for the hand-written questions and print the predicted answer span for each.
test_samples = create_squad_examples(raw_test_data)
x_test, _ = create_inputs_targets(test_samples)
# create_inputs_targets leaves out skipped samples, so predictions line up with this filtered
# list rather than with test_samples itself.
test_samples_no_skip = [sample for sample in test_samples if not sample.skip]
pred_start, pred_end = model.predict(x_test)
for test_sample, start, end in zip(test_samples_no_skip, pred_start, pred_end):
    offsets = test_sample.context_token_to_char
    start = np.argmax(start)
    end = np.argmax(end)
    print("Q: " + test_sample.question)
    if start >= len(offsets):
        # Predicted start lies beyond the context tokens; report it instead of dropping the question.
        print("A: <no answer span found in context>")
        continue
    # Offsets index the whitespace-normalised context (see Sample.preprocess), so slice that form.
    context = " ".join(str(test_sample.context).split())
    pred_char_start = offsets[start][0]
    if end < len(offsets):
        pred_ans = context[pred_char_start:offsets[end][1]]
    else:
        pred_ans = context[pred_char_start:]
    print("A: " + pred_ans)

Q: What does this policy describe?
A: The Adobe Privacy Policy describes the privacy practices of Adobe apps and websites. If you are a resident of North America, your relationship is with Adobe Inc and the laws of California and the United States apply.
Q: What if I am a resident outside of America?
A: If you are a resident of North America, your relationship is with Adobe Inc and the laws of California and the United States apply. If you reside outside of North America
Q: Will they expose the data collected from me in social networking sites?
A: We will only transfer your personal information to these countries where permitted to do so by law and we will take steps intended to ensure that your personal information continues to receive appropriate protections.
Q: Are my websites and apps tracked by cookies?
A: cookies and other technologies
Q: Do you follow confidentiality obligations?
A: you authorise Adobe to transfer your personal information across national borders and to other co

In [None]:
raw_test_data={"data": [{"title": "", 
          "paragraphs": 
          [{ "context": "Please take a moment to review some changes to our Terms and Data Policy . Your Instagram experience isnt changing, and you still own your photos and videos. We are giving you better ways to access your data and understand how its used. By continuing to use Instagram on or after July 14, 2018, you're agreeing to these updates. Instagram has been a part of Facebook since 2012, and we're making some corporate changes. Going forward, our Terms will reflect that Facebook Inc. is responsible for Instagram. The Instagram app and the way we process data are not changing. Our Terms are now more clear about the service we provide, and what we expect from every member of our community to keep Instagram a safe place for everyone. Here are some updates we want to make sure you know about: We updated our intellectual property licenses, but your rights aren't changing. You still own your photos and videos. We updated how we use information to show activity on Instagram, so people can see when you've interacted with an ad the same way we do on a regular post. We also have a new Data Policy that explains how data is collected, shared and used in the Facebook Products, including Instagram. The policy addresses newer features like stories, direct messaging, activity status and the creative tools in our cameras. We wanted to make sure you knew about this new information in the policy. We receive different kinds of information from your device, like how you tap and scroll, which can help distinguish humans from bots and detect fraud. We can use and share information for research, especially in ways that help us keep our community safe on Instagram, like to understand and prevent bullying and harassment. The policy has more information about what we collect from your activity and our partners, how we connect information across the Facebook Companies and how we personalize your experience, including ads. We provide ads without telling advertisers who you are. 
The policy has more information about what we do share with advertisers and partners. We never sell your data. Because the policy also covers Facebook, it includes information about facial recognition. We dont use facial recognition technology on Instagram. If we introduce it, we will let you know and give you a choice.", 
              "qas": 
            [{"question": "What would my cookies be used for?", 
              "id": "43d0tj7wcdmhwadk"},              
             {"question": "Why would Instagram store my information?",  
              "id": "hjwapte7oki8t3l5"},              
             {"question": "Why would Instagram keep records about my browsing activity?", 
              "id": "knyp7n1i9r35ci82"},           
             {"question": "Why would Instagram share my information with external organisations?", 
              "id": "6isrs6pl65f7ueuf"},              
              {"question": "Why does Instagram collect my user name, language or my region?", 
               "id": "xslxbpslfpt535le"}, 
             {"question": "What kind of advertising would I receive by Instagram or its advertising partners?", 
              "id": "qnabo06neuot52m1"
             }
            ]}]}]}

In [None]:
def normalize_text(text):
    """Canonicalise an answer string for comparison.

    Steps, in order: lower-case, delete every ASCII punctuation character,
    replace the standalone words a / an / the with a space, then collapse all
    whitespace runs to single spaces and trim the ends.
    """
    lowered = text.lower()
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)
    return " ".join(without_articles.split())

def compute_f1(pred_tokens, truth_tokens):
    """Token-level F1 between a predicted and a reference token list.

    Overlap is counted as a multiset (each token as often as it appears in both
    lists). Counting distinct tokens only, as before, made a prediction that is
    identical to the reference score below 1 whenever it repeated a token.

    Returns 1 or 0 (int) when either list is empty, 0 when nothing overlaps,
    otherwise the harmonic mean of precision and recall as a float.
    """
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    # if there are no common tokens then f1 = 0
    if overlap == 0:
        return 0
    prec = overlap / len(pred_tokens)
    rec = overlap / len(truth_tokens)
    return 2 * (prec * rec) / (prec + rec)


In [None]:
def compute():
    """Print exact-match and mean token-level F1 of ``model`` on the evaluation set.

    Reads the notebook globals ``model``, ``x_eval``, ``y_eval`` and
    ``eval_squad_examples`` and uses ``normalize_text`` / ``compute_f1`` for scoring.
    For each sample the predicted span is the argmax start/end token mapped back to
    characters of the whitespace-normalised context; F1 is the best score over all
    reference answers.
    """
    pred_start, pred_end = model.predict(x_eval)
    count = 0
    f1_scores = []
    # x_eval only contains non-skipped samples, so predictions align with this filtered list.
    eval_examples_no_skip = [eg for eg in eval_squad_examples if not eg.skip]
    for squad_eg, start, end in zip(eval_examples_no_skip, pred_start, pred_end):
        offsets = squad_eg.context_token_to_char
        start = np.argmax(start)
        end = np.argmax(end)
        if start >= len(offsets):
            # Predicted start lies beyond the context tokens: nothing can be extracted.
            # Score it 0 so the F1 mean and the exact-match rate cover the same samples.
            f1_scores.append(0)
            continue
        # Offsets index the whitespace-normalised context (see Sample.preprocess),
        # so slice that form rather than the raw text.
        context = " ".join(str(squad_eg.context).split())
        pred_char_start = offsets[start][0]
        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = context[pred_char_start:pred_char_end]
        else:
            pred_ans = context[pred_char_start:]
        normalized_pred_ans = normalize_text(pred_ans)
        normalized_true_ans = [normalize_text(ans) for ans in (squad_eg.all_answers or [])]
        if normalized_pred_ans in normalized_true_ans:
            count += 1
        pred_tokens = normalized_pred_ans.split()
        # Best F1 over all reference answers; 0 when the sample has no references.
        f1_scores.append(
            max((compute_f1(pred_tokens, ans.split()) for ans in normalized_true_ans), default=0)
        )
    total = len(y_eval[0])
    acc = count / total if total else 0.0
    mean_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
    print(f"\nexact match score={acc:.2f}")
    print(f'\nf1 score = {mean_f1}')

In [None]:
# Report exact-match and F1 of the current `model` on the dev set.
compute()


exact match score=0.29

f1 score = 0.5518181564587945
