In [None]:
# Run in Colab to install the packages this notebook depends on.
# NOTE(review): the first command installs the latest transformers (the captured
# output shows 4.17.0 being downloaded) and the second command then re-pins it
# to 3.0.0 — confirm the other packages installed here still work with 3.0.0.
!pip install transformers sentencepiece torch datasets sentence-transformers
!pip install -U transformers==3.0.0

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.2 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.6 MB/s 
Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 49.6 MB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |███

In [None]:
# Download the NLTK "punkt" package (the captured output shows it being unzipped
# under tokenizers/). Presumably this is what sent_tokenize, used later in this
# notebook, relies on — verify.
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Clone the question_generation repository into the current working directory;
# its `pipelines` module is imported further down.
!git clone https://github.com/patil-suraj/question_generation.git

Cloning into 'question_generation'...
remote: Enumerating objects: 268, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 268 (delta 101), reused 95 (delta 95), pack-reused 156[K
Receiving objects: 100% (268/268), 289.86 KiB | 2.61 MiB/s, done.
Resolving deltas: 100% (147/147), done.


In [None]:
import torch
# Pick the compute device: CUDA when torch reports one, otherwise the CPU.
# Either way a short status message is printed so the choice is visible.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # Report the name of device index 0.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Change the working directory to the cloned repository. Presumably this is
# what lets `from pipelines import pipeline` below resolve — verify.
%cd question_generation

/content/question_generation


In [None]:
from pipelines import pipeline

In [None]:
# Build the "multitask-qa-qg" pipeline from the repo's `pipelines` module using
# the valhalla/t5-base-qa-qg-hl checkpoint. The captured output shows several
# files being downloaded when this cell was run.
nlp = pipeline("multitask-qa-qg", model="valhalla/t5-base-qa-qg-hl")

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Mount Google Drive at /content/drive so the data files read and written below
# are reachable. force_remount=True asks Colab to remount even when a mount
# already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
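# Disabled alternative loader, kept for reference: reads the contexts from
# slp3ed.csv instead of clean-contexts.txt. Any output shown under this cell
# comes from an earlier run, when the code was still enabled.
# import pandas as pd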

# # read training data
# df = pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/Data/slp3ed.csv")
# contexts = df["text"].tolist()

# print(contexts[0])


The dialogue above is from ELIZA, an early natural language processing system ELIZA that could carry on a limited conversation with a user by imitating the responses of a Rogerian psychotherapist (Weizenbaum, 1966). ELIZA is a surprisingly simple program that uses pattern matching to recognize phrases like \"I need X\" and translate them into suitable outputs like \"What would it mean to you if you got X?\". This simple technique succeeds in this domain because ELIZA doesn't actually need to know anything to mimic a Rogerian psychotherapist. As Weizenbaum notes, this is one of the few dialogue genres where listeners can act as if they know nothing of the world. Eliza's mimicry of human conversation was remarkably successful: many people who interacted with ELIZA came to believe that it really understood them and their problems, many continued to believe in ELIZA's abilities even after the program's operation was explained to them (Weizenbaum, 1976), and even today such chatbots are a f

In [None]:
# Load the cleaned contexts, one per line. Each entry keeps its trailing
# newline, exactly as the file iterator yields it.
# The encoding is given explicitly because the corpus contains non-ASCII
# characters (curly quotes, bullets); relying on the platform default would
# make the read locale-dependent.
with open('/content/drive/MyDrive/Colab Notebooks/Data/clean-contexts.txt', 'r', encoding='utf-8') as f:
    contexts = f.readlines()

print(len(contexts))

1201


In [None]:
# Smoke test: run the pipeline on one hand-written passage. Regex snippets in
# the passage are already wrapped as "[ /.../ ]", the same form that
# prepare_input_for_QAgenerator produces.
text = "The regular expression [ /[1234567890]/ ] specifies any single digit. While such classes of characters as digits or letters are important building blocks in expressions, they can get awkward (e.g., it’s inconvenient to specify [ /[ABCDEFGHIJKLMNOPQRSTUVWXYZ]/ ] to mean “any capital letter”). In cases where there is a well-defined sequence associated with a set of characters, the brackets can be used with the dash (-) to specify any one character in a range. The pattern [ /[2-5]/ ] specifies any one of the characters 2, 3, 4, or 5. The pattern [ /[b-g]/ ] specifies one of the characters b, c, d, e, f, or g. "
# Last expression of the cell, so the list of question/answer dicts is displayed.
nlp(text)

  beam_id = beam_token_id // vocab_size


[{'answer': 'any single digit',
  'question': 'What does the regular expression specify?'},
 {'answer': 'any capital letter',
  'question': 'What does the regular expression mean?'},
 {'answer': 'brackets',
  'question': 'What can be used with the dash (-) to specify any one character in a range?'},
 {'answer': 'The pattern',
  'question': 'What specifies any one of the characters 2, 3, 4, or 5?'},
 {'answer': 'b, c, d, e, f, or g',
  'question': 'What characters does the pattern specify?'}]

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import re

In [None]:
def prepare_input_for_QAgenerator(contexts):
    """Normalise raw contexts before they are fed to the question generator.

    Every context is split into sentences with ``sent_tokenize``. Each sentence
    is passed through an ordered list of regex rewrites, and the rewritten
    sentences are joined back together with single spaces.

    Rewrites, in order:
      * ``fig.`` / ``Fig.`` directly followed by a word character or the end of
        the sentence becomes ``figure`` / ``Figure``;
      * decimal numbers not preceded by ``$`` are wrapped as ``[ 1.2 ]``;
      * a sentence consisting only of a digit and a dot is wrapped as ``[ 1. ]``;
      * ``s/a/b/`` substitutions and ``/a/`` patterns are wrapped as ``[ ... ]``;
      * the bullet character is wrapped as ``[ • ]``.

    Args:
        contexts: sized iterable of context strings.

    Returns:
        A list with one processed string per input context, in input order.

    Side effects:
        Prints the number of input contexts and of processed contexts.
    """
    replacements = [
        (re.compile(r"(?:^|\b)fig\.(?:\b|$)"), "figure"),
        (re.compile(r"(?:^|\b)Fig\.(?:\b|$)"), "Figure"),
        (re.compile(r"(?<!\$)\b(\d+\.\d+)\b"), r"[ \1 ]"),
        (re.compile(r"^(\d\.)$"), r"[ \1 ]"),
        # Substitutions (s/a/b/) and plain patterns (/a/) are handled in ONE
        # pass. As two sequential passes, the plain-pattern rule re-matched the
        # inside of an already wrapped substitution: "[ s[ /a/ ]b/ ]".
        # The substitution alternative comes first so it wins at the same start.
        (re.compile(r"(s/[^/]+/[^/]+/|/[^/]+/)"), r"[ \1 ]"),
        (re.compile(r"•"), r"[ • ]"),
    ]

    processed_contexts = []
    for context in contexts:
        processed_sentences = []
        for sent in sent_tokenize(context):
            # Order matters: each rewrite sees the output of the previous one.
            for pat, rep in replacements:
                sent = pat.sub(rep, sent)
            processed_sentences.append(sent)
        processed_contexts.append(" ".join(processed_sentences))

    # Sanity check: both counts should be identical.
    print(len(contexts))
    print(len(processed_contexts))

    return processed_contexts

In [None]:
import json

# Normalise every context, then ask the QA/QG pipeline for question/answer
# pairs. Contexts for which the pipeline raises ValueError are collected
# separately instead of aborting the whole run.
processed_contexts = prepare_input_for_QAgenerator(contexts)
train_data = []
failed_contexts = []

for context in processed_contexts:
    try:
        questions_and_answers = [
            # NOTE(review): str.find returns -1 when the generated answer does
            # not occur verbatim in the context — confirm downstream consumers
            # tolerate answer_start == -1.
            {"answer_start": context.find(q_and_a["answer"]), **q_and_a}
            for q_and_a in nlp(context)
        ]
    except ValueError:
        failed_contexts.append(context)
        continue
    # The record gets its own name so `context` always refers to the input string.
    train_data.append(
        {
            "context": context,
            "questions_and_answers": questions_and_answers,
        }
    )

# Persist both the generated records and the contexts that failed.
with open('/content/drive/MyDrive/Colab Notebooks/Data/train_data.json', 'w') as f:
    json.dump(train_data, f)

with open('/content/drive/MyDrive/Colab Notebooks/Data/failed_contexts.json', 'w') as f:
    json.dump(failed_contexts, f)

print(f"train_data: {len(train_data)}")
print(f"failed contexts: {len(failed_contexts)}")

1201
1201


  beam_id = beam_token_id // vocab_size


train_data: 922
failed contexts: 279
