# Install libraries

In [44]:
!pip install --quiet json_lines
!pip install --quiet transformers
!pip install --quiet lineflow

# Albert requires SentencePiece
!pip install --quiet SentencePiece

[K     |████████████████████████████████| 2.3MB 7.9MB/s 
[K     |████████████████████████████████| 901kB 37.4MB/s 
[K     |████████████████████████████████| 3.3MB 36.7MB/s 
[?25h

# Import libraries

In [47]:
from typing import Dict
import csv
import lineflow as lf
import json_lines
from functools import partial
import torch
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from transformers import AlbertForMultipleChoice, AlbertTokenizer, AdamW

# Download train, valid and test files

## Download `train.csv` and `valid.csv`

In [30]:
!ls
!wget https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/valid.csv
!wget https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/train.csv

sample_data  sample_prediction.csv  test.jsonl	valid.csv
--2021-06-01 21:18:24--  https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16660449 (16M) [text/plain]
Saving to: ‘train.csv’


2021-06-01 21:18:27 (40.4 MB/s) - ‘train.csv’ saved [16660449/16660449]



## Create `val_dataset` and `train_dataset`

In [34]:
val_datas = []
with open('valid.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)
    for sample in reader:
      _id = sample[0]
      _article = sample[1]
      _question = sample[2]
      _options = []
      _answer = sample[-1]
      _options.append(sample[3])
      _options.append(sample[4])
      _options.append(sample[5])
      _options.append(sample[6])

      data = {
              "id": _id,
              "article": _article,
              "options": _options,
              "question": _question,
              "answer": _answer
              }
      val_datas.append(data)
val_dataset = lf.Dataset(val_datas)

In [40]:
train_datas = []
with open('train.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)
    for sample in reader:
      _id = sample[0]
      _article = sample[1]
      _question = sample[2]
      _options = []
      _answer = sample[-1]
      _options.append(sample[3])
      _options.append(sample[4])
      _options.append(sample[5])
      _options.append(sample[6])

      data = {
              "id": _id,
              "article": _article,
              "options": _options,
              "question": _question,
              "answer": _answer
              }
      train_datas.append(data)
train_dataset = lf.Dataset(train_datas)

## Download `test.jsonl`

In [10]:
!wget https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/test.jsonl
!ls

--2021-06-01 20:50:31--  https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/test.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5610681 (5.4M) [text/plain]
Saving to: ‘test.jsonl’


2021-06-01 20:50:32 (39.3 MB/s) - ‘test.jsonl’ saved [5610681/5610681]

sample_data  test.jsonl  valid.csv


## Download labels file `sample_prediction` of `test.jsonl`

In [18]:
!wget https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/sample_prediction.csv

--2021-06-01 20:54:31--  https://raw.githubusercontent.com/wilburOne/cosmosqa/master/data/sample_prediction.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932788 (911K) [text/plain]
Saving to: ‘sample_prediction.csv’


2021-06-01 20:54:32 (14.9 MB/s) - ‘sample_prediction.csv’ saved [932788/932788]



## Generate `test_dataset`

In [36]:
test_datas = []
labels = []
with open('sample_prediction.csv', 'r') as f:
  test_samples_labels = csv.reader(f)
  next(test_samples_labels)
  for label in test_samples_labels:
    labels.append(label)

with open("test.jsonl") as f:
  index = 0
  for sample in json_lines.reader(f):
    if not sample['id'] == labels[index][0]:
      print("not equal")
    _id = sample['id']
    _article = sample['context']
    _question = sample['question']
    _options = []
    _answer = str(labels[index][1])
    _options.append(sample['answer0'])
    _options.append(sample['answer1'])
    _options.append(sample['answer2'])
    _options.append(sample['answer3'])

    test_data = {
            "id": _id,
            "article": _article,
            "options": _options,
            "question": _question,
            "answer": _answer
            }
    test_datas.append(test_data)
    index += 1
test_dataset = lf.Dataset(test_datas)

# Define constant variables

In [38]:
MAX_LEN = 256
NUM_LABELS = 4
label_map = {"A": 0, "B": 1, "C": 2, "D": 3}
BATCH_SIZE = 8

# Define function to generate `cache` files

In [51]:
def preprocess(tokenizer: AlbertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    labels = int(x["answer"])
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }


def get_dataloader(tokenizer):
    preprocessor = partial(preprocess, tokenizer)

    train_dataloader = DataLoader(
            train_dataset.map(preprocessor).save("train_cosmos.cache"),
            sampler=RandomSampler(train_dataset),
            batch_size=BATCH_SIZE
            )

    val_dataloader = DataLoader(
            val_dataset.map(preprocessor).save("val_cosmos.cache"),
            sampler=SequentialSampler(val_dataset),
            batch_size=BATCH_SIZE
            )

    test_dataloader = DataLoader(
            test_dataset.map(preprocessor).save("test_cosmos.cache"),
            sampler=SequentialSampler(test_dataset),
            batch_size=BATCH_SIZE
            )

    return train_dataloader, val_dataloader, test_dataloader

# Create `tokenizer`

In [49]:
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1312669.0, style=ProgressStyle(descript…




## Use function to get `train_dataloader`,  `val_dataloader` and `test_dataloader`

In [52]:
train_dataloader, val_dataloader, test_dataloader = get_dataloader(tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Saving data to train_cosmos.cache...
Saving data to val_cosmos.cache...
Saving data to test_cosmos.cache...


# Copy `cache` files into `Google Drive`

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [55]:
!cp train_cosmos.cache /content/drive/My\ Drive/Cosmos/AlbertCache
!cp val_cosmos.cache /content/drive/My\ Drive/Cosmos/AlbertCache
!cp test_cosmos.cache /content/drive/My\ Drive/Cosmos/AlbertCache

In [56]:
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

3158
374
871
