In [1]:
import os, sys

import numpy as np
import pandas as pd 
from datasets import load_dataset

import importlib
from tqdm import tqdm
from joblib import Parallel, delayed
from copy import copy

from transformers import (
    AutoConfig,
    AutoTokenizer,
    FlaxAutoModelForSequenceClassification,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    is_tensorboard_available,
)

from flax.training.common_utils import get_metrics, onehot, shard


data_root = "/kaggle/input/feedback-prize-effectiveness/"
train = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv")


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

def _prepare_training_data_helper(args, tokenizer, df, is_train):
    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if is_train:
            filename = os.path.join(args.input, "train", idx + ".txt")
        else:
            filename = os.path.join(args.input, "test", idx + ".txt")

        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            add_special_tokens=False,
            padding="max_length",
            truncation=True,
            max_length=1024 ##TODO: update max_length
        )
        input_ids = encoded_text["input_ids"]

        sample = {
            "discourse_id": row["discourse_id"],
            "input_ids": input_ids,
            # "discourse_text": discourse_text,
            # "essay_text": text,
            "attention_mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        try:
            label = row["discourse_effectiveness"]
            sample["labels"] = LABEL_MAPPING[label]
        except:
            sample["labels"] = 0
        

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs, is_train):
    training_samples = []

    df_splits = np.array_split(df, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(args, tokenizer, df, is_train) for df in df_splits
    )
    for result in results:
        training_samples.extend(result)

    return training_samples

In [3]:
sys.path.append("../configs")
cfg = copy(importlib.import_module("default_config").cfg)

# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
    cfg.model_name_or_path,
    num_labels=cfg.num_labels,
    #finetuning_task=data_args.task_name,
    #use_auth_token=True if cfg.use_auth_token else None,
)
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=not cfg.use_slow_tokenizer,
    #use_auth_token=True if cfg.use_auth_token else None,
)
model = FlaxAutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    config=config,
    #use_auth_token=True if cfg.use_auth_token else None,
)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing FlaxBigBirdForSequenceClassification: {('cls', 'seq_relationship', 'bias'), ('cls', 'predictions', 'transform', 'dense', 'bias'), ('cls', 'seq_relationship', 'kernel'), ('cls', 'predictions', 'bias'), ('cls', 'predictions', 'transform', 'dense', 'kernel'), ('cls', 'predictions', 'transform', 'LayerNorm', 'scale'), ('cls', 'predictions', 'transform', 'LayerNorm', 'bias')}
- This IS expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequen

In [4]:
val_data = prepare_training_data(train.iloc[range(0, 100, 5)], tokenizer, cfg, num_jobs=96, is_train=True)


100%|██████████| 1/1 [00:00<00:00, 87.53it/s]
100%|██████████| 1/1 [00:00<00:00, 99.58it/s]
100%|██████████| 1/1 [00:00<00:00, 68.88it/s]
100%|██████████| 1/1 [00:00<00:00, 64.07it/s]
100%|██████████| 1/1 [00:00<00:00, 100.14it/s]
100%|██████████| 1/1 [00:00<00:00, 99.32it/s]
100%|██████████| 1/1 [00:00<00:00, 95.86it/s]
100%|██████████| 1/1 [00:00<00:00, 105.81it/s]
100%|██████████| 1/1 [00:00<00:00, 101.37it/s]
100%|██████████| 1/1 [00:00<00:00, 75.69it/s]
100%|██████████| 1/1 [00:00<00:00, 83.38it/s]
100%|██████████| 1/1 [00:00<00:00, 94.17it/s]
100%|██████████| 1/1 [00:00<00:00, 87.24it/s]
100%|██████████| 1/1 [00:00<00:00, 102.73it/s]
100%|██████████| 1/1 [00:00<00:00, 84.25it/s]
100%|██████████| 1/1 [00:00<00:00, 88.70it/s]
100%|██████████| 1/1 [00:00<00:00, 101.63it/s]
100%|██████████| 1/1 [00:00<00:00, 100.86it/s]
100%|██████████| 1/1 [00:00<00:00, 100.51it/s]
100%|██████████| 1/1 [00:00<00:00, 96.55it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/

In [4]:
np.array(val_data[0]['input_ids']).shape

NameError: name 'val_data' is not defined

In [6]:
import json
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness
for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):

    train_temp = train.iloc[train_index]
    valid_temp = train.iloc[valid_index]

    train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=96, is_train=True)
    val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=96, is_train=True)

    df = pd.DataFrame.from_records(train_data)
    df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

    df = pd.DataFrame.from_records(val_data)
    df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)

    print("Fold:", fold)
    print("Train:", train_index)
    print("Valid:", valid_index)
    print("\n")
    break

100%|██████████| 307/307 [00:00<00:00, 579.38it/s]

100%|██████████| 307/307 [00:00<00:00, 466.08it/s]
100%|██████████| 307/307 [00:00<00:00, 575.03it/s]
 36%|███▌      | 109/307 [00:00<00:00, 545.23it/s]
100%|██████████| 307/307 [00:00<00:00, 514.95it/s]
100%|██████████| 307/307 [00:00<00:00, 558.82it/s]
100%|██████████| 307/307 [00:00<00:00, 481.35it/s]
100%|██████████| 307/307 [00:00<00:00, 521.07it/s]
100%|██████████| 307/307 [00:00<00:00, 522.42it/s]
100%|██████████| 307/307 [00:00<00:00, 526.79it/s]
100%|██████████| 307/307 [00:00<00:00, 557.86it/s]
100%|██████████| 307/307 [00:00<00:00, 524.05it/s]
100%|██████████| 307/307 [00:00<00:00, 479.98it/s]
100%|██████████| 307/307 [00:00<00:00, 519.72it/s]
100%|██████████| 307/307 [00:00<00:00, 467.63it/s]
100%|██████████| 307/307 [00:00<00:00, 493.20it/s]
100%|██████████| 307/307 [00:00<00:00, 526.79it/s]
100%|██████████| 307/307 [00:00<00:00, 468.95it/s]
100%|██████████| 307/307 [00:00<00:00, 539.26it/s]
100%|██████████| 307/307 [00:0

Fold: 0
Train: [    0     1     4 ... 36762 36763 36764]
Valid: [    2     3     7 ... 36733 36736 36746]




In [6]:
## generate test dataset
test = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/test.csv")
test_data = prepare_training_data(test, tokenizer, cfg, num_jobs=96, is_train=False)
df = pd.DataFrame.from_records(test_data)
df.to_json(f"/kaggle/working/folds/test.jsonl", orient="records", lines=True)

100%|██████████| 1/1 [00:00<00:00, 80.30it/s]
100%|██████████| 1/1 [00:00<00:00, 85.43it/s]
100%|██████████| 1/1 [00:00<00:00, 89.66it/s]
100%|██████████| 1/1 [00:00<00:00, 86.40it/s]
100%|██████████| 1/1 [00:00<00:00, 85.60it/s]
100%|██████████| 1/1 [00:00<00:00, 83.67it/s]
100%|██████████| 1/1 [00:00<00:00, 84.25it/s]
100%|██████████| 1/1 [00:00<00:00, 84.78it/s]
100%|██████████| 1/1 [00:00<00:00, 89.48it/s]
100%|██████████| 1/1 [00:00<00:00, 85.13it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:

In [44]:
# import json
# from sklearn.model_selection import StratifiedKFold

# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# ## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness
# for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):

#     train_temp = train.iloc[train_index]
#     valid_temp = train.iloc[valid_index]

#     train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=32, is_train=True)
#     val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=32, is_train=True)

#     df = pd.DataFrame.from_records(train_data)
#     df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

#     df = pd.DataFrame.from_records(val_data)
#     df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)

#     print("Fold:", fold)
#     print("Train:", train_index)
#     print("Valid:", valid_index)
#     print("\n")

100%|██████████| 920/920 [00:01<00:00, 527.15it/s]
100%|██████████| 920/920 [00:01<00:00, 535.80it/s]
100%|██████████| 920/920 [00:01<00:00, 541.70it/s]
100%|██████████| 920/920 [00:01<00:00, 554.34it/s]
100%|██████████| 919/919 [00:01<00:00, 517.09it/s]
100%|██████████| 919/919 [00:01<00:00, 504.91it/s]
100%|██████████| 919/919 [00:01<00:00, 501.29it/s]
100%|██████████| 919/919 [00:01<00:00, 505.08it/s]
100%|██████████| 919/919 [00:01<00:00, 495.78it/s]
100%|██████████| 919/919 [00:01<00:00, 527.82it/s]
100%|██████████| 919/919 [00:01<00:00, 576.84it/s]
100%|██████████| 919/919 [00:01<00:00, 552.58it/s]
100%|██████████| 919/919 [00:01<00:00, 623.94it/s]
100%|██████████| 919/919 [00:01<00:00, 539.16it/s]
100%|██████████| 919/919 [00:01<00:00, 582.45it/s]
100%|██████████| 919/919 [00:01<00:00, 558.21it/s]
100%|██████████| 919/919 [00:01<00:00, 638.66it/s]
100%|██████████| 919/919 [00:01<00:00, 562.49it/s]
100%|██████████| 919/919 [00:01<00:00, 582.94it/s]
100%|██████████| 919/919 [00:01

Fold: 0
Train: [    0     1     4 ... 36762 36763 36764]
Valid: [    2     3     7 ... 36733 36736 36746]




100%|██████████| 920/920 [00:01<00:00, 517.61it/s]
100%|██████████| 920/920 [00:01<00:00, 534.69it/s]
100%|██████████| 920/920 [00:01<00:00, 548.40it/s]
100%|██████████| 920/920 [00:01<00:00, 558.75it/s]
100%|██████████| 919/919 [00:01<00:00, 514.06it/s]
100%|██████████| 919/919 [00:01<00:00, 537.65it/s]
100%|██████████| 919/919 [00:01<00:00, 497.47it/s]
 10%|█         | 94/919 [00:00<00:01, 483.61it/s]]

In [12]:
## data collator with dynamic padding
# def train_data_collator(rng:)
import jax
import datasets
from typing import Any, Callable, Dict, Optional, Tuple

rng = jax.random.PRNGKey(1)#cfg.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())

Array = Any
Dataset = datasets.arrow_dataset.Dataset
PRNGKey = Any

In [13]:
def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
    """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
    steps_per_epoch = len(dataset) // batch_size
    perms = jax.random.permutation(rng, len(dataset))
    perms = perms[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    perms = perms.reshape((steps_per_epoch, batch_size))

    for perm in perms:
        batch = dataset[perm]
        discourse_id, input_ids, labels = dataset[perm]['discourse_id'], dataset[perm]['input_ids'], dataset[perm]['label']
        batch.pop("discourse_id", None)
        batch = {"input_ids": np.array(input_ids), "mask": [np.ones_like(x) for x in input_ids], "label": np.array(labels)}

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in batch["input_ids"]])
        # add padding
        if tokenizer.padding_side == "right":
            batch["input_ids"] = [s + (batch_max - len(s)) * [tokenizer.pad_token_id] for s in batch["input_ids"]]
        else:
            batch["input_ids"] = [(batch_max - len(s)) * [tokenizer.pad_token_id] + s for s in batch["input_ids"]]

        batch['input_ids'] = np.stack(batch['input_ids'])
        
        masks = np.zeros_like(batch['input_ids'])
        masks[batch['input_ids'] != tokenizer.pad_token_id] = 1
        batch['mask'] = masks

        batch = {k: np.array(v) for k, v in batch.items()}
        batch = shard(batch)
        yield batch

In [14]:
train_dataset = load_dataset("json", data_files="/kaggle/working/folds/valid_0.jsonl", split="train")
train_loader = train_data_collator(rng, train_dataset, cfg.per_device_train_batch_size)



In [18]:
for batch in train_loader:
    print(batch)
    batch
    break

In [20]:
## decode text using tokenizer
text = train_dataset[0]['input_ids']
text = tokenizer.decode(text)
print(text)

Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform<unk>I think that the face is a natural landform because there is no life on Mars that we have descovered yet. If life was on Mars, we would know by now. The reason why I think it is a natural landform because, nobody live on Mars in order to create the figure. It says in paragraph 9, "It's not easy to target Cydonia," in which he is saying that its not easy to know if it is a natural landform at this point. In all that they're saying, its probably a natural landform.<u