In [1]:
import os, sys

import numpy as np
import pandas as pd 
from datasets import load_dataset

import importlib
from tqdm import tqdm
from joblib import Parallel, delayed
from copy import copy

from transformers import (
    AutoConfig,
    AutoTokenizer,
    FlaxAutoModelForSequenceClassification,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    is_tensorboard_available,
)

from flax.training.common_utils import get_metrics, onehot, shard


data_root = "/kaggle/input/feedback-prize-effectiveness/"
train = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

def _prepare_training_data_helper(args, tokenizer, df, is_train):
    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if is_train:
            filename = os.path.join(args.input, "train", idx + ".txt")
        else:
            filename = os.path.join(args.input, "test", idx + ".txt")

        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            add_special_tokens=False,
            padding="max_length",
            truncation=True,
            max_length=1024 ##TODO: update max_length
        )
        input_ids = encoded_text["input_ids"]

        sample = {
            # "discourse_id": row["discourse_id"],
            "input_ids": input_ids,
            # "discourse_text": discourse_text,
            # "essay_text": text,
            "attention_mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        try:
            label = row["discourse_effectiveness"]
            sample["labels"] = LABEL_MAPPING[label]
        except:
            sample["labels"] = 0
        

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs, is_train):
    training_samples = []

    df_splits = np.array_split(df, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(args, tokenizer, df, is_train) for df in df_splits
    )
    for result in results:
        training_samples.extend(result)

    return training_samples

In [3]:
sys.path.append("../configs")
cfg = copy(importlib.import_module("elu_config").cfg)

print(cfg.model_name_or_path)

google/bigbird-roberta-large


In [4]:


# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
    cfg.model_name_or_path,
    num_labels=cfg.num_labels,
    #finetuning_task=data_args.task_name,
    #use_auth_token=True if cfg.use_auth_token else None,
)
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=not cfg.use_slow_tokenizer,
    #use_auth_token=True if cfg.use_auth_token else None,
)
model = FlaxAutoModelForSequenceClassification.from_pretrained(
    # cfg.model_name_or_path,
    cfg.model_name_or_path,
    config=config,
    ignore_mismatched_sizes=True,
    #use_auth_token=True if cfg.use_auth_token else None,
)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing FlaxBigBirdForSequenceClassification: {('cls', 'predictions', 'transform', 'dense', 'bias'), ('cls', 'predictions', 'transform', 'LayerNorm', 'scale'), ('cls', 'seq_relationship', 'kernel'), ('cls', 'predictions', 'transform', 'LayerNorm', 'bias'), ('cls', 'predictions', 'bias'), ('cls', 'seq_relationship', 'bias'), ('cls', 'predictions', 'transform', 'dense', 'kernel')}
- This IS expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSeque

In [5]:
cfg.model_name_or_path

'google/bigbird-roberta-large'

In [6]:
import json
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness
for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):

    train_temp = train.iloc[train_index]
    valid_temp = train.iloc[valid_index]

    train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=96, is_train=True)
    val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=96, is_train=True)

    df = pd.DataFrame.from_records(train_data)
    df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

    df = pd.DataFrame.from_records(val_data)
    df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)

    print("Fold:", fold)
    print("Train:", train_index)
    print("Valid:", valid_index)
    print("\n")

100%|██████████| 307/307 [00:00<00:00, 509.80it/s]
100%|██████████| 307/307 [00:00<00:00, 584.60it/s]
100%|██████████| 307/307 [00:00<00:00, 590.48it/s]
100%|██████████| 307/307 [00:00<00:00, 464.20it/s]
100%|██████████| 307/307 [00:00<00:00, 441.26it/s]
100%|██████████| 307/307 [00:00<00:00, 498.70it/s]
100%|██████████| 307/307 [00:00<00:00, 556.65it/s]
100%|██████████| 307/307 [00:00<00:00, 513.49it/s]
100%|██████████| 307/307 [00:00<00:00, 518.06it/s]
 52%|█████▏    | 159/307 [00:00<00:00, 498.97it/s]
100%|██████████| 307/307 [00:00<00:00, 527.39it/s]
100%|██████████| 307/307 [00:00<00:00, 556.97it/s]
100%|██████████| 307/307 [00:00<00:00, 507.67it/s]
100%|██████████| 307/307 [00:00<00:00, 498.69it/s]
100%|██████████| 307/307 [00:00<00:00, 499.86it/s]
100%|██████████| 307/307 [00:00<00:00, 481.97it/s]
100%|██████████| 307/307 [00:00<00:00, 511.85it/s]
100%|██████████| 307/307 [00:00<00:00, 524.01it/s]
100%|██████████| 307/307 [00:00<00:00, 495.55it/s]
100%|██████████| 307/307 [00:00

Fold: 0
Train: [    0     1     4 ... 36762 36763 36764]
Valid: [    2     3     7 ... 36733 36736 36746]




100%|██████████| 307/307 [00:00<00:00, 479.12it/s]
100%|██████████| 307/307 [00:00<00:00, 583.94it/s]
100%|██████████| 307/307 [00:00<00:00, 477.10it/s]
100%|██████████| 307/307 [00:00<00:00, 616.63it/s]
100%|██████████| 307/307 [00:00<00:00, 453.90it/s]
100%|██████████| 307/307 [00:00<00:00, 540.93it/s]
100%|██████████| 307/307 [00:00<00:00, 580.97it/s]
100%|██████████| 307/307 [00:00<00:00, 485.55it/s]
100%|██████████| 307/307 [00:00<00:00, 531.10it/s]
100%|██████████| 307/307 [00:00<00:00, 509.53it/s]
100%|██████████| 307/307 [00:00<00:00, 535.10it/s]
100%|██████████| 307/307 [00:00<00:00, 556.86it/s]
100%|██████████| 307/307 [00:00<00:00, 503.76it/s]
100%|██████████| 307/307 [00:00<00:00, 488.74it/s]
100%|██████████| 307/307 [00:00<00:00, 489.32it/s]
100%|██████████| 307/307 [00:00<00:00, 472.37it/s]
100%|██████████| 307/307 [00:00<00:00, 523.07it/s]
100%|██████████| 307/307 [00:00<00:00, 485.37it/s]
100%|██████████| 307/307 [00:00<00:00, 541.40it/s]
100%|██████████| 307/307 [00:00

Fold: 1
Train: [    0     2     3 ... 36759 36763 36764]
Valid: [    1     4     8 ... 36760 36761 36762]




100%|██████████| 307/307 [00:00<00:00, 419.59it/s]
100%|██████████| 307/307 [00:00<00:00, 546.78it/s]
 93%|█████████▎| 287/307 [00:00<00:00, 492.78it/s]
100%|██████████| 307/307 [00:00<00:00, 445.45it/s]
 19%|█▊        | 57/307 [00:00<00:00, 564.76it/s]]
100%|██████████| 307/307 [00:00<00:00, 452.44it/s]
100%|██████████| 307/307 [00:00<00:00, 575.92it/s]
100%|██████████| 307/307 [00:00<00:00, 488.16it/s]
100%|██████████| 307/307 [00:00<00:00, 525.34it/s]
100%|██████████| 307/307 [00:00<00:00, 525.08it/s]
100%|██████████| 307/307 [00:00<00:00, 489.50it/s]
 14%|█▍        | 44/307 [00:00<00:00, 439.09it/s]]
100%|██████████| 307/307 [00:00<00:00, 514.28it/s]
100%|██████████| 307/307 [00:00<00:00, 500.10it/s]
100%|██████████| 307/307 [00:00<00:00, 496.33it/s]
100%|██████████| 307/307 [00:00<00:00, 449.63it/s]
100%|██████████| 307/307 [00:00<00:00, 529.84it/s]
100%|██████████| 307/307 [00:00<00:00, 478.34it/s]
100%|██████████| 307/307 [00:00<00:00, 481.55it/s]
100%|██████████| 307/307 [00:00

Fold: 2
Train: [    0     1     2 ... 36761 36762 36763]
Valid: [    5     6     9 ... 36755 36759 36764]




100%|██████████| 307/307 [00:00<00:00, 489.77it/s]
100%|██████████| 307/307 [00:00<00:00, 582.05it/s]
100%|██████████| 307/307 [00:00<00:00, 590.56it/s]
100%|██████████| 307/307 [00:00<00:00, 428.06it/s]
100%|██████████| 307/307 [00:00<00:00, 480.85it/s]
100%|██████████| 307/307 [00:00<00:00, 488.29it/s]
100%|██████████| 307/307 [00:00<00:00, 537.34it/s]
100%|██████████| 307/307 [00:00<00:00, 457.35it/s]
100%|██████████| 307/307 [00:00<00:00, 547.05it/s]
100%|██████████| 307/307 [00:00<00:00, 518.40it/s]
100%|██████████| 307/307 [00:00<00:00, 507.02it/s]
100%|██████████| 307/307 [00:00<00:00, 556.26it/s]
100%|██████████| 307/307 [00:00<00:00, 465.37it/s]
100%|██████████| 307/307 [00:00<00:00, 516.92it/s]
100%|██████████| 307/307 [00:00<00:00, 500.27it/s]
100%|██████████| 307/307 [00:00<00:00, 434.75it/s]
100%|██████████| 307/307 [00:00<00:00, 523.61it/s]
 17%|█▋        | 51/307 [00:00<00:00, 508.22it/s]]
100%|██████████| 307/307 [00:00<00:00, 493.68it/s]
100%|██████████| 307/307 [00:00

Fold: 3
Train: [    0     1     2 ... 36761 36762 36764]
Valid: [   10    22    23 ... 36745 36756 36763]




100%|██████████| 307/307 [00:00<00:00, 445.40it/s]
100%|██████████| 307/307 [00:00<00:00, 555.69it/s]
100%|██████████| 307/307 [00:00<00:00, 574.22it/s]
100%|██████████| 307/307 [00:00<00:00, 421.20it/s]
100%|██████████| 307/307 [00:00<00:00, 480.40it/s]
100%|██████████| 307/307 [00:00<00:00, 454.02it/s]
100%|██████████| 307/307 [00:00<00:00, 543.93it/s]
100%|██████████| 307/307 [00:00<00:00, 492.63it/s]
100%|██████████| 307/307 [00:00<00:00, 534.32it/s]
100%|██████████| 307/307 [00:00<00:00, 497.06it/s]
100%|██████████| 307/307 [00:00<00:00, 560.08it/s]
100%|██████████| 307/307 [00:00<00:00, 550.28it/s]
100%|██████████| 307/307 [00:00<00:00, 471.97it/s]
100%|██████████| 307/307 [00:00<00:00, 512.87it/s]
100%|██████████| 307/307 [00:00<00:00, 496.92it/s]
100%|██████████| 307/307 [00:00<00:00, 454.36it/s]
100%|██████████| 307/307 [00:00<00:00, 505.86it/s]
100%|██████████| 307/307 [00:00<00:00, 468.38it/s]
100%|██████████| 307/307 [00:00<00:00, 499.19it/s]
100%|██████████| 307/307 [00:00

Fold: 4
Train: [    1     2     3 ... 36762 36763 36764]
Valid: [    0    13    14 ... 36748 36753 36757]




In [7]:
## generate test dataset
test = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/test.csv")
test_data = prepare_training_data(test, tokenizer, cfg, num_jobs=96, is_train=False)
df = pd.DataFrame.from_records(test_data)
df.to_json(f"/kaggle/working/folds/test.jsonl", orient="records", lines=True)

100%|██████████| 1/1 [00:00<00:00, 91.62it/s]
100%|██████████| 1/1 [00:00<00:00, 107.22it/s]
100%|██████████| 1/1 [00:00<00:00, 106.50it/s]
100%|██████████| 1/1 [00:00<00:00, 110.04it/s]
100%|██████████| 1/1 [00:00<00:00, 109.92it/s]
100%|██████████| 1/1 [00:00<00:00, 101.18it/s]
100%|██████████| 1/1 [00:00<00:00, 100.80it/s]
100%|██████████| 1/1 [00:00<00:00, 111.45it/s]
100%|██████████| 1/1 [00:00<00:00, 106.06it/s]
100%|██████████| 1/1 [00:00<00:00, 109.51it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]https://symbolize.stripped_domain/r/?trace=7ff144ae69ab,7ff144a110bf,7fe47842b9bf,7fe4788159bf,7ff1449c92df,7fe204f5a9bf,7fe1fec1e9bf,7fe1fee309bf,7fe1ff0429bf,7fe1ff2549bf,7fe1ff4669bf,7fe1ff6789bf,7fe1ff88a9bf,7fe2031219bf,7fe2033339bf,7fe203545

KeyboardInterrupt: 

In [8]:
print("AFSAS")

AFSAS


In [11]:
## data collator with dynamic padding
# def train_data_collator(rng:)
import jax
import datasets
from typing import Any, Callable, Dict, Optional, Tuple

rng = jax.random.PRNGKey(1)#cfg.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())

Array = Any
Dataset = datasets.arrow_dataset.Dataset
PRNGKey = Any

In [18]:
def eval_data_collator(dataset: Dataset, batch_size: int):
    """Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
    for i in range(len(dataset) // batch_size):
        batch = dataset[i * batch_size : (i + 1) * batch_size]
        batch = {k: np.array(v) for k, v in batch.items()}

        yield batch

In [19]:
rng = jax.random.PRNGKey(cfg.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())
rng, input_rng = jax.random.split(rng)
eval_dataset   = load_dataset("json", data_files=f"/kaggle/working/folds/valid_{cfg.fold}.jsonl", split="train")
eval_loader = eval_data_collator(eval_dataset, 4)




In [20]:
for batch in eval_loader:
    print(batch['input_ids'].shape)    
    break

(4, 512)


In [22]:
labels = batch.pop("labels")
outs = model(**batch)

In [27]:
outs.logits, labels


(DeviceArray([[-0.5264392 ,  2.8834088 , -2.3014016 ],
              [ 0.8853584 ,  2.7341392 , -3.3088202 ],
              [ 0.59324175,  2.7294536 , -3.0007594 ],
              [-3.8339581 ,  0.4319707 ,  3.4270277 ]], dtype=float32),
 array([1, 1, 1, 2]))

In [13]:
## decode text using tokenizer
text = train_dataset[2]['input_ids']
text = tokenizer.decode(text)
print(text)

Counterclaim Everyone who thought it was made by alieans even though it wasn't, was not satisfied. I think they were not satisfied because they have thought since 1976 that it was really formed by alieans.  Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I'll be talking about how I think that is is a natural landform<unk>I think that the face is a natural landform because there is no life on Mars that we have descovered yet. If life was on Mars, we would know by now. The reason why I think it is a natural landform because, nobody live on Mars in order to create the figure. It sa

In [12]:
len(np.nonzero(train_dataset[0]['input_ids'])[0])

512

In [14]:
train_dataset[0]['input_ids']

[22171,
 415,
 993,
 427,
 363,
 2087,
 419,
 358,
 3389,
 2057,
 788,
 881,
 713,
 419,
 746,
 1305,
 420,
 8807,
 427,
 457,
 524,
 849,
 32212,
 1966,
 321,
 16003,
 112,
 1413,
 1202,
 19169,
 112,
 1413,
 1202,
 1117,
 385,
 408,
 3698,
 647,
 804,
 529,
 2087,
 420,
 8807,
 419,
 358,
 3389,
 2057,
 788,
 494,
 712,
 713,
 419,
 1305,
 420,
 8807,
 427,
 1026,
 441,
 114,
 484,
 1722,
 419,
 647,
 804,
 8985,
 1819,
 358,
 4387,
 387,
 8807,
 391,
 358,
 2087,
 474,
 1876,
 420,
 363,
 5541,
 114,
 8985,
 1696,
 571,
 861,
 712,
 363,
 2057,
 788,
 474,
 2828,
 517,
 1305,
 420,
 8807,
 112,
 494,
 712,
 441,
 419,
 756,
 358,
 3389,
 2057,
 788,
 114,
 1651,
 717,
 6751,
 112,
 415,
 993,
 427,
 363,
 2087,
 419,
 358,
 3389,
 2057,
 788,
 881,
 415,
 17767,
 993,
 427,
 713,
 419,
 698,
 1305,
 420,
 8807,
 114,
 655,
 878,
 1407,
 1279,
 23650,
 112,
 415,
 1284,
 408,
 3476,
 647,
 804,
 415,
 993,
 427,
 419,
 419,
 358,
 3389,
 2057,
 788,
 100,
 141,
 993,
 427,
 363,
 208