In [1]:
import os, sys

import numpy as np
import pandas as pd 
from datasets import load_dataset

import importlib
from tqdm import tqdm
from joblib import Parallel, delayed
from copy import copy

from transformers import (
    AutoConfig,
    AutoTokenizer,
    FlaxAutoModelForSequenceClassification,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    is_tensorboard_available,
)

data_root = "/kaggle/input/feedback-prize-effectiveness/"
train = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

def _prepare_training_data_helper(args, tokenizer, df, is_train):
    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if is_train:
            filename = os.path.join(args.input, "train", idx + ".txt")
        else:
            filename = os.path.join(args.input, "test", idx + ".txt")

        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            add_special_tokens=False,
        )
        input_ids = encoded_text["input_ids"]

        sample = {
            "discourse_id": row["discourse_id"],
            "input_ids": input_ids,
            # "discourse_text": discourse_text,
            # "essay_text": text,
            # "mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        label = row["discourse_effectiveness"]

        sample["label"] = LABEL_MAPPING[label]

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs, is_train):
    training_samples = []

    df_splits = np.array_split(df, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(args, tokenizer, df, is_train) for df in df_splits
    )
    for result in results:
        training_samples.extend(result)

    return training_samples

In [3]:
sys.path.append("../configs")
cfg = copy(importlib.import_module("default_config").cfg)

# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
    cfg.model_name_or_path,
    num_labels=cfg.num_labels,
    #finetuning_task=data_args.task_name,
    #use_auth_token=True if cfg.use_auth_token else None,
)
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=not cfg.use_slow_tokenizer,
    #use_auth_token=True if cfg.use_auth_token else None,
)
model = FlaxAutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    config=config,
    #use_auth_token=True if cfg.use_auth_token else None,
)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Downloading: 100%|██████████| 489M/489M [00:12<00:00, 41.1MB/s] 
Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing FlaxBigBirdForSequenceClassification: {('cls', 'predictions', 'transform', 'LayerNorm', 'scale'), ('cls', 'predictions', 'transform', 'dense', 'kernel'), ('cls', 'predictions', 'bias'), ('cls', 'seq_relationship', 'kernel'), ('cls', 'seq_relationship', 'bias'), ('cls', 'predictions', 'transform', 'dense', 'bias'), ('cls', 'predictions', 'transform', 'LayerNorm', 'bias')}
- This IS expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaxBigBirdForSequenceClassification from the checkpoint of a model that

In [4]:
import json
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness
for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):

    train_temp = train.iloc[train_index]
    valid_temp = train.iloc[valid_index]

    train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=32, is_train=True)
    val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=32, is_train=True)

    df = pd.DataFrame.from_records(train_data)
    df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

    df = pd.DataFrame.from_records(val_data)
    df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)

    print("Fold:", fold)
    print("Train:", train_index)
    print("Valid:", valid_index)
    print("\n")

100%|██████████| 920/920 [00:01<00:00, 545.72it/s]
100%|██████████| 920/920 [00:01<00:00, 527.94it/s]
100%|██████████| 920/920 [00:01<00:00, 541.26it/s]
100%|██████████| 920/920 [00:01<00:00, 546.11it/s]
100%|██████████| 919/919 [00:01<00:00, 528.25it/s]
100%|██████████| 919/919 [00:01<00:00, 516.10it/s]
100%|██████████| 919/919 [00:01<00:00, 531.19it/s]
100%|██████████| 919/919 [00:01<00:00, 519.64it/s]
100%|██████████| 919/919 [00:01<00:00, 586.67it/s]
100%|██████████| 919/919 [00:01<00:00, 501.12it/s]
100%|██████████| 919/919 [00:01<00:00, 570.37it/s]
100%|██████████| 919/919 [00:01<00:00, 554.05it/s]
100%|██████████| 919/919 [00:01<00:00, 632.82it/s]
100%|██████████| 919/919 [00:01<00:00, 540.17it/s]
100%|██████████| 919/919 [00:01<00:00, 621.42it/s]
100%|██████████| 919/919 [00:01<00:00, 587.85it/s]
100%|██████████| 919/919 [00:01<00:00, 564.17it/s]
100%|██████████| 919/919 [00:01<00:00, 626.40it/s]
100%|██████████| 919/919 [00:01<00:00, 629.24it/s]
100%|██████████| 919/919 [00:01

Fold: 0
Train: [    0     1     4 ... 36762 36763 36764]
Valid: [    2     3     7 ... 36733 36736 36746]




100%|██████████| 920/920 [00:01<00:00, 557.88it/s]
100%|██████████| 920/920 [00:01<00:00, 547.42it/s]
100%|██████████| 920/920 [00:01<00:00, 577.22it/s]
100%|██████████| 920/920 [00:01<00:00, 575.97it/s]
100%|██████████| 919/919 [00:01<00:00, 510.71it/s]
100%|██████████| 919/919 [00:01<00:00, 570.44it/s]
100%|██████████| 919/919 [00:01<00:00, 489.40it/s]
100%|██████████| 919/919 [00:01<00:00, 496.32it/s]
100%|██████████| 919/919 [00:01<00:00, 572.91it/s]
100%|██████████| 919/919 [00:01<00:00, 497.88it/s]
100%|██████████| 919/919 [00:01<00:00, 567.60it/s]
100%|██████████| 919/919 [00:01<00:00, 565.65it/s]
100%|██████████| 919/919 [00:01<00:00, 577.61it/s]
100%|██████████| 919/919 [00:01<00:00, 640.96it/s]
100%|██████████| 919/919 [00:01<00:00, 639.00it/s]
100%|██████████| 919/919 [00:01<00:00, 581.07it/s]
100%|██████████| 919/919 [00:01<00:00, 597.09it/s]
100%|██████████| 919/919 [00:01<00:00, 641.81it/s]
100%|██████████| 919/919 [00:01<00:00, 598.42it/s]
100%|██████████| 919/919 [00:01

Fold: 1
Train: [    0     2     3 ... 36759 36763 36764]
Valid: [    1     4     8 ... 36760 36761 36762]




100%|██████████| 920/920 [00:01<00:00, 548.64it/s]
100%|██████████| 920/920 [00:01<00:00, 552.04it/s]
100%|██████████| 920/920 [00:01<00:00, 577.56it/s]
100%|██████████| 920/920 [00:01<00:00, 560.74it/s]
100%|██████████| 919/919 [00:01<00:00, 546.81it/s]
100%|██████████| 919/919 [00:01<00:00, 566.81it/s]
100%|██████████| 919/919 [00:01<00:00, 520.31it/s]
100%|██████████| 919/919 [00:01<00:00, 532.12it/s]
100%|██████████| 919/919 [00:01<00:00, 531.64it/s]
100%|██████████| 919/919 [00:01<00:00, 542.95it/s]
100%|██████████| 919/919 [00:01<00:00, 570.06it/s]
100%|██████████| 919/919 [00:01<00:00, 577.56it/s]
100%|██████████| 919/919 [00:01<00:00, 579.58it/s]
100%|██████████| 919/919 [00:01<00:00, 616.92it/s]
100%|██████████| 919/919 [00:01<00:00, 635.96it/s]
100%|██████████| 919/919 [00:01<00:00, 552.20it/s]
100%|██████████| 919/919 [00:01<00:00, 576.22it/s]
100%|██████████| 919/919 [00:01<00:00, 619.82it/s]
100%|██████████| 919/919 [00:01<00:00, 558.49it/s]
100%|██████████| 919/919 [00:01

Fold: 2
Train: [    0     1     2 ... 36761 36762 36763]
Valid: [    5     6     9 ... 36755 36759 36764]




100%|██████████| 920/920 [00:01<00:00, 557.43it/s]
100%|██████████| 920/920 [00:01<00:00, 548.37it/s]
100%|██████████| 920/920 [00:01<00:00, 585.19it/s]
100%|██████████| 920/920 [00:01<00:00, 540.60it/s]
100%|██████████| 919/919 [00:01<00:00, 531.22it/s]
100%|██████████| 919/919 [00:01<00:00, 530.64it/s]
100%|██████████| 919/919 [00:01<00:00, 550.80it/s]
100%|██████████| 919/919 [00:01<00:00, 537.99it/s]
100%|██████████| 919/919 [00:01<00:00, 522.98it/s]
100%|██████████| 919/919 [00:01<00:00, 533.57it/s]
100%|██████████| 919/919 [00:01<00:00, 563.44it/s]
100%|██████████| 919/919 [00:01<00:00, 542.01it/s]
100%|██████████| 919/919 [00:01<00:00, 540.43it/s]
100%|██████████| 919/919 [00:01<00:00, 569.78it/s]
100%|██████████| 919/919 [00:01<00:00, 624.61it/s]
100%|██████████| 919/919 [00:01<00:00, 578.51it/s]
100%|██████████| 919/919 [00:01<00:00, 556.63it/s]
100%|██████████| 919/919 [00:01<00:00, 580.90it/s]
100%|██████████| 919/919 [00:01<00:00, 562.24it/s]
100%|██████████| 919/919 [00:01

Fold: 3
Train: [    0     1     2 ... 36761 36762 36764]
Valid: [   10    22    23 ... 36745 36756 36763]




100%|██████████| 920/920 [00:01<00:00, 545.94it/s]
100%|██████████| 920/920 [00:01<00:00, 552.33it/s]
100%|██████████| 920/920 [00:01<00:00, 557.72it/s]
100%|██████████| 920/920 [00:01<00:00, 557.44it/s]
100%|██████████| 919/919 [00:01<00:00, 536.23it/s]
100%|██████████| 919/919 [00:01<00:00, 526.93it/s]
100%|██████████| 919/919 [00:01<00:00, 571.83it/s]
100%|██████████| 919/919 [00:01<00:00, 489.45it/s]
100%|██████████| 919/919 [00:01<00:00, 570.41it/s]
100%|██████████| 919/919 [00:01<00:00, 490.39it/s]
100%|██████████| 919/919 [00:01<00:00, 551.90it/s]
100%|██████████| 919/919 [00:01<00:00, 539.16it/s]
100%|██████████| 919/919 [00:01<00:00, 668.15it/s]
100%|██████████| 919/919 [00:01<00:00, 554.16it/s]
100%|██████████| 919/919 [00:01<00:00, 614.68it/s]
100%|██████████| 919/919 [00:01<00:00, 551.34it/s]
100%|██████████| 919/919 [00:01<00:00, 582.58it/s]
100%|██████████| 919/919 [00:01<00:00, 631.35it/s]
100%|██████████| 919/919 [00:01<00:00, 577.79it/s]
100%|██████████| 919/919 [00:01

Fold: 4
Train: [    1     2     3 ... 36762 36763 36764]
Valid: [    0    13    14 ... 36748 36753 36757]




In [5]:
import json
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness
for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):

    train_temp = train.iloc[train_index]
    valid_temp = train.iloc[valid_index]

    train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=32, is_train=True)
    val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=32, is_train=True)

    df = pd.DataFrame.from_records(train_data)
    df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

    df = pd.DataFrame.from_records(val_data)
    df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)

    print("Fold:", fold)
    print("Train:", train_index)
    print("Valid:", valid_index)
    print("\n")

100%|██████████| 920/920 [00:01<00:00, 540.72it/s]
100%|██████████| 920/920 [00:01<00:00, 558.63it/s]
100%|██████████| 920/920 [00:01<00:00, 550.29it/s]
100%|██████████| 920/920 [00:01<00:00, 576.67it/s]
100%|██████████| 919/919 [00:01<00:00, 535.77it/s]
100%|██████████| 919/919 [00:01<00:00, 547.30it/s]
100%|██████████| 919/919 [00:01<00:00, 502.17it/s]
100%|██████████| 919/919 [00:01<00:00, 526.85it/s]
100%|██████████| 919/919 [00:01<00:00, 508.83it/s]
100%|██████████| 919/919 [00:01<00:00, 546.84it/s]
100%|██████████| 919/919 [00:01<00:00, 530.34it/s]
100%|██████████| 919/919 [00:01<00:00, 545.93it/s]
100%|██████████| 919/919 [00:01<00:00, 558.02it/s]
100%|██████████| 919/919 [00:01<00:00, 610.11it/s]
100%|██████████| 919/919 [00:01<00:00, 589.85it/s]
100%|██████████| 919/919 [00:01<00:00, 564.53it/s]
100%|██████████| 919/919 [00:01<00:00, 564.54it/s]
100%|██████████| 919/919 [00:01<00:00, 611.67it/s]
100%|██████████| 919/919 [00:01<00:00, 581.48it/s]
100%|██████████| 919/919 [00:01

Fold: 0
Train: [    0     1     4 ... 36762 36763 36764]
Valid: [    2     3     7 ... 36733 36736 36746]




100%|██████████| 920/920 [00:01<00:00, 527.65it/s]
100%|██████████| 920/920 [00:01<00:00, 555.28it/s]
100%|██████████| 920/920 [00:01<00:00, 560.19it/s]
100%|██████████| 920/920 [00:01<00:00, 573.02it/s]
100%|██████████| 919/919 [00:01<00:00, 514.23it/s]
100%|██████████| 919/919 [00:01<00:00, 559.05it/s]
100%|██████████| 919/919 [00:01<00:00, 514.33it/s]
100%|██████████| 919/919 [00:01<00:00, 524.01it/s]
100%|██████████| 919/919 [00:01<00:00, 572.21it/s]
100%|██████████| 919/919 [00:01<00:00, 509.77it/s]
100%|██████████| 919/919 [00:01<00:00, 544.59it/s]
100%|██████████| 919/919 [00:01<00:00, 545.07it/s]
100%|██████████| 919/919 [00:01<00:00, 638.80it/s]
100%|██████████| 919/919 [00:01<00:00, 574.35it/s]
100%|██████████| 919/919 [00:01<00:00, 597.46it/s]
100%|██████████| 919/919 [00:01<00:00, 581.42it/s]
100%|██████████| 919/919 [00:01<00:00, 529.57it/s]
100%|██████████| 919/919 [00:01<00:00, 617.81it/s]
100%|██████████| 919/919 [00:01<00:00, 578.36it/s]
100%|██████████| 919/919 [00:01

Fold: 1
Train: [    0     2     3 ... 36759 36763 36764]
Valid: [    1     4     8 ... 36760 36761 36762]




100%|██████████| 920/920 [00:01<00:00, 547.10it/s]
100%|██████████| 920/920 [00:01<00:00, 575.11it/s]
100%|██████████| 920/920 [00:01<00:00, 558.95it/s]
100%|██████████| 920/920 [00:01<00:00, 561.87it/s]
100%|██████████| 919/919 [00:01<00:00, 523.22it/s]
100%|██████████| 919/919 [00:01<00:00, 542.01it/s]
100%|██████████| 919/919 [00:01<00:00, 542.45it/s]
100%|██████████| 919/919 [00:01<00:00, 516.00it/s]
100%|██████████| 919/919 [00:01<00:00, 558.41it/s]
100%|██████████| 919/919 [00:01<00:00, 503.26it/s]
100%|██████████| 919/919 [00:01<00:00, 576.67it/s]
100%|██████████| 919/919 [00:01<00:00, 551.15it/s]
100%|██████████| 919/919 [00:01<00:00, 625.98it/s]
100%|██████████| 919/919 [00:01<00:00, 547.96it/s]
100%|██████████| 919/919 [00:01<00:00, 581.49it/s]
100%|██████████| 919/919 [00:01<00:00, 566.48it/s]
100%|██████████| 919/919 [00:01<00:00, 627.48it/s]
100%|██████████| 919/919 [00:01<00:00, 570.99it/s]
100%|██████████| 919/919 [00:01<00:00, 587.02it/s]
100%|██████████| 919/919 [00:01

Fold: 2
Train: [    0     1     2 ... 36761 36762 36763]
Valid: [    5     6     9 ... 36755 36759 36764]




100%|██████████| 920/920 [00:01<00:00, 546.53it/s]
100%|██████████| 920/920 [00:01<00:00, 548.51it/s]
100%|██████████| 920/920 [00:01<00:00, 573.24it/s]
100%|██████████| 920/920 [00:01<00:00, 583.09it/s]
100%|██████████| 919/919 [00:01<00:00, 522.10it/s]
100%|██████████| 919/919 [00:01<00:00, 542.94it/s]
100%|██████████| 919/919 [00:01<00:00, 499.13it/s]
100%|██████████| 919/919 [00:01<00:00, 531.80it/s]
100%|██████████| 919/919 [00:01<00:00, 527.94it/s]
100%|██████████| 919/919 [00:01<00:00, 541.60it/s]
100%|██████████| 919/919 [00:01<00:00, 557.53it/s]
100%|██████████| 919/919 [00:01<00:00, 660.63it/s]
100%|██████████| 919/919 [00:01<00:00, 517.69it/s]
100%|██████████| 919/919 [00:01<00:00, 558.81it/s]
100%|██████████| 919/919 [00:01<00:00, 616.12it/s]
100%|██████████| 919/919 [00:01<00:00, 557.62it/s]
100%|██████████| 919/919 [00:01<00:00, 612.18it/s]
100%|██████████| 919/919 [00:01<00:00, 549.93it/s]
100%|██████████| 919/919 [00:01<00:00, 545.59it/s]
100%|██████████| 919/919 [00:01

Fold: 3
Train: [    0     1     2 ... 36761 36762 36764]
Valid: [   10    22    23 ... 36745 36756 36763]




100%|██████████| 920/920 [00:01<00:00, 536.79it/s]
100%|██████████| 920/920 [00:01<00:00, 567.00it/s]
100%|██████████| 920/920 [00:01<00:00, 572.34it/s]
100%|██████████| 920/920 [00:01<00:00, 576.12it/s]
100%|██████████| 919/919 [00:01<00:00, 543.57it/s]
100%|██████████| 919/919 [00:01<00:00, 529.08it/s]
100%|██████████| 919/919 [00:01<00:00, 541.64it/s]
100%|██████████| 919/919 [00:01<00:00, 527.57it/s]
100%|██████████| 919/919 [00:01<00:00, 566.09it/s]
100%|██████████| 919/919 [00:01<00:00, 519.39it/s]
100%|██████████| 919/919 [00:01<00:00, 566.01it/s]
100%|██████████| 919/919 [00:01<00:00, 561.29it/s]
100%|██████████| 919/919 [00:01<00:00, 646.92it/s]
100%|██████████| 919/919 [00:01<00:00, 565.52it/s]
100%|██████████| 919/919 [00:01<00:00, 576.67it/s]
100%|██████████| 919/919 [00:01<00:00, 561.54it/s]
100%|██████████| 919/919 [00:01<00:00, 515.06it/s]
100%|██████████| 919/919 [00:01<00:00, 603.25it/s]
100%|██████████| 919/919 [00:01<00:00, 560.38it/s]
100%|██████████| 919/919 [00:01

Fold: 4
Train: [    1     2     3 ... 36762 36763 36764]
Valid: [    0    13    14 ... 36748 36753 36757]




In [8]:
dataset = load_dataset("json", data_files="/kaggle/working/folds/train_0.jsonl", split="train")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-d6884a81e6d35181/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4665.52it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 805.67it/s]
                                

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-d6884a81e6d35181/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.




In [10]:
tokenizer.pad_token_id

0