In [2]:
import os, sys

import numpy as np
import pandas as pd 
from datasets import load_dataset

import importlib
from tqdm import tqdm
from joblib import Parallel, delayed
from copy import copy

from IPython.display import clear_output

from transformers import (
    AutoConfig,
    AutoTokenizer,
    FlaxAutoModelForSequenceClassification,
    HfArgumentParser,
    PretrainedConfig,
    TrainingArguments,
    is_tensorboard_available,
)

from flax.training.common_utils import get_metrics, onehot, shard


data_root = "/kaggle/input/feedback-prize-effectiveness/"
train = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv")


In [3]:
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

def _prepare_training_data_helper(args, tokenizer, df, is_train):
    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if is_train:
            filename = os.path.join(args.input, "train", idx + ".txt")
        else:
            filename = os.path.join(args.input, "test", idx + ".txt")

        with open(filename, "r") as f:
            text = f.read()

        sep_token = tokenizer.sep_token

        # encoded_text = tokenizer.encode_plus(
        #     discourse_type + sep_token + discourse_text + sep_token + text,
        #     add_special_tokens=False,
        #     padding="max_length",
        #     truncation=True,
        #     max_length=512 ##TODO: update max_length
        # )

        # encoded_text = tokenizer.encode_plus(
        #     discourse_type + " " + discourse_text,
        #     text,
        #     add_special_tokens=False,
        #     padding="max_length",
        #     truncation=True,
        #     max_length=512 ##TODO: update max_length
        # )


        encoded_text = tokenizer.encode_plus(
            discourse_type.lower() + sep_token + discourse_text.lower(),
            text.lower(),
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=512 ##TODO: update max_length
        )
        input_ids = encoded_text["input_ids"]

        sample = {
            # "discourse_id": row["discourse_id"],
            "input_ids": input_ids,
            # "discourse_text": discourse_text,
            # "essay_text": text,
            "attention_mask": encoded_text["attention_mask"],
        }

        if "token_type_ids" in encoded_text:
            sample["token_type_ids"] = encoded_text["token_type_ids"]

        try:
            label = row["discourse_effectiveness"]
            sample["labels"] = LABEL_MAPPING[label]
        except:
            sample["labels"] = 0
        

        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs, is_train):
    training_samples = []

    df_splits = np.array_split(df, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(args, tokenizer, df, is_train) for df in df_splits
    )
    for result in results:
        training_samples.extend(result)

    return training_samples

In [4]:
sys.path.append("../configs")
cfg = copy(importlib.import_module("elu_config").cfg)

print(cfg.model_name_or_path)

roberta-base


In [7]:
# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
    cfg.model_name_or_path,
    num_labels=cfg.num_labels,
    #finetuning_task=data_args.task_name,
    #use_auth_token=True if cfg.use_auth_token else None,
)
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=not cfg.use_slow_tokenizer,
    #use_auth_token=True if cfg.use_auth_token else None,
)
# model = FlaxAutoModelForSequenceClassification.from_pretrained(
#     # cfg.model_name_or_path,
#     cfg.model_name_or_path,
#     config=config,
#     ignore_mismatched_sizes=True,
#     #use_auth_token=True if cfg.use_auth_token else None,
# )

cfg.model_name_or_path

'roberta-base'

In [22]:
import json
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

# kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
# ## stratified Kfold for train dataframe using discourse_type and discourse_effectiveness

# for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_type"], train["discourse_effectiveness"])):
# # for fold, (train_index, valid_index) in enumerate(kf.split(train, train["discourse_effectiveness"], train["essay_id"])):

#     train_temp = train.iloc[train_index]
#     valid_temp = train.iloc[valid_index]

#     train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=96, is_train=True)
#     val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=96, is_train=True)

#     df = pd.DataFrame.from_records(train_data)
#     df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

#     df = pd.DataFrame.from_records(val_data)
#     df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)
#     clear_output()
#     break
#     # print("Fold:", fold)
#     # print("Train:", train_index)
#     # print("Valid:", valid_index)
#     # print("\n")

gkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_index, valid_index) in enumerate(gkf.split(train, train["discourse_effectiveness"], train["essay_id"])):
    train_temp = train.iloc[train_index]
    valid_temp = train.iloc[valid_index]

    train_data = prepare_training_data(train_temp, tokenizer, cfg, num_jobs=96, is_train=True)
    val_data = prepare_training_data(valid_temp, tokenizer, cfg, num_jobs=96, is_train=True)

    df = pd.DataFrame.from_records(train_data)
    df.to_json(f"/kaggle/working/folds/train_{fold}.jsonl", orient="records", lines=True)

    df = pd.DataFrame.from_records(val_data)
    df.to_json(f"/kaggle/working/folds/valid_{fold}.jsonl", orient="records", lines=True)
    clear_output()

    # print("Fold:", fold)
    # print("Train:", train_index)
    # print("Valid:", valid_index)
    # print("\n")

In [23]:
np.intersect1d(train_temp.essay_id, valid_temp.essay_id)

array([], dtype=object)

In [6]:
# ## generate test dataset
# test = pd.read_csv("/kaggle/input/feedback-prize-effectiveness/test.csv")
# test_data = prepare_training_data(test, tokenizer, cfg, num_jobs=96, is_train=False)
# df = pd.DataFrame.from_records(test_data)
# df.to_json(f"/kaggle/working/folds/test.jsonl", orient="records", lines=True)

In [7]:
## data collator with dynamic padding
# def train_data_collator(rng:)
import jax
import datasets
from typing import Any, Callable, Dict, Optional, Tuple

rng = jax.random.PRNGKey(1)#cfg.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())

Array = Any
Dataset = datasets.arrow_dataset.Dataset
PRNGKey = Any



In [10]:
def eval_data_collator(dataset: Dataset, batch_size: int):
    """Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
    for i in range(len(dataset) // batch_size):
        batch = dataset[i * batch_size : (i + 1) * batch_size]
        batch = {k: np.array(v) for k, v in batch.items()}

        yield batch

rng = jax.random.PRNGKey(cfg.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())
rng, input_rng = jax.random.split(rng)
eval_dataset   = load_dataset("json", data_files=f"/kaggle/working/folds/valid_{cfg.fold}.jsonl", split="train")
eval_loader = eval_data_collator(eval_dataset, 4)

train_dataset = load_dataset("json", data_files=f"/kaggle/working/folds/train_{cfg.fold}.jsonl", split="train")
# train_loader = train_data_collator(train_dataset, 4)

## decode text using tokenizer
text = eval_dataset[0]['input_ids']
text = tokenizer.decode(text)
print(repr(text))




'<s> claim</s> i think that the face is a natural landform because there is no life on mars that we have descovered yet</s></s> hi, i\'m isaac, i\'m going to be writing about how this face on mars is a natural landform or if there is life on mars that made it. the story is about how nasa took a picture of mars and a face was seen on the planet. nasa doesn\'t know if the landform was created by life on mars, or if it is just a natural landform. on my perspective, i think that the face is a natural landform because i dont think that there is any life on mars. in these next few paragraphs, i\'ll be talking about how i think that is is a natural landform i think that the face is a natural landform because there is no life on mars that we have descovered yet. if life was on mars, we would know by now. the reason why i think it is a natural landform because, nobody live on mars in order to create the figure. it says in paragraph 9, "it\'s not easy to target cydonia," in which he is saying th

In [49]:
for batch in eval_loader:
    print(batch['input_ids'].shape)    
    break

(4,)


  batch = {k: np.array(v) for k, v in batch.items()}


In [12]:
len(np.nonzero(train_dataset[0]['input_ids'])[0])

512

In [14]:
train_dataset[0]['input_ids']

[22171,
 415,
 993,
 427,
 363,
 2087,
 419,
 358,
 3389,
 2057,
 788,
 881,
 713,
 419,
 746,
 1305,
 420,
 8807,
 427,
 457,
 524,
 849,
 32212,
 1966,
 321,
 16003,
 112,
 1413,
 1202,
 19169,
 112,
 1413,
 1202,
 1117,
 385,
 408,
 3698,
 647,
 804,
 529,
 2087,
 420,
 8807,
 419,
 358,
 3389,
 2057,
 788,
 494,
 712,
 713,
 419,
 1305,
 420,
 8807,
 427,
 1026,
 441,
 114,
 484,
 1722,
 419,
 647,
 804,
 8985,
 1819,
 358,
 4387,
 387,
 8807,
 391,
 358,
 2087,
 474,
 1876,
 420,
 363,
 5541,
 114,
 8985,
 1696,
 571,
 861,
 712,
 363,
 2057,
 788,
 474,
 2828,
 517,
 1305,
 420,
 8807,
 112,
 494,
 712,
 441,
 419,
 756,
 358,
 3389,
 2057,
 788,
 114,
 1651,
 717,
 6751,
 112,
 415,
 993,
 427,
 363,
 2087,
 419,
 358,
 3389,
 2057,
 788,
 881,
 415,
 17767,
 993,
 427,
 713,
 419,
 698,
 1305,
 420,
 8807,
 114,
 655,
 878,
 1407,
 1279,
 23650,
 112,
 415,
 1284,
 408,
 3476,
 647,
 804,
 415,
 993,
 427,
 419,
 419,
 358,
 3389,
 2057,
 788,
 100,
 141,
 993,
 427,
 363,
 208