In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle competitions download -c feedback-prize-effectiveness

!unzip feedback-prize-effectiveness.zip
!pip install transformers datasets sentencepiece

!kaggle kernels output tanlikesmath/here-have-some-folds -p ./


## Functions for reading text files and correcting encoding errors


Correction code from: https://www.kaggle.com/competitions/feedback-prize-2021/discussion/313330

In [1]:
import transformers

In [2]:
# Central configuration shared by the data-preparation, tokenization and
# training cells.
cfg = {
    "num_proc": 6,  # worker processes passed to `Dataset.map`
    # data
    "k_folds": 5,
    "max_length": 1700,  # truncation length used when tokenizing essays
    "padding": False,  # NOTE(review): not read by any visible cell
    "stride": 0,  # NOTE(review): not read by any visible cell
    "data_dir": ".",
    # Path of a previously saved tokenized dataset (".dataset" is appended if
    # missing); leave as None to tokenize from scratch.
    "load_from_disk": None, # if you already tokenized, you can load it through this
    # Passed to the data collator as `pad_to_multiple_of`. The training cell's
    # own comment recommends 512 for Longformer and 8 for most other models.
    "pad_multiple": 512,
    # model
    "model_name_or_path": "microsoft/deberta-large",
    "dropout": 0.1,  # NOTE(review): not applied to the model config in any visible cell
    # to put in TrainingArguments
    "trainingargs": {
        "output_dir": "output",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 2,
        "per_device_eval_batch_size": 2,
        "learning_rate": 7e-6,
        "weight_decay": 0,
        "num_train_epochs": 2,
        "warmup_ratio": 0,
        "optim": 'adamw_torch',
        "logging_steps": 50,
        "save_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "report_to": "none",
        "group_by_length": True,
        "save_total_limit": 1,
        # The best-score cell reads "best_metric" from trainer_state.json;
        # with these two settings that is the lowest evaluation loss.
        "metric_for_best_model": "loss",
        "greater_is_better": False,
        "seed": 43,  # also passed to `set_seed` before data preparation
        "fp16":True,
        
        # Mixed precision is already enabled through "fp16" above.
    }
}

In [3]:
import re
import pickle
import codecs
import warnings
import logging
from functools import partial
from pathlib import Path
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple

import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, set_seed

from datasets import Dataset, load_from_disk

# https://www.kaggle.com/competitions/feedback-prize-2021/discussion/313330
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler that encodes the offending slice as UTF-8.

    Args:
        error: the encode error raised by the codec; its ``object``, ``start``
            and ``end`` attributes identify the characters that failed.

    Returns:
        The UTF-8 bytes for the failed slice and the position (``error.end``)
        at which encoding should resume.
    """
    failed_slice = error.object[error.start : error.end]
    resume_at = error.end
    return failed_slice.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler that decodes the offending bytes as cp1252.

    Args:
        error: the decode error raised by the codec; its ``object``, ``start``
            and ``end`` attributes identify the bytes that failed.

    Returns:
        The cp1252 decoding of the failed slice and the position
        (``error.end``) at which decoding should resume.
    """
    failed_bytes = error.object[error.start : error.end]
    resume_at = error.end
    return failed_bytes.decode("cp1252"), resume_at


# Make both handlers available by name so they can be passed as the `errors=`
# argument of `str.encode` / `bytes.decode`.
for _handler_name, _handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(_handler_name, _handler)


def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 encoding damage, then apply `unidecode`.

    The text is round-tripped through bytes twice. Each step uses the custom
    error handlers registered with `codecs`, so a slice that fails in one
    encoding is retried in the other instead of raising.

    Args:
        text: the raw string to clean.

    Returns:
        The repaired string after being passed through `unidecode`.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


def read_text_files(example, data_dir):
    """Attach the cleaned essay text to a dataset row.

    Args:
        example: a row containing an ``"essay_id"`` entry.
        data_dir: path object; the essay is read from
            ``data_dir / "train" / "<essay_id>.txt"``.

    Returns:
        The same ``example`` with a ``"text"`` entry holding the file contents
        after `resolve_encodings_and_normalize`.
    """
    essay_path = data_dir / "train" / f"{example['essay_id']}.txt"

    with open(essay_path, "r") as essay_file:
        raw_text = essay_file.read()

    example["text"] = resolve_encodings_and_normalize(raw_text)
    return example

# Seed through transformers' `set_seed`, using the same seed that is later
# handed to TrainingArguments.
set_seed(cfg["trainingargs"]["seed"])

# change logging to not be bombarded by messages
# if you are debugging, the messages will likely be helpful
warnings.simplefilter('ignore')  # silence every warning category
logging.disable(logging.WARNING)  # drop log records at WARNING level and below

# Adding essay texts to dataset and adding special tokens to tokenizer

In [4]:
data_dir = Path(cfg["data_dir"])

if not cfg["load_from_disk"]:
    # Fresh run: build `train_df` with one row per discourse element plus the
    # full (cleaned) essay text in a `text` column.
    train_df = pd.read_csv(data_dir / "train.csv")

    # Read every essay file exactly once, in parallel.
    essay_ids = train_df.essay_id.unique()
    text_ds = Dataset.from_dict({"essay_id": essay_ids}).map(
        partial(read_text_files, data_dir=data_dir),
        num_proc=cfg["num_proc"],
        batched=False,
        desc="Loading text files",
    )
    text_df = text_ds.to_pandas()

    # The discourse snippets get the same cleaning as the essays so that they
    # can be located inside the essay text later.
    train_df["discourse_text"] = list(
        map(resolve_encodings_and_normalize, train_df["discourse_text"])
    )

    train_df = train_df.merge(text_df, on="essay_id", how="left")
else:
    # Cached run: reuse the tokenized dataset and the pickled `grouped` frame
    # written by the tokenization cell. Note that `train_df` is NOT defined
    # on this path.
    dataset_path = cfg["load_from_disk"]
    if not dataset_path.endswith(".dataset"):
        dataset_path += ".dataset"
    cfg["load_from_disk"] = dataset_path
    ds = load_from_disk(dataset_path)

    # The pickle sits next to the dataset: "<name>.dataset" -> "<name>_pkl".
    pkl_file = dataset_path[: -len(".dataset")] + "_pkl"
    with open(pkl_file, "rb") as fp:
        grouped = pickle.load(fp)

    print("Loading from saved files")

# The seven discourse types; each gets its own opening and closing marker token.
disc_types = [
    "Claim",
    "Concluding Statement",
    "Counterclaim",
    "Evidence",
    "Lead",
    "Position",
    "Rebuttal",
]
cls_tokens_map = {name: "[CLS_" + name.upper() + "]" for name in disc_types}
end_tokens_map = {name: "[END_" + name.upper() + "]" for name in disc_types}

# Effectiveness classes and their integer ids.
label2id = dict(Adequate=0, Effective=1, Ineffective=2)

tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])
special_tokens = [*cls_tokens_map.values(), *end_tokens_map.values()]
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

# `encode` wraps the marker in the tokenizer's own start/end tokens, so the
# marker's id is the second entry of the encoded sequence.
cls_id_map = {
    disc_type: tokenizer.encode(marker)[1]
    for disc_type, marker in cls_tokens_map.items()
}

          

Loading text files #1:   0%|          | 0/699 [00:00<?, ?ex/s]

Loading text files #0:   0%|          | 0/699 [00:00<?, ?ex/s]

  

Loading text files #2:   0%|          | 0/699 [00:00<?, ?ex/s]

Loading text files #4:   0%|          | 0/698 [00:00<?, ?ex/s]

Loading text files #5:   0%|          | 0/698 [00:00<?, ?ex/s]

Loading text files #3:   0%|          | 0/698 [00:00<?, ?ex/s]

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [5]:
# Display the discourse-type -> token-id mapping of the added [CLS_*] markers.
cls_id_map

{'Claim': 50265,
 'Concluding Statement': 50266,
 'Counterclaim': 50267,
 'Evidence': 50268,
 'Lead': 50269,
 'Position': 50270,
 'Rebuttal': 50271}

In [6]:
# Placeholder for the longest tokenized length; it is reset to 0 again in the
# cell that scans `ds`, so this assignment is redundant.
max_length = 0

In [7]:
def find_positions(example):
    """Locate every discourse text inside its essay.

    Args:
        example: one grouped essay row. ``example["text"][0]`` is the essay
            text and ``example["discourse_text"]`` is the list of discourse
            strings for that essay.

    Returns:
        A list with one entry per discourse text, in input order: either
        ``[start, end]`` character offsets into the essay, or ``[-1]`` when
        the stripped discourse text does not occur in the essay.
    """
    text = example["text"][0]

    # Everything before this offset is already covered by an earlier
    # discourse text.
    min_idx = 0

    # stores start and end indexes of discourse_texts
    idxs = []

    for dt in example["discourse_text"]:
        # calling strip is essential
        matches = list(re.finditer(re.escape(dt.strip()), text))

        # If no matches are found
        if not matches:
            idxs.append([-1])  # will filter out later
            continue

        # Take the first occurrence that starts at or after the end of the
        # previously located span. If there is none, fall back to the last
        # occurrence (for a single match that is the match itself).
        m = next(
            (candidate for candidate in matches if candidate.start() >= min_idx),
            matches[-1],
        )

        idxs.append([m.start(), m.end()])

        # Advance past the END of this span. Using the start would let a
        # repeated discourse text, or one that also occurs inside this span,
        # resolve to the same or an overlapping region.
        min_idx = m.end()

    return idxs

def tokenize(example):
    """Insert discourse marker tokens into an essay and tokenize it.

    Each located discourse text is wrapped as
    ``[CLS_<TYPE>] <discourse text> [END_<TYPE>]``; essay text lying between
    two discourse texts is kept as-is. Text after the last located discourse
    text is not included. Discourse texts that `find_positions` could not
    locate are skipped, together with their labels.

    Args:
        example: one grouped essay row with list-valued ``text``,
            ``discourse_text``, ``discourse_type`` and
            ``discourse_effectiveness`` entries.

    Returns:
        The tokenizer output with two extra entries: ``labels`` (the
        effectiveness id at every CLS marker token, -100 elsewhere) and
        ``len`` (the sum of the attention mask).
    """
    example["idxs"] = find_positions(example)

    text = example["text"][0]
    chunks = []
    labels = []
    cursor = 0  # end offset of the most recently emitted discourse text

    rows = zip(
        example["idxs"],
        example["discourse_type"],
        example["discourse_effectiveness"],
    )
    for span, disc_type, disc_effect in rows:
        # when the discourse_text wasn't found
        if span == [-1]:
            continue

        start, end = span

        # keep any essay text lying between the previous discourse text and
        # this one
        if start != cursor:
            chunks.append(text[cursor:start])

        chunks.extend(
            (
                cls_tokens_map[disc_type],
                text[start:end],
                end_tokens_map[disc_type],
            )
        )
        cursor = end

        labels.append(label2id[disc_effect])

    tokenized = tokenizer(
        " ".join(chunks),
        padding=False,
        truncation=True,
        max_length=cfg["max_length"],
        add_special_tokens=True,
    )

    # `labels` has one entry per discourse text, not one per token. Expand it
    # to token level: the label goes on each CLS marker token and every other
    # position gets -100 so that the loss function ignores it.
    marker_ids = set(cls_id_map.values())
    next_label = 0
    final_labels = []
    for token_id in tokenized["input_ids"]:
        if token_id in marker_ids:
            final_labels.append(labels[next_label])
            next_label += 1
        else:
            final_labels.append(-100)

    tokenized["labels"] = final_labels
    tokenized["len"] = sum(tokenized["attention_mask"])

    return tokenized

In [8]:
# I frequently restart my notebook, so to reduce time
# you can set cfg["load_from_disk"] to just load the tokenized dataset from disk.
# The loading itself happens in the earlier data-loading cell; the check here
# only skips tokenizing. It uses the same truthiness test as that cell, so an
# empty string behaves like None in both places instead of leaving `ds` and
# `grouped` undefined.
if not cfg["load_from_disk"]:

    # make lists of discourse_text, discourse_effectiveness
    # for each essay
    grouped = train_df.groupby(["essay_id"]).agg(list)

    ds = Dataset.from_pandas(grouped)

    ds = ds.map(
        tokenize,
        batched=False,
        num_proc=cfg["num_proc"],
        desc="Tokenizing",
    )

    # Written as "<output_dir>.dataset" and "<output_dir>_pkl", which is the
    # naming the loading branch expects.
    save_dir = f"{cfg['trainingargs']['output_dir']}"
    ds.save_to_disk(f"{save_dir}.dataset")
    with open(f"{save_dir}_pkl", "wb") as fp:
        pickle.dump(grouped, fp)
    print("Saving dataset to disk:", cfg['trainingargs']['output_dir'])
    


# basic kfold 
def get_folds(df, k_folds=5):
    """Return, for each fold, the row index values of the essays assigned to it.

    Fold assignments come from ``train_folds.csv`` (one ``kfold`` value per
    ``essay_id``) and are left-joined onto the module-level ``grouped`` frame.

    Args:
        df: ignored. The value passed in is never read; the name is reused
            internally for the fold table.
        k_folds: number of folds; fold ids ``0 .. k_folds - 1`` are looked up.

    Returns:
        A list of ``k_folds`` arrays holding the index values of the merged
        frame for each fold. The training loop passes them to ``ds.select``,
        so they are presumably row positions - TODO confirm.
    """
    fold_table = (
        pd.read_csv('train_folds.csv')
        .groupby(['essay_id', 'kfold'])
        .count()
        .reset_index()
    )

    merged = pd.merge(grouped, fold_table, on='essay_id', how='left')

    return [merged[merged.kfold == fold].index.values for fold in range(k_folds)]
    
# NOTE(review): `get_folds` never reads its first argument, so `ds["labels"]`
# has no effect on the result; only the fold count matters here.
fold_idxs = get_folds(ds["labels"], cfg["k_folds"])

       

Tokenizing #0:   0%|          | 0/699 [00:00<?, ?ex/s]

 

Tokenizing #1:   0%|          | 0/699 [00:00<?, ?ex/s]

 

Tokenizing #2:   0%|          | 0/699 [00:00<?, ?ex/s]

 

Tokenizing #3:   0%|          | 0/698 [00:00<?, ?ex/s]

 

Tokenizing #4:   0%|          | 0/698 [00:00<?, ?ex/s]

 

Tokenizing #5:   0%|          | 0/698 [00:00<?, ?ex/s]

Saving dataset to disk: output


In [9]:
# One line per fold: the shape of its index array, i.e. the number of essays.
for fold_rows in fold_idxs:
    print(fold_rows.shape)

(838,)
(841,)
(836,)
(840,)
(836,)


In [10]:
# Longest tokenized essay in the dataset (0 if the dataset is empty).
max_length = max(chain([0], (row['len'] for row in ds)))

print(max_length)

1695


# Check data to make sure all discourse texts are represented as CLS tokens and labels

One essay fails this check, but that looks like an error by whoever made the Feedback Prize dataset: the `discourse_text` ends with the word "florida", whereas the essay text ends in `LOCATION_NAME`. I didn't do anything to fix this, but with more sophisticated matching you should be able to catch instances like this. Hopefully the hidden test set doesn't have any, but it wouldn't be too hard to probe for this.

In [11]:
# Sanity check: every discourse text of an essay should show up once as a
# CLS marker token in `input_ids` and once as a real (non -100) label.
bad_matches = []
cls_ids = set(cls_id_map.values())
for id_, l, ids, dt in zip(ds["essay_id"], ds["labels"], ds["input_ids"], grouped.discourse_text):

    # count number of labels (ignoring -100)
    num_cls_label = sum(x != -100 for x in l)
    # count number of cls ids
    num_cls_id = sum(x in cls_ids for x in ids)
    # true number of discourse_texts
    num_dt = len(dt)

    if num_cls_label != num_dt or num_cls_id != num_dt:
        bad_matches.append((id_, l, ids, dt))

print("Num bad matches", len(bad_matches))

# Show the first offender, if there is one. Indexing `bad_matches[0]`
# unconditionally would raise IndexError when every essay matches.
if bad_matches:
    # `grouped` is available whether the data was built fresh or loaded from
    # the cache, whereas `train_df` only exists on a fresh run.
    bad_row = grouped.loc[bad_matches[0][0]]
    temp_txt = bad_row["text"][0]
    print(temp_txt)
    print("*"*100)
    print([x for x in bad_row["discourse_text"] if x.strip() not in temp_txt])

Num bad matches 1
If we limit our car usage throughout the year we can cut the rate of carbon dioxiod we produce by 1/2! That will save uor trees, animals, and even our own lifes in the future. Also the ablity to save as many fossil fuels we have remaining cause everyone knows we are starting to run completly out and can not creat any more. By doing this we save time and energy and creat a healthy envirnment.

If we slowly progress the fact that we are only using our automobiles 345/360 days a year will cut back our usage for fossile fuels by 15%. This whole thing is point less how they have us in here for two days im missing my education. We could have finished this in one day and had the rest of the week to get back on the track of learning. I've missed both days of weight lifting, algebra, and my world history that i do not want to fail again! If their are any people actually gonna sit down and take the time to read this then

DO NOT DO THIS NEXT YEAR

.

They are giving us cold lun

# Compare discourse texts, essay with CLS/END tokens injected, and bare essay text

This is one final visual check to make sure it looks alright.

In [12]:
# Print the first essay twice: decoded from its token ids (markers included)
# and as the bare essay text.
first_example = ds[0]
separator = "*" * 100

# Optional: also print each discourse text on its own.
# for t in first_example["discourse_text"]:
#     print(t, "\n")
print(separator)
print(tokenizer.decode(first_example["input_ids"]))
print(separator)
print(first_example["text"][0])

****************************************************************************************************
[CLS][CLS_LEAD] Driverless cars are exaclty what you would expect them to be. Cars that will drive without a person actually behind the wheel controlling the actions of the vehicle. The idea of driverless cars going in to developement shows the amount of technological increase that the wolrd has made. The leader of this idea of driverless cars are the automobiles they call Google cars. The arduous task of creating safe driverless cars has not been fully mastered yet. [END_LEAD]   [CLS_POSITION] The developement of these cars should be stopped immediately because there are too many hazardous and dangerous events that could occur. [END_POSITION] 

One thing that the article mentions is that  [CLS_CLAIM] the driver will be alerted when they will need to take over the driving responsibilites of the car. [END_CLAIM]   [CLS_EVIDENCE] This is such a dangerous thing because we all know that whe

# Training

In [None]:
import gc
import torch
from transformers import Trainer, TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification

args = TrainingArguments(**cfg["trainingargs"],gradient_accumulation_steps=4,lr_scheduler_type="cosine")

# If using longformer, you will want to pad to a multiple of 512
# For most others, you'll want to pad to a multiple of 8
collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, pad_to_multiple_of=cfg["pad_multiple"], padding=True
)

# Only these columns are handed to the Trainer; the list of columns to drop
# is the same for every fold, so it is computed once.
keep_cols = {"input_ids", "attention_mask", "labels"}
drop_cols = [c for c in ds.column_names if c not in keep_cols]

# Base output directory; each fold writes to "<output>-fold<k>". The
# best-score cell relies on `output` as well.
output = args.output_dir
for fold in range(cfg["k_folds"]):

    print(f'Fold {fold}')

    args.output_dir = f"{output}-fold{fold}"

    # A fresh config and model per fold, so no weights carry over.
    model_config = AutoConfig.from_pretrained(
            cfg["model_name_or_path"],
        )
    model_config.update(
        {
            "num_labels": 3,
            "cls_tokens": list(cls_id_map.values()),
            "label2id": label2id,
            "id2label": {v:k for k, v in label2id.items()},
        }
    )

    model = AutoModelForTokenClassification.from_pretrained(cfg["model_name_or_path"], config=model_config)

    # Because tokens were added, it is important to resize the embeddings
    model.resize_token_embeddings(len(tokenizer))

    # split dataset to train and eval: the current fold is held out, all
    # other folds are used for training
    train_idxs = list(chain(*[i for f, i in enumerate(fold_idxs) if f != fold]))
    train_dataset = ds.select(train_idxs).remove_columns(drop_cols)
    eval_dataset = ds.select(fold_idxs[fold]).remove_columns(drop_cols)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    trainer.train()

    # The trainer references the model (and its optimizer state), so deleting
    # only `model` would keep this fold's weights alive until the next
    # Trainer is built. Drop both before collecting.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

Fold 0


Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


# Get best scores

I'm sure with more tuning (and a larger model), the score can improve even more!

In [None]:
import json
def _checkpoint_step(path):
    """Return the numeric step of a ``checkpoint-<step>`` directory, or -1."""
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


best_metrics = []

for fold in range(cfg["k_folds"]):
    folder = Path(f"{output}-fold{fold}")

    # Order by step number. A plain string sort would place
    # "checkpoint-1676" before "checkpoint-838" and pick the older one.
    checkpoints = sorted(folder.glob("checkpoint*"), key=_checkpoint_step)
    if not checkpoints:
        raise FileNotFoundError(f"No checkpoint directory found in {folder}")
    checkpoint = checkpoints[-1]

    # trainer_state.json of the latest checkpoint holds the best value of the
    # tracked metric for this fold.
    with open(checkpoint/"trainer_state.json", "r") as fp:
        data = json.load(fp)
    best_metrics.append(data["best_metric"])

print(best_metrics)
average = sum(best_metrics)/len(best_metrics)
average