<a href="https://colab.research.google.com/github/TanimotoRui/signate/blob/main/15_tubo_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

このnotebookは下記webサイトからforkして作成しました。

https://signate.jp/competitions/754/discussions/refactorcv0816-lb0825-deberta-base

In [1]:
# Show which GPU is attached to this runtime (driver/CUDA version, memory usage).
!nvidia-smi

Wed Sep  7 07:36:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
# Install the libraries imported below; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so a fresh runtime may pull releases newer
# than the ones this notebook was run with -- consider pinning them.
!pip install transformers datasets sentencepiece

In [3]:
# Mount Google Drive at /content/drive; INPUT_DIR and OUTPUT_DIR live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import

In [4]:
from pathlib import Path
from multiprocessing import cpu_count
import random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score

from datasets import load_dataset, Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import EvalPrediction
from transformers import set_seed

import torch
from torch import nn

# Config

In [5]:
# Experiment id; also used as the sub-directory name under OUTPUT_DIR.
EXP_NAME = 'exp015'
INPUT_DIR = Path('/content/drive/MyDrive/00_datascience/19_ufj_bank/input')
OUTPUT_DIR = Path('/content/drive/MyDrive/00_datascience/19_ufj_bank/output')
# When True the main cell keeps only 50 rows and trains for a single epoch.
DEBUG = False
# Columns that are stringified and joined into the single model input text.
TEXT_COLUMNS = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content']
MODEL_NAME = 'microsoft/deberta-v3-base'
# Number of stratified cross-validation folds.
N_SPLIT = 4
# Folds to train. Derived from N_SPLIT so every listed fold actually exists:
# a hand-written list that runs past N_SPLIT - 1 selects an empty validation
# set and trains on the full data.
TRN_FOLDS = list(range(N_SPLIT))
# Maximum number of tokens per example (longer inputs are truncated).
MAX_LEN = 512
SEED = 3090

# Training configuration shared by every fold; the main loop overwrites
# `output_dir` with a per-fold directory before each trainer is built.
training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir=str(OUTPUT_DIR / EXP_NAME),
    seed=SEED,
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=True,
    # --- evaluation / checkpointing: once per epoch, reload the best one ---
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    # 'f1_score' is the key returned by compute_metrics; higher is better.
    metric_for_best_model='f1_score',
    greater_is_better=True,
)

# Cross Validation

In [6]:
def create_folds(data, num_splits, random_state=42):
    """Add a ``kfold`` column holding each row's stratified validation fold.

    The split is stratified on the ``state`` column. Fold ids are assigned by
    row *position*, because ``StratifiedKFold.split`` yields positional
    indices; label-based assignment would only be correct for a frame with a
    default 0..n-1 index.

    Args:
        data: DataFrame containing a ``state`` column. Modified in place.
        num_splits: Number of folds.
        random_state: Seed for the shuffled split (default 42).

    Returns:
        The same ``data`` object, with an integer ``kfold`` column whose
        values are in ``[0, num_splits)``.
    """
    fold_ids = np.full(len(data), -1, dtype=int)

    mskf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=random_state)
    data_labels = data['state']

    # Only the validation indices matter: each row is validated in exactly one fold.
    for f, (_, v_) in enumerate(mskf.split(data, data_labels)):
        fold_ids[v_] = f

    data["kfold"] = fold_ids
    return data

# Read the training set, tag every row with its fold, and cache the result in
# the working directory so the main cell can reload it.
data = create_folds(pd.read_csv(INPUT_DIR / "train.csv"), num_splits=N_SPLIT)
data.to_csv("train_folds.csv", index=False)
print("Folds created successfully")

Folds created successfully


# Preprocessing

In [7]:
def text_to_input_ids(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Inputs are truncated to ``MAX_LEN`` tokens and no padding is applied here.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding=False, truncation=True, max_length=MAX_LEN)
    return encoded

def connect_text(df, text_cols, sep):
    """Join several columns of ``df`` into one string per row.

    Missing values become the literal ``'NAN'`` and every value is converted
    to ``str`` before the columns are concatenated in ``text_cols`` order with
    ``sep`` between them.

    Returns:
        A Series (named after the first column) of the joined strings.
    """
    as_text = df[text_cols].fillna('NAN').astype(str)
    head = as_text[text_cols[0]]
    rest = as_text[text_cols[1:]]
    return head.str.cat(rest, sep=sep)

# Augmentation

In [8]:
class RandomMask:
    """Batch transform that randomly replaces token ids with the mask token id.

    Args:
        tokenizer: Object exposing ``mask_token_id``.
        prob: Probability that a given batch is augmented at all.
        mask_prob: Fraction of each sequence's tokens to replace when the
            batch is augmented.

    Raises:
        ValueError: If the tokenizer has no mask token.
    """

    def __init__(self, tokenizer, prob=0.5, mask_prob=0.15):
        if tokenizer.mask_token_id is None:
            raise ValueError("tokenizer has no mask token; RandomMask cannot be used with it")
        self.mask_token_id = tokenizer.mask_token_id
        self.prob = prob
        self.mask_prob = mask_prob

    def __call__(self, examples):
        """Mask every sequence in ``examples['input_ids']`` with probability ``prob``."""
        # `random.random()` is in [0, 1), so prob=1.0 always augments and
        # prob=0.0 never does.
        if random.random() < self.prob:
            examples['input_ids'] = [self._mask(input_ids) for input_ids in examples['input_ids']]

        return examples

    def _mask(self, input_ids: list) -> list:
        """Return a copy of ``input_ids`` with a random subset of positions masked.

        Position 0 ([CLS]) is never masked. The input list is left untouched.
        """
        masked = list(input_ids)
        candidates = range(1, len(masked))  # random masking except [CLS]
        # Clamp the sample size: position 0 is excluded, so a large mask_prob
        # could otherwise request more positions than are available.
        n_mask = max(0, min(int(len(masked) * self.mask_prob), len(candidates)))
        for i in random.sample(candidates, n_mask):
            masked[i] = self.mask_token_id

        return masked

# Trainer

In [9]:
class CustomTrainer(Trainer):
    """Trainer that scores a single-logit model with binary cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute BCE-with-logits loss between the model logits and ``labels``.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``labels``.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it as a keyword argument; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get('labels').float()
        # Strip labels so the model does not compute its own built-in loss.
        _inputs = {k: v for k, v in inputs.items() if k != 'labels'}

        outputs = model(**_inputs)
        logits = outputs.get('logits')

        # Flatten both sides so logits and labels are compared element-wise.
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(logits.view(-1), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

def threshold_search(y_true, y_proba):
    """Grid-search the decision threshold that maximises F1.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in order; a sample is predicted
    positive when its probability is strictly greater than the threshold. The
    first threshold reaching the highest score wins.

    Returns:
        ``{'threshold': best_threshold, 'f1': best_score}``; both are 0 when
        no threshold scores above 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}
    return best

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def compute_metrics(p: EvalPrediction):
    """Report the best-threshold F1 for an evaluation pass.

    The raw predictions are passed through ``sigmoid`` and the highest F1 found
    by ``threshold_search`` is returned under the key ``'f1_score'``.
    """
    probabilities = sigmoid(p.predictions)
    best = threshold_search(p.label_ids, probabilities)
    return {'f1_score': best['f1']}

# Main

In [10]:
if __name__ == "__main__":
    # ---- load data ---------------------------------------------------------
    df = pd.read_csv('train_folds.csv')
    test_df = pd.read_csv(INPUT_DIR / 'test.csv')
    if DEBUG:
        # Smoke-test mode: tiny data, one epoch. Copy so that the column
        # assignments below do not write into a view of the full frame.
        df = df.head(50).copy()
        test_df = test_df.head(50).copy()
        training_args.num_train_epochs = 1

    # Fail fast on a fold that does not exist: it would otherwise train on
    # every row and evaluate on an empty set.
    missing_folds = sorted(set(TRN_FOLDS) - set(df['kfold'].unique()))
    if missing_folds:
        raise ValueError(
            f"TRN_FOLDS contains folds {missing_folds} that are not present in "
            f"train_folds.csv; check TRN_FOLDS against N_SPLIT."
        )

    # ---- tokenization ------------------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # for dynamic padding
    # train_transform = RandomMask(tokenizer, prob=0.5, mask_prob=0.1) # [MASK] augmentation

    df['text'] = connect_text(df, TEXT_COLUMNS, tokenizer.sep_token)
    df['label'] = df['state']
    ds = Dataset.from_pandas(df[['text', 'label', 'kfold']])
    ds = ds.map(text_to_input_ids, batched=True, num_proc=cpu_count())

    test_df['text'] = connect_text(test_df, TEXT_COLUMNS, tokenizer.sep_token)
    test_ds = Dataset.from_pandas(test_df[['text']])
    test_ds = test_ds.map(text_to_input_ids, batched=True, num_proc=cpu_count())
    test_ds = test_ds.remove_columns(['text'])

    # ---- cross-validated training -----------------------------------------
    oof = []         # per-fold out-of-fold probabilities
    labels = []      # per-fold ground-truth labels, aligned with `oof`
    test_preds = []  # per-fold test-set probabilities
    for fold in TRN_FOLDS:
        set_seed(SEED)
        ds_i = DatasetDict(
            {
                'train': ds.filter(lambda x: x['kfold']!=fold),
                'eval': ds.filter(lambda x: x['kfold']==fold),
            }
        )
        ds_i = ds_i.remove_columns(['text', 'kfold'])
        # ds_i['train'].set_transform(train_transform)

        # train
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
        training_args.output_dir = str((OUTPUT_DIR / EXP_NAME) / f'fold_{fold}')
        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=ds_i["train"],
            eval_dataset=ds_i["eval"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.train()

        # inference
        oof_i = trainer.predict(ds_i['eval'])
        # The predictions are logits. Convert them to probabilities so the
        # test ensemble is on the same scale as the OOF values the threshold
        # is tuned on.
        test_preds_i = sigmoid(trainer.predict(test_ds).predictions)
        oof.append(sigmoid(oof_i.predictions))
        labels.append(oof_i.label_ids)
        test_preds.append(test_preds_i)

    # ---- threshold tuning on the out-of-fold predictions -------------------
    oof = np.vstack(oof).flatten()
    labels = np.hstack(labels)
    search_result = threshold_search(labels, oof)
    print('OOF Score: ', search_result['f1'], 'Threshold:', search_result['threshold'])

    # ---- submission --------------------------------------------------------
    # Average fold probabilities and flatten to 1-D before thresholding.
    test_proba = np.mean(test_preds, axis=0).reshape(-1)
    test_df['label'] = (test_proba > search_result['threshold']).astype(int)
    submission_dir = OUTPUT_DIR / EXP_NAME
    submission_dir.mkdir(parents=True, exist_ok=True)
    test_df[['id', 'label']].to_csv(submission_dir / 'sub.csv', header=False, index=False)

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

   

#0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.521846,0.787998
2,0.548200,0.391758,0.827148
3,0.407300,0.418497,0.832438


***** Running Evaluation *****
  Num examples = 1959
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-490
Configuration saved in /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-490/config.json
Model weights saved in /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-490/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-490/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-490/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1959
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-980
Configuration saved in /content/drive/MyDrive/00_datascience/19_ufj_bank/output/exp015/fold_0/checkpoint-9

KeyboardInterrupt: ignored