In [1]:
!pip install pytorch_lightning --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.3/800.3 KB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.4/512.4 KB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import copy
import os
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
import pytorch_lightning as pl
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, DataCollatorWithPadding
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import gc

In [4]:
import os
os.chdir('/content/drive/MyDrive/workspace/Learning_Equality/notebook')

In [5]:
class CFG:
  ROW_DIR = Path('../data/row')
  PROCESSED_DIR = Path('../data/processed/train_data')
  TOKENIZER = "xlm-roberta-base"
  MODEL = "xlm-roberta-base"
  BATCH_PER_GPU = 4
  SEED=42
  NUM_EPOCHS=8
  LR = 1e-5
  NUM_GPUS=torch.cuda.device_count()
  NUM_JOBS=2
  AMP=True

In [6]:
train_data = np.load(CFG.PROCESSED_DIR / 'train_df.npy', allow_pickle=True)

In [7]:
topics_df = pd.read_csv(CFG.ROW_DIR / 'topics.csv')
content_df = pd.read_csv(CFG.ROW_DIR / 'content.csv')
sample_submission = pd.read_csv(CFG.ROW_DIR / 'sample_submission.csv')

In [8]:
train_df = pd.DataFrame(train_data, columns=['topic_index', 'content_id', 'topic_id', 'fold', 'label'])

In [9]:
config = AutoConfig.from_pretrained("xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [10]:
def preprocess_test(test):
    test['title1'].fillna("Title does not exist", inplace = True)
    test['title2'].fillna("Title does not exist", inplace = True)
    # Create feature column
    test['text'] = test['title1'] + '[SEP]' + test['title2']
    # Drop titles
    test.drop(['title1', 'title2'], axis = 1, inplace = True)
    # Sort so inference is faster
    test['length'] = test['text'].apply(lambda x: len(x))
    test.sort_values('length', inplace = True)
    test.drop(['length'], axis = 1, inplace = True)
    test.reset_index(drop = True, inplace = True)
    gc.collect()
    return test

In [11]:
class MyDataset(Dataset):
    def __init__(self, tokenizer, df, topics_df, contetn_df, labels: bool=True):
        self.all_topics_id = df.topic_id.to_numpy()
        self.all_content_id = df.content_id.to_numpy()
        self.topics_dict = {id:title for id, title in zip(topics_df.id, topics_df.title)}
        self.content_dict = {id:title for id, title in zip(contetn_df.id, contetn_df.title)}
        self.all_topics_title = [self.topics_dict[id] for id in self.all_topics_id]
        self.all_content_title = [self.content_dict[id] for id in self.all_content_id]
        self.tokenizer = tokenizer
        self.labels=labels
        if self.labels:
          self.all_labels = df.label.to_numpy()
    def __len__(self):
        return len(self.all_topics_id)
    def __getitem__(self, idx):
        topic_id = self.all_topics_id[idx]
        content_id = self.all_content_id[idx]
        text = self.all_topics_title[idx] + '[SEP]' + self.all_content_title[idx]
        inputs =  self.tokenizer.encode_plus(
        text, 
        return_tensors = None, 
        add_special_tokens = True
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        if self.labels:
          labels = self.all_labels[idx]
          return {
                  # 'topic_id':topic_id, 
                  # 'contetn_id':content_id,
                  'input_ids':torch.tensor(input_ids, dtype=torch.long),
                  'attention_mask':torch.tensor(attention_mask, dtype=torch.long),
                  'label':torch.tensor(labels, dtype=torch.long),}
        else:
          return {
                # 'topic_id':topic_id, 
                # 'contetn_id':content_id,
                'input_ids':torch.tensor(input_ids, dtype=torch.long),
                'attention_mask':torch.tensor(attention_mask, dtype=torch.long)}

In [12]:
def func_undersampling_df(df):
    # 2. クラス0とクラス1のサンプル数を確認する
    n_samples_class_0 = df[df['label'] == 0].shape[0]
    n_samples_class_1 = df[df['label'] == 1].shape[0]

    # 3. クラス0のサンプルを適切な数だけランダムに抽出して、新しいデータフレームを作成する
    under_sampled_df = df[df['label'] == 0].sample(n_samples_class_1)

    # 4. クラス1のサンプルも同様に、適切な数だけランダムに抽出して、新しいデータフレームに追加する
    under_sampled_df = under_sampled_df.append(df[df['label'] == 1].sample(n_samples_class_1))
    return under_sampled_df

In [13]:
class FeedbackModel(pl.LightningModule):
    
    def __init__(self, model_name, learning_rate, num_train_steps, steps_per_epoch):
        super().__init__()
        self.save_hyperparameters()
        
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.num_train_steps = num_train_steps
        self.steps_per_epoch = steps_per_epoch
        self.step_scheduler_after = "batch"
        
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        
        config = AutoConfig.from_pretrained(model_name)
        
        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
            }
        )
        
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, 1)
        
    def forward(self, ids, mask, token_type_ids=None, targets=None):

        if token_type_ids:
            transformer_out = self.transformer(ids, mask, token_type_ids)
        else:
            transformer_out = self.transformer(ids, mask)
        sequence_output = transformer_out.last_hidden_state
        sequence_output = self.dropout(sequence_output)

        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))

        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        logits = torch.softmax(logits, dim=-1)
        loss = 0

        if targets is not None:
            loss1 = self.loss(logits1, targets, attention_mask=mask)
            loss2 = self.loss(logits2, targets, attention_mask=mask)
            loss3 = self.loss(logits3, targets, attention_mask=mask)
            loss4 = self.loss(logits4, targets, attention_mask=mask)
            loss5 = self.loss(logits5, targets, attention_mask=mask)
            loss = (loss1 + loss2 + loss3 + loss4 + loss5) / 5
            return logits, loss

        return logits, loss
    
    def loss(self, outputs, targets):
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(outputs, targets)
        return loss
    
    def monitor_metrics(self, outputs, targets):
        active_loss = (attention_mask.view(-1) == 1).cpu().numpy()
        active_logits = outputs.view(-1, 1)
        true_labels = targets.view(-1).cpu().numpy()
        outputs = active_logits.argmax(dim=-1).cpu().numpy()
        idxs = np.where(active_loss == 1)[0]
        f1_score = metrics.f1_score(true_labels[idxs], outputs[idxs], average="macro")
        return f1_score
        
    def get_targets_preds_label(self, outputs, targets, attention_mask):
        active_loss = (attention_mask.view(-1) == 1).cpu().numpy()
        active_logits = outputs.view(-1, 1)
        true_labels = targets.view(-1).cpu().numpy()
        outputs = active_logits.argmax(dim=-1).cpu().numpy()
        idxs = np.where(active_loss == 1)[0]
        return true_labels[idxs], outputs[idxs]
        
    def training_step(self, batch, batch_idx):
        ids, mask, targets = batch['batch_ids'], batch['attention_mask'], batch['label']
        logits, loss = self.forward(ids=ids, mask=mask, targets=targets)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        ids, mask, targets =  batch['batch_ids'], batch['attention_mask'], batch['label']
        logits, loss = self.forward(ids=ids, mask=mask, targets=targets)
        targets, preds = self.get_targets_preds_label(logits, targets, attention_mask=mask)
        self.log('val_loss', loss, on_step=True, logger=True, prog_bar=True)
        return {'val_loss': loss, 'predict': preds, 'target': targets}
        
        
    def validation_epoch_end(self, val_step_outputs):
        all_predicts = np.concatenate([val['predict'] for val in val_step_outputs], axis=0)
        all_targets = np.concatenate([val['target'] for val in val_step_outputs], axis=0)
        f1_score = metrics.f1_score(all_targets, all_predicts, average="macro")
        self.log("f1_score", f1_score, on_epoch=True, logger=True, prog_bar=False)
        return {'f1_score': f1_score}
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * self.num_train_steps), num_training_steps=self.num_train_steps)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [14]:
def train_fold(df, fold):    
    pl.seed_everything(CFG.SEED)
    
    dirpath = f'./fold_{fold}'
    os.makedirs(dirpath, exist_ok=True)
    
    print(f'================================== Prepare Data for fold{fold} =====================================')
    train_samples = df[df["fold"] != fold].reset_index(drop=True)
    valid_samples = df[df["fold"] == fold].reset_index(drop=True)
    
    train_samples = func_undersampling_df(train_samples)
    
    train_dataset = MyDataset(tokenizer, train_samples, topics_df, content_df, labels=True)
    valid_dataset = MyDataset(tokenizer, valid_samples, topics_df, content_df, labels=False)
    
    train_dataloader = DataLoader(train_dataset, batch_size=CFG.BATCH_PER_GPU, shuffle=True, num_workers=CFG.NUM_JOBS, drop_last=True)
    val_dataloader = DataLoader(valid_dataset, batch_size=CFG.BATCH_PER_GPU, shuffle=False, num_workers=CFG.NUM_JOBS, drop_last=False)
    
    total_batch_size = CFG.BATCH_PER_GPU * CFG.NUM_GPUS
    steps_per_epoch = int(len(train_dataset) // total_batch_size)
    num_train_step = int(steps_per_epoch * CFG.NUM_EPOCHS)
    
    lightning_model = FeedbackModel(CFG.MODEL, CFG.LR, num_train_step, steps_per_epoch)
    
    checkpoint = pl.callbacks.ModelCheckpoint(
        monitor="f1_score",
        mode="max",
        save_top_k=1,
        save_weights_only=True,
        verbose=True,
        dirpath=dirpath,
    )
    
    lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step")
    
    early_stopping = pl.callbacks.EarlyStopping(
        monitor="f1_score",
        min_delta=0.0,
        patience=4,
        mode="max",
    )
    
    call_backs = [checkpoint, lr_monitor, early_stopping]
    
    trainer = pl.Trainer(
        max_epochs=CFG.NUM_EPOCHS,
        callbacks=call_backs,
        gpus=-1 if CFG.NUM_GPUS != 1 else [0],
        strategy="ddp" if CFG.NUM_GPUS != 1 else None,
        precision = 16 if CFG.AMP else 32,
        amp_backend = "native",
    )
    
    print(f'================================== Start Training fold{fold} =====================================')
    trainer.fit(lightning_model, train_dataloader, val_dataloader)
    
    best_model_path = checkpoint.best_model_path
    print("best model path: ", best_model_path)

In [15]:
print(f"================================== Start Running =====================================")
for fold in range(5):
    train_fold(train_df, fold)


INFO:lightning_lite.utilities.seed:Global seed set to 42




Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores



INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name        | Type            | Params
------------------------------------------------
0 | transformer | XLMRobertaModel | 278 M 
1 | dropout     | Dropout         | 0     
2 | dropout1    | Dropout         | 0     
3 | dropout2    | Dropout         | 0     
4 | dropout3    | Dropout         | 0     
5 | dropout4    | Dropout         | 0     
6 | dropout5    | Dropout         | 0     
7 | output      | Linear          | 769   
------------------------------------------------
278 M     Trainable params
0         Non-trainable params
278 M     Total params
556.089   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: ignored

In [None]:
train_df.head()