## Part 2 -- BERT Application


In [2]:
import os
from pathlib import Path

import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, BertModel

In [4]:
# Global Parameter Initialization
# Training hyper-parameters.
batch_size = 16
text_max_length = 128
epochs = 100
lr = 3e-5
validation_ratio = 0.1  # fraction of the training rows sampled out as the validation split
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging_per_step = 50

# Filesystem locations.
dataset_dir = Path('./data/')
model_dir = Path('./models/bert-checkpoints')
# Create the checkpoint directory together with any missing parents.
# `exist_ok=True` makes this a no-op on re-runs and avoids the check-then-create race.
model_dir.mkdir(parents=True, exist_ok=True)

print(f'Device: {device}')


Device: cuda


In [9]:
# Data Loading and Preprocessing
def _combine_text_fields(frame: pd.DataFrame) -> pd.Series:
    """Build the model input text for every row of ``frame``.

    The ``title``, ``author``, ``abstract`` and ``Keywords`` columns are joined
    in that order with a single space between each pair of fields. Missing
    values are treated as empty strings so the result never contains NaN.

    Args:
        frame: Table containing the four columns listed above.

    Returns:
        A Series aligned with ``frame`` holding the concatenated text.
    """
    fields = [frame[column].fillna('') for column in ('title', 'author', 'abstract', 'Keywords')]
    combined = fields[0]
    for field in fields[1:]:
        # Always insert a separator so the last word of one field is not
        # fused with the first word of the next (e.g. abstract + keywords).
        combined = combined + ' ' + field
    return combined


train_data = pd.read_csv('./data/train.csv')
train_data['title'] = train_data['title'].fillna('')
train_data['abstract'] = train_data['abstract'].fillna('')

test_data = pd.read_csv('./data/test.csv')
test_data['title'] = test_data['title'].fillna('')
test_data['abstract'] = test_data['abstract'].fillna('')

# Integration: one free-text column per row, built identically for both splits.
train_data['text'] = _combine_text_fields(train_data)
test_data['text'] = _combine_text_fields(test_data)

In [10]:
# Split Validation Dataset
# A fixed seed keeps the held-out rows identical across kernel restarts, so
# validation metrics from different runs are comparable.
# NOTE: this cell rebinds `train_data`. Re-run the data-loading cell before
# executing it again, otherwise the validation rows are sampled from the
# already-reduced training set.
validation_data = train_data.sample(frac=validation_ratio, random_state=42)
# Everything that was not sampled into the validation split stays in training.
train_data = train_data.drop(index=validation_data.index)

In [21]:
# Dataset Definition
class ExpDataset(Dataset):
    """Map-style dataset over one of the notebook's module-level DataFrames.

    ``mode`` selects the backing table: ``'train'`` -> ``train_data``,
    ``'validation'`` -> ``validation_data``, ``'test'`` -> ``test_data``.
    Each item is a ``(text, label)`` pair; in test mode the second element is
    the row's ``uuid`` instead of its ``label``.
    """

    _MODES = ('train', 'validation', 'test')

    def __init__(self, mode: str = 'train') -> None:
        """Bind the dataset to the DataFrame that corresponds to ``mode``.

        Args:
            mode: One of ``'train'``, ``'validation'`` or ``'test'``.

        Raises:
            ValueError: If ``mode`` is not one of the supported values.
        """
        super().__init__()
        # Validate first so a typo fails with a clear message before any
        # module-level frame is looked up.
        if mode not in self._MODES:
            raise ValueError(f'Unknown mode "{mode}"')
        self.mode: str = mode
        if mode == 'train':
            self.dataset: pd.DataFrame = train_data
        elif mode == 'validation':
            self.dataset: pd.DataFrame = validation_data
        else:
            self.dataset: pd.DataFrame = test_data

    def __getitem__(self, index) -> tuple:
        """Return ``(text, label)`` for the row at positional ``index``.

        In test mode the second element is the row's ``uuid``; otherwise it is
        the row's ``label``.
        """
        row = self.dataset.iloc[index]
        label_column = 'uuid' if self.mode == 'test' else 'label'
        return row['text'], row[label_column]

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataset)
    
# Instantiate the training and validation datasets.
train_dataset = ExpDataset(mode='train')
validation_dataset = ExpDataset(mode='validation')


In [26]:
# Load the tokenizer published with the `bert-base-uncased` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [27]:
# Load the pretrained `bert-base-uncased` encoder weights.
# NOTE(review): `mirror` appears to be deprecated/ignored in newer transformers
# releases — confirm it still has an effect on the installed version.
bert = BertModel.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    mirror='tuna',
)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
