In [None]:
!pip install transformers

In [4]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from models.bert.bert_train import BertTrain

import warnings 
warnings.filterwarnings('ignore')

In [None]:
epochs = 1
batch_size = 32
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [5]:
df = pd.read_csv('/content/drive/MyDrive/train.csv')

df['name_1'] = df['name_1'].str.lower()
df['name_2'] = df['name_2'].str.lower()

In [6]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
from sklearn.utils import resample

duplicate = df[df['is_duplicate'] == 1]
not_duplicate = df[df["is_duplicate"] == 0]

not_duplicate_downsample = resample(not_duplicate,
             replace=True,
             n_samples=int(len(duplicate) * 2),
             random_state=42)

df = pd.concat([duplicate, not_duplicate_downsample])

(3658, 4)


In [9]:
names = pd.concat([df['name_1'], df['name_2']])
names = names.values

labels = df['is_duplicate'].values

In [10]:
max_len = 0
for name in names:
    inputs_ids = tokenizer.encode(name, add_special_tokens=True)
    max_len = max(len(inputs_ids), max_len)

In [None]:
input_ids = []
attention_masks = []

for name_1, name_2 in zip(df['name_1'], df['name_2']):
    encoded_list = tokenizer.encode_plus(name_1,
                                         name_2,
                                         add_special_tokens=True,
                                         max_length=max_len,
                                         pad_to_max_length=True,
                                         return_attention_mask=True,
                                         return_tensors='pt')
    
    input_ids.append(encoded_list['input_ids'])
    attention_masks.append(encoded_list['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [12]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [13]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataloader = DataLoader(
            train_dataset,
            batch_size = batch_size,
            sampler=RandomSampler(train_dataset)
)

validation_dataloader = DataLoader(
            val_dataset,
            batch_size = batch_size,
            sampler=SequentialSampler(train_dataset)
)

In [14]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = True,
    output_hidden_states = True,
)

model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [18]:
trainer = BertTrain(
    epochs=3,
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=validation_dataloader,
    optimizer=optimizer,
    scheduler=scheduler
)

stats = trainer()

  0%|          | 0/3 [00:00<?, ?it/s]


Average training loss: 0.16

Running Validation...
  train f1_score: 0.94
  val f1_score: 0.98
  Validation Loss: 0.07

Average training loss: 0.05

Running Validation...
  train f1_score: 0.96
  val f1_score: 0.98
  Validation Loss: 0.07

Average training loss: 0.06

Running Validation...
  train f1_score: 0.97
  val f1_score: 0.98
  Validation Loss: 0.07


In [25]:
train_f1 = stats[2]['Train f1_macro']
val_f1 = stats[2]['Valid f1_macro']

In [26]:
print(f'train_f1_macro: {train_f1}')
print(f'val_f1_macro: {val_f1}')

train_f1_macro: 0.9712433429107382
val_f1_macro: 0.9836021505376344
