<a href="https://colab.research.google.com/github/ThuanPhong0126/PaternRecognize-project-cs338/blob/main/BERT_Fine_Tuning_Quora_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM so files under /content/gdrive are reachable.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd /content/gdrive/MyDrive/[]Nhan Dang/Question-Similarity

/content/gdrive/MyDrive/[]Nhan Dang/Question-Similarity


In [35]:
import pandas

# Load the three Quora splits; every CSV is indexed by its `id` column.
train, dev, test = (
    pandas.read_csv(csv_path, index_col='id')
    for csv_path in (
        "./Quora dataset/train.csv",
        "./Quora dataset/dev.csv",
        "./Quora dataset/test.csv",
    )
)

# Features: the two question texts of each pair.
X_train, X_validation, X_test = (
    split[['question1', 'question2']] for split in (train, dev, test)
)

# Targets: selected with double brackets, so each one is a single-column DataFrame.
y_train, y_validation, y_test = (
    split[['is_duplicate']] for split in (train, dev, test)
)

X_train

Unnamed: 0_level_0,question1,question2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?"
...,...,...
404284,What does Jainism say about homosexuality?,What does Jainism say about Gays and Homosexua...
404285,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...
404286,Do you believe there is life after death?,Is it true that there is life after death?
404288,What is the approx annual cost of living while...,I am having little hairfall problem but I want...


In [None]:
!pip install transformers

In [7]:
from transformers import BertTokenizerFast

# Tokenizer for the `bert-large-uncased` checkpoint; lower-casing is requested explicitly.
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased', do_lower_case=True)

In [None]:
from tqdm import tqdm

# `Series.progress_apply` only exists after tqdm has registered its pandas integration;
# without this call the lines below fail with AttributeError on a fresh kernel.
tqdm.pandas()

# Token count of each question. `assign` builds a new frame instead of writing columns
# into the selection taken from `train`, which can raise SettingWithCopyWarning.
X_train = X_train.assign(
    question1_length=X_train["question1"].progress_apply(
        lambda question: len(tokenizer.tokenize(question))
    ),
    question2_length=X_train["question2"].progress_apply(
        lambda question: len(tokenizer.tokenize(question))
    ),
)
X_train = X_train.assign(
    joint_length=X_train["question1_length"] + X_train["question2_length"]
)

# Longest question pair, in tokens.
# NOTE(review): presumably excludes the special tokens added when a pair is encoded — confirm.
X_train["joint_length"].max()

In [None]:
# Fixed sequence length used for every encoded question pair.
# NOTE(review): presumably derived from the joint-length maximum computed above — confirm.
max_length = 310

# Encode the first training pair to inspect the tokenizer output.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True` flag.
tokenizer.encode_plus(X_train.iloc[0]["question1"], X_train.iloc[0]["question2"], max_length=max_length,
                      padding="max_length", return_attention_mask=True, return_tensors='pt', truncation=True)

In [16]:
import torch

from tqdm import tqdm
from torch.utils.data import TensorDataset



def convert_to_dataset_torch(data: pandas.DataFrame, labels: pandas.Series) -> TensorDataset:
    """Encode question pairs with the notebook-level ``tokenizer`` and bundle them with labels.

    Args:
        data: Frame with ``question1`` and ``question2`` columns, one pair per row.
        labels: Targets aligned with ``data``. They are converted through ``labels.values``,
            so the label tensor keeps whatever shape that array has.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_masks, token_type_ids, labels)``.

    Each pair is padded/truncated to the notebook-level ``max_length``.
    """
    encoded_pairs = [
        tokenizer.encode_plus(
            row["question1"],
            row["question2"],
            max_length=max_length,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for _, row in tqdm(data.iterrows(), total=data.shape[0])
    ]

    # Each encoding carries a leading batch axis of size 1; stack them along it.
    input_ids = torch.cat([encoded['input_ids'] for encoded in encoded_pairs], dim=0)
    token_type_ids = torch.cat([encoded["token_type_ids"] for encoded in encoded_pairs], dim=0)
    attention_masks = torch.cat([encoded['attention_mask'] for encoded in encoded_pairs], dim=0)
    labels = torch.tensor(labels.values)

    return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [17]:
# NOTE: `train` is rebound here from the raw DataFrame to a TensorDataset, and a later cell
# rebinds it again as the `train()` function, so re-running cells out of order will break.
train = convert_to_dataset_torch(X_train, y_train)
validation = convert_to_dataset_torch(X_validation, y_validation)

100%|██████████| 1920/1920 [00:00<00:00, 1930.37it/s]
100%|██████████| 480/480 [00:00<00:00, 1859.25it/s]


In [18]:
import multiprocessing

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch size used by the DataLoaders.
batch_size = 8

# One loader worker per CPU reported by `multiprocessing.cpu_count()`.
core_number = multiprocessing.cpu_count()

# Training batches are drawn through a RandomSampler.
train_dataloader = DataLoader(
    train,
    batch_size=batch_size,
    sampler=RandomSampler(train),
    num_workers=core_number,
)

# Validation batches follow the dataset order.
validation_dataloader = DataLoader(
    validation,
    batch_size=batch_size,
    sampler=SequentialSampler(validation),
    num_workers=core_number,
)

In [19]:
from transformers import BertForSequenceClassification

# Sequence-classification model built from the `bert-large-uncased` checkpoint with two
# output labels; attentions and hidden states are not returned.
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-large-uncased",  # alternative checkpoint: bert-base-uncased
    num_labels=2,
    output_hidden_states=False,
    output_attentions=False,
)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [20]:
# `transformers.AdamW` is deprecated and absent from newer releases; PyTorch's own
# implementation is the documented replacement.
from torch.optim import AdamW

adamw_optimizer = AdamW(
    bert_model.parameters(),
    lr=2e-5,
    eps=1e-8,
    # torch defaults to weight_decay=0.01 whereas the transformers class defaulted to 0.0;
    # pin it so the regularisation stays what it was.
    weight_decay=0.0,
)

In [21]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set.
epochs = 2

# Total number of training batches across all epochs.
total_steps = epochs * len(train_dataloader)

# Linear schedule with zero warmup steps, spanning every training step.
scheduler = get_linear_schedule_with_warmup(
    adamw_optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [22]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to a whole number of seconds first, so ``3661.4`` becomes
    ``'1:01:01'`` and durations of a day or more gain a ``'N day(s), '`` prefix.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [23]:
def fit_batch(dataloader, model, optimizer, epoch, lr_scheduler=None):
    """Run one training pass over ``dataloader`` and return the summed batch loss.

    Args:
        dataloader: Iterable of ``(input_ids, attention_masks, token_type_ids, labels)`` batches.
        model: Model called with those tensors; its output must expose ``['loss']``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch index, used only in the progress-bar label.
        lr_scheduler: Scheduler stepped once per batch. When omitted, the notebook-level
            ``scheduler`` is used, so existing four-argument calls behave as before.

    Returns:
        The sum (not the mean) of ``loss.item()`` over all batches.
    """
    if lr_scheduler is None:
        # Fall back to the global defined in the scheduler cell.
        lr_scheduler = scheduler

    # Batches have to live on the same device as the weights, otherwise the forward pass
    # fails as soon as the model is moved to a GPU. On CPU this is a no-op.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else None

    total_train_loss = 0

    for batch in tqdm(dataloader, desc=f"Training epoch:{epoch}", unit="batch"):
        if device is not None:
            batch = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_masks, token_type_ids, labels = batch

        # Clear gradients accumulated by the previous batch.
        model.zero_grad()

        outputs = model(input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_masks,
                        labels=labels)
        loss = outputs['loss']
        total_train_loss += loss.item()

        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        lr_scheduler.step()

    return total_train_loss

In [24]:
import numpy

from sklearn.metrics import accuracy_score




def eval_batch(dataloader, model, metric=accuracy_score):
    """Evaluate ``model`` on every batch of ``dataloader`` without tracking gradients.

    Args:
        dataloader: Iterable of ``(input_ids, attention_masks, token_type_ids, labels)`` batches.
        model: Model whose output exposes ``['loss']`` and ``['logits']``. The caller is
            expected to have switched it to evaluation mode.
        metric: Callable ``metric(y_true, y_pred)``; it receives flat NumPy arrays.

    Returns:
        ``(total_eval_accuracy, total_eval_loss, predictions, predicted_labels)`` where the
        first two are sums over batches (not means), ``predictions`` holds the raw logits
        as nested lists and ``predicted_labels`` the argmax class of each example.
    """
    # Keep batches on the model's device; on CPU this is a no-op.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else None

    total_eval_accuracy = 0
    total_eval_loss = 0
    predictions, predicted_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
        if device is not None:
            batch = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_masks, token_type_ids, labels = batch

        with torch.no_grad():
            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_masks,
                            labels=labels)
        total_eval_loss += outputs['loss'].item()

        # Copy results to host memory once; `.numpy()` cannot be called on CUDA tensors.
        logits = outputs['logits'].detach().cpu().numpy()
        y_true = labels.detach().cpu().numpy().reshape(-1)

        y_pred = numpy.argmax(logits, axis=1).flatten()
        total_eval_accuracy += metric(y_true, y_pred)

        predictions.extend(logits.tolist())
        predicted_labels.extend(y_pred.tolist())

    return total_eval_accuracy, total_eval_loss, predictions, predicted_labels

In [25]:
import random

# One seed shared by Python's `random`, NumPy and PyTorch.
seed_val = 42

for _seeder in (random.seed, numpy.random.seed, torch.manual_seed):
    _seeder(seed_val)


def train(train_dataloader, validation_dataloader, model, optimizer, epochs):
    """Fine-tune ``model`` for ``epochs`` epochs, validating after each one.

    Each epoch runs ``fit_batch`` on the training loader and ``eval_batch`` on the
    validation loader, prints the validation accuracy and loss, and records one stats
    dict. Averages are the per-batch sums divided by the number of batches in the loader.

    Returns:
        A list with one dict per epoch holding the keys ``'epoch'``, ``'Training Loss'``,
        ``'Valid. Loss'``, ``'Valid. Accur.'``, ``'Training Time'`` and ``'Validation Time'``.
    """
    history = []
    total_t0 = time.time()

    for epoch in range(epochs):
        # ---- training pass ----
        phase_start = time.time()
        model.train()
        summed_train_loss = fit_batch(train_dataloader, model, optimizer, epoch)
        avg_train_loss = summed_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - phase_start)

        # ---- validation pass ----
        phase_start = time.time()
        model.eval()
        summed_accuracy, summed_val_loss, _, _ = eval_batch(validation_dataloader, model)

        avg_val_accuracy = summed_accuracy / len(validation_dataloader)
        print(f"  Accuracy: {avg_val_accuracy}")

        avg_val_loss = summed_val_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - phase_start)
        print(f"  Validation Loss: {avg_val_loss}")

        epoch_record = {
            'epoch': epoch,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
        history.append(epoch_record)

    print("")
    print("Training complete!")

    print(f"Total training took {format_time(time.time()-total_t0)}")
    return history

In [None]:
# Run the fine-tuning loop; `train` returns one stats dict per epoch.
training_stats = train(train_dataloader, validation_dataloader, bert_model, adamw_optimizer, epochs)

In [None]:
# Tabulate the per-epoch stats, keyed by epoch number.
df_stats = pandas.DataFrame(training_stats)
df_stats = df_stats.set_index('epoch')
df_stats

In [None]:
from matplotlib import pyplot

%matplotlib inline



# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = pyplot.subplots()
ax.plot(df_stats['Training Loss'], 'b-o', label="Training")
ax.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
ax.set_title("Training & Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
# One tick per recorded epoch.
ax.set_xticks(df_stats.index.values.tolist())
pyplot.show()

# Performance On Test Set

In [29]:
# NOTE: rebinds `test` from the raw DataFrame to its TensorDataset form.
test = convert_to_dataset_torch(X_test, y_test)

# Sequential order keeps predictions aligned with the rows of the test split.
test_dataloader = DataLoader(
    test,
    batch_size=batch_size,
    sampler=SequentialSampler(test),
)

100%|██████████| 600/600 [00:00<00:00, 1860.58it/s]


In [30]:
# Switch the model to evaluation mode before predicting.
bert_model.eval()

_, _, _, predicted_labels = eval_batch(test_dataloader, bert_model)

# Report the number this section is about: accuracy over the whole test set. The loader
# is sequential, so `predicted_labels` is in the same order as the rows of `y_test`.
test_accuracy = accuracy_score(y_test.values.ravel(), predicted_labels)
print(f"Test accuracy: {test_accuracy:.4f}")

Evaluating: 100%|██████████| 75/75 [18:11<00:00, 14.56s/batch]


In [31]:
from pathlib import Path



# `__file__` is not defined in a notebook, so anchor the output folder on the current
# working directory explicitly.
output_dir = Path.cwd().joinpath("bert_large")
output_dir.mkdir(exist_ok=True)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# If the model is wrapped in something exposing `.module`, save the wrapped model instead.
model_to_save = bert_model.module if hasattr(bert_model, 'module') else bert_model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(str(output_dir))

('/content/gdrive/My Drive/[]Nhan Dang/Question-Similarity/bert_large/tokenizer_config.json',
 '/content/gdrive/My Drive/[]Nhan Dang/Question-Similarity/bert_large/special_tokens_map.json',
 '/content/gdrive/My Drive/[]Nhan Dang/Question-Similarity/bert_large/vocab.txt',
 '/content/gdrive/My Drive/[]Nhan Dang/Question-Similarity/bert_large/added_tokens.json',
 '/content/gdrive/My Drive/[]Nhan Dang/Question-Similarity/bert_large/tokenizer.json')