In [1]:
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 24 09:47:50 2021

@author: cordeliazhu
"""

import os
os.system("pip install transformers==3.4.0")
#from transformers import  BertTokenizer, BertConfig, BertModel
from transformers import AdamW
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import json
from collections import OrderedDict
import numpy as np
from torch.utils.data import Dataset,DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
import scipy.stats
from google.colab import drive
drive.mount('/content/gdrive')



Mounted at /content/gdrive


In [6]:
# use gpu or not
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("We are using {} device".format(device))

# get the tokneizer and configure from huggingface
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Config = BertConfig.from_pretrained('bert-base-uncased')
# we choose cls as our output method
output_method = 'cls'
# resource is limited
batch_size = 32
learning_rate = 2e-5
maxlen = 32

file_path = "/content/gdrive/My Drive/STS_SNLI/"
test_file = 'sts-test.csv'
dev_file = 'sts-dev.csv'



def snli_data(snli_path):
    data = []
    with open(snli_path) as f:
        for i in f:
            data.append(json.loads(i))
    return data

def STS_data(STS_path):
    data = []
    with open(STS_path) as f:
        for i in f:
            d = i.split("\t")
            sentence1 = d[5]
            sentence2 = d[6]
            score = float(d[4])
            data.append([sentence1,sentence2,score])
    return data
snli_file_path = "/content/gdrive/My Drive/STS_SNLI/"
snli_train_file = 'snli_1.0_trainproceed.txt'
# save our pre-trained model
save_path = "/content/gdrive/My Drive/STS_SNLI/pytorch_model.bin"

snil_vocab = snli_data(os.path.join(snli_file_path, snli_train_file))
# shuffle the snil train data
np.random.shuffle(snil_vocab)
# load the test data
test_data = STS_data(os.path.join(file_path, test_file))
# load the dev data
dev_data = STS_data(os.path.join(file_path, dev_file))


class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, maxlen, transform=None, target_transform=None):
        self.data = data
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.transform = transform
        self.target_transform = target_transform

    def text_to_id(self, json_data):
      # get the origin from the txt file
        origin = json_data['origin']
        entailment = json_data['entailment']
        contradiction = json_data['contradiction']
        sample = self.tokenizer([origin,entailment,contradiction], max_length=self.maxlen, truncation=True, padding='max_length', return_tensors='pt')
        return sample

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.text_to_id(self.data[idx])


class TestDataset:
    def __init__(self, data, tokenizer, maxlen):
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.traget_idx = self.text_to_id([x[0] for x in data])
        self.source_idx = self.text_to_id([x[1] for x in data])
        # get the score from float to int
        self.label_list = [int(x[2]) for x in data]
        assert len(self.traget_idx['input_ids']) == len(self.source_idx['input_ids'])

    def text_to_id(self,source):
    
      # repeated twice the source could make label matrix more clear
        sample = self.tokenizer(source, max_length = self.maxlen, truncation=True, padding = 'max_length', return_tensors='pt')
        return sample

    def get_data(self):
        return self.traget_idx, self.source_idx, self.label_list


class NeuralNetwork(nn.Module):
    def __init__(self, output_method):
        super(NeuralNetwork, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased',config=Config)
        self.output_method = output_method
        assert output_method in ['cls','pooler']

    def forward(self, input_ids, attention_mask, token_type_ids):
        x1 = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        if self.output_method == 'cls':
            last_hidden_state = x1[0]
            output = last_hidden_state[:,0]
       
        return output

model = NeuralNetwork(output_method).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

training_data = TrainDataset(snil_vocab, tokenizer, maxlen)
train_dataloader = DataLoader(training_data, batch_size = batch_size)

testing_data = TestDataset(test_data, tokenizer, maxlen)
deving_data = TestDataset(dev_data, tokenizer, maxlen)

# measures the linear relationship between two datasets. 
# > 0.8 strong relation
def compute_corrcoef(x, y):
    return scipy.stats.spearmanr(x, y).correlation
# infoNCE loss
def compute_loss(y_pred, lamda=0.05):
    row = torch.arange(0,y_pred.shape[0],3,device='cuda')
    col = torch.arange(y_pred.shape[0], device='cuda')
    # we need to mask the diagnal because it must be 1 do not have loss
    col = torch.where(col % 3 != 0)[0].cuda()
    # we need to divide our label into odd and even
    y_true = torch.arange(0,len(col),2,device='cuda')
    # caculate score and loss
    similarities = F.cosine_similarity(y_pred.unsqueeze(1), y_pred.unsqueeze(0), dim=2)
    # use python default fast cacuation of matrix
    similarities = torch.index_select(similarities, 0, row)
     # get x follow by row
    similarities = torch.index_select(similarities, 1, col)
    # lambda is the tempature
    similarities = similarities / lamda
    # cross_entropy，cross_entropy_loss!!! the key point for contrastive learning--- let positive sample clse let negative samples away
    loss = F.cross_entropy(similarities, y_true)
    return torch.mean(loss)

def test(test_data,model):
    traget_idx, source_idx, label_list = test_data.get_data()

# To perform inference without Gradient Calculation.
# To make sure there's no leak test data into the model.
    with torch.no_grad():
        # get input ids
        traget_ids = traget_idx['input_ids'].to(device)
        source_ids = source_idx['input_ids'].to(device)
        # get attention
        traget_attention_mask = traget_idx['attention_mask'].to(device)
        source_attention_mask = source_idx['attention_mask'].to(device)
        # get token types
        traget_token_type_ids = traget_idx['token_type_ids'].to(device)
        source_token_type_ids = source_idx['token_type_ids'].to(device)
        # get pred result
        traget_pred = model(traget_ids,traget_attention_mask,traget_token_type_ids)
        source_pred = model(source_ids,source_attention_mask,source_token_type_ids)
        # list of labels
        similarity = F.cosine_similarity(traget_pred,source_pred)
        similarity = similarity.cpu().numpy()
        label = np.array(label_list)
        corrcoef = compute_corrcoef(label,similarity)
    return corrcoef

def train(dataloader,testdata, model, optimizer):
    model.train()
    size = len(dataloader.dataset)
    max_corrcoef = 0
    stop_increase_n = 0
    for batch, data in enumerate(dataloader):
        input_ids = data['input_ids'].view(len(data['input_ids'])*3,-1).to(device)
        attention_mask = data['attention_mask'].view(len(data['attention_mask'])*3,-1).to(device)
        token_type_ids = data['token_type_ids'].view(len(data['token_type_ids'])*3,-1).to(device)
        pred = model(input_ids,attention_mask,token_type_ids)
        loss = compute_loss(pred)
        # zero grad otherwise cuda out of resource
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 10 == 0:
            loss, current = loss.item(), batch * int(len(input_ids)/3)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            model.eval() 
            corrcoef = test(testdata,model)
            model.train()
            print(f"corrcoef for test: {corrcoef:>4f}")

            if corrcoef > max_corrcoef:
                stop_increase_n = 0
                max_corrcoef = max(corrcoef, max_corrcoef)
                torch.save(model.state_dict(),save_path)
                print(f"Current corrcoef is: {(max_corrcoef):>4f}%, saved PyTorch Model to model.bin")
            else:
                # early stop
                stop_increase_n += 1
                if stop_increase_n > 10:
                    print(f"Corrcoef didn't increase for 10 batch, next epoch beigns" )
                    break

if __name__ == '__main__':
    epochs = 1
    for t in range(epochs):
        print(f"Epoch {t + 1} begins\n-------------------------------")
        train(train_dataloader,testing_data, model, optimizer)
    print("Training step has finished")

    print("Deving step has started")
    save_path = "/content/gdrive/My Drive/STS_SNLI/pytorch_model.bin"
    model.load_state_dict(torch.load(save_path))
    corrcoef = test(deving_data,model)
    print(f"dev corrcoef is: {corrcoef:>4f}")

We are using cuda device
Epoch 1 begins
-------------------------------
loss: 4.058414  [    0/149145]
corrcoef for test: 0.208167
Higher corrcoef: 0.208167%, Saved PyTorch Model State to model.bin
loss: 2.242266  [  320/149145]
corrcoef for test: 0.641226
Higher corrcoef: 0.641226%, Saved PyTorch Model State to model.bin
loss: 1.159838  [  640/149145]
corrcoef for test: 0.700999
Higher corrcoef: 0.700999%, Saved PyTorch Model State to model.bin
loss: 1.454275  [  960/149145]
corrcoef for test: 0.711446
Higher corrcoef: 0.711446%, Saved PyTorch Model State to model.bin
loss: 0.998149  [ 1280/149145]
corrcoef for test: 0.722839
Higher corrcoef: 0.722839%, Saved PyTorch Model State to model.bin
loss: 0.739120  [ 1600/149145]
corrcoef for test: 0.730639
Higher corrcoef: 0.730639%, Saved PyTorch Model State to model.bin
loss: 0.759683  [ 1920/149145]
corrcoef for test: 0.734833
Higher corrcoef: 0.734833%, Saved PyTorch Model State to model.bin
loss: 0.999541  [ 2240/149145]
corrcoef for te