In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import shutil
import sys   
from glob import glob
import re
import os
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [2]:
import sys
import datetime
from threading import Lock

from prettytable import PrettyTable

def log(str):
    print(str, file=sys.stderr)
    sys.stderr.flush()

class ResultWriter:
    def __init__(self, results_filename):
        self.results_filename = results_filename
        self.lock = Lock()

    def write(self, str):
        self.lock.acquire()
        try:
            with open(self.results_filename +'.txt', "a", encoding="utf-8") as f:
                f.write(str + "\n")
        finally:
            self.lock.release()

    def log(self, msg):
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        msg = timestamp + ": " + msg
        log(msg)
        self.lock.acquire()
        try:
            with open(self.results_filename + ".log", "a", encoding="utf-8") as f:
                f.write(msg + "\n")
        finally:
            self.lock.release()


def get_num_model_parameters(model):
    return sum([p.numel() for p in model.parameters() if p.requires_grad])


def print_model_parameters(model,rw):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad: continue
        param = parameter.numel()
        table.add_row([name, f'{param:,}'])
        total_params += param
    rw.log(f'{table}')
    rw.log(f"Total Trainable Params: {total_params}")
    return total_params

rw = ResultWriter('./logs/CNN_5_10_bert_append_sentence')

In [3]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len,ohe):
        self.tokenizer = tokenizer
        self.df = df
        self.ohe = ohe
        self.title = df['text']
        self.targets = self.ohe.transform(np.array(self.df.label.values).reshape(-1,1)).toarray()
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        #title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [4]:
class BERTClass(torch.nn.Module):
    def __init__(self,pre_trained):
        super(BERTClass, self).__init__()
        self.bert_model = AutoModel.from_pretrained(pre_trained)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(768, 13)
        self.softmax = torch.nn.Softmax(dim=1)
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear1(output_dropout)
        return output

In [5]:
class GlobalMaxPooling1D(torch.nn.Module):
    def __init__(self, data_format='channels_last'):
        super(GlobalMaxPooling1D, self).__init__()
        self.data_format = data_format
        self.step_axis = 1 if self.data_format == 'channels_last' else 2

    def forward(self, input):
        return torch.max(input, axis=self.step_axis).values

class BERTCNN(torch.nn.Module):
    def __init__(self,pre_trained):
        super(BERTCNN, self).__init__()
        self.bert_model = AutoModel.from_pretrained(pre_trained)
        
        self.conv1 = torch.nn.Conv1d(256, 128, kernel_size=5)
        self.conv2 = torch.nn.Conv1d(256, 128, kernel_size=10)
        self.mp = GlobalMaxPooling1D('channels_first')
        self.bn = torch.nn.BatchNorm1d(128)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear1 = torch.nn.Linear(128, 64)
        self.linear2 = torch.nn.Linear(128, 64)
        self.linear3 = torch.nn.Linear(128,13)
        self.softmax = torch.nn.Softmax(dim=1)
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        X = self.conv1(output.last_hidden_state)
        X = self.mp(X)
        #print(X.shape)
        X = self.bn(X)
        #print(X.shape)
        X = F.relu(self.linear1(X))
        Y = self.conv2(output.last_hidden_state)
        Y = self.mp(Y)
        Y = self.bn(Y)
        Y = F.relu(self.linear2(Y))
        concat = torch.concat((X,Y),dim=1)
        output = self.dropout(concat)
        output = self.linear3(output)
        #output_dropout = self.dropout(output.pooler_output)
        #output = self.linear(output_dropout)
        return output

In [6]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [7]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
  val_targets = []
  val_outputs = []

  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf
   
  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('# Epoch {}: #'.format(epoch),end='\t')
    for batch_idx, data in enumerate(tqdm(training_loader)):
        #print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device,dtype=torch.float)
        
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)#focal_loss(outputs, targets)
        #print(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(tqdm(validation_loader, 0)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
      
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      rw.log('epoch:{:.6f} Avg Training Loss: {:.6f} \tAvg Validation Loss: {:.6f}'.format(epoch, train_loss, valid_loss))
      
      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
      # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        rw.log('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('\t Done\n'.format(epoch))

  return model

In [8]:
#it'll return true and predicted labels
def predict(data_loader,model):
    target_list = []
    output_list = []
    model.eval()
    with torch.no_grad():
        for batch_idx, data in enumerate(tqdm(data_loader, 0)):
          ids = data['input_ids'].to(device, dtype = torch.long)
          mask = data['attention_mask'].to(device, dtype = torch.long)
          token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
          targets = data['targets'].to(device, dtype = torch.float)
          outputs = model(ids, mask, token_type_ids)
          target_list.extend(targets.cpu().detach().numpy().tolist())
          output_list.extend(F.softmax(outputs,dim=1).cpu().detach().numpy().tolist())
    return np.array(target_list).argmax(1),np.array(output_list).argmax(1)

In [9]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [10]:
pretrained_name = "bert-base-uncased"

# hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-05

rw.log(f'pretrained_name: {pretrained_name}')
rw.log(f'MAX_LEN: {MAX_LEN}')
rw.log(f'BATCH_SIZE: {TRAIN_BATCH_SIZE}')
rw.log(f'EPOCHS: {EPOCHS}')
rw.log(f'LEARNING RATE: {LEARNING_RATE}')

2022-04-29 18:56:35.853: pretrained_name: bert-base-uncased
2022-04-29 18:56:35.856: MAX_LEN: 256
2022-04-29 18:56:35.857: BATCH_SIZE: 8
2022-04-29 18:56:35.857: EPOCHS: 5
2022-04-29 18:56:35.858: LEARNING RATE: 1e-05


In [11]:
f = glob('./data/*.csv')
f

['./data\\train_data.csv',
 './data\\train_data_append_label.csv',
 './data\\train_data_append_sentence.csv',
 './data\\train_data_append_sentence_new_preprocess.csv',
 './data\\train_data_append_sentence_three.csv',
 './data\\val_data.csv',
 './data\\val_data_append_sentence.csv',
 './data\\val_data_append_sentence_new_preprocess.csv',
 './data\\val_data_append_sentence_three.csv']

In [12]:
train_path = f[4]#'./data/train_data_append_sentence_three.csv' 
test_path = f[8]#'./data/val_data_append_sentence.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
target_list = list(train_df.label.unique())
target_list.sort()
ohe = OneHotEncoder()
ohe.fit(np.array(target_list).reshape(-1,1))

train_df = train_df.drop(columns=['id','start','end'])
test_df = test_df.drop(columns=['id','start','end'])

train_df, val_df = train_test_split(train_df, test_size=0.2)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

In [14]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN,ohe)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN,ohe)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN,ohe)

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0 )
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
rw.log(f'device: {device}')

2022-04-29 18:56:40.324: device: cuda


In [16]:
classifier_name = 'linear_layer_append_sentences_three_CE_loss'
dir_path = "./trained_weights/" +pretrained_name+'/'+ classifier_name 
rw.log(f'pretrained name: {pretrained_name}')
rw.log(f'classifier name: {classifier_name}')
rw.log(f'trained weights path: {dir_path}')

if not os.path.exists(dir_path):  
  os.makedirs(dir_path)
  print(f"The new directory is created: {dir_path}")
    
ckpt_path = dir_path+"/current_checkpoint.pt"
best_model_path = dir_path+"/best_model.pt"

2022-04-29 18:56:40.331: pretrained name: bert-base-uncased
2022-04-29 18:56:40.332: classifier name: linear_layer_append_sentences_three_CE_loss
2022-04-29 18:56:40.333: trained weights path: ./trained_weights/bert-base-uncased/linear_layer_append_sentences_three_CE_loss


The new directory is created: ./trained_weights/bert-base-uncased/linear_layer_append_sentences_three_CE_loss


In [17]:
model = BERTClass(pretrained_name)
model.to(device)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [35]:
##Load Model
try:
    if(os.path.exists(best_model_path)):
        model, optimizer, epoch_val, valid_loss_min = load_ckp(best_model_path, model, optimizer)
    elif(os.path.exists(ckpt_path)):
        model, optimizer, epoch_val, valid_loss_min = load_ckp(ckpt_model, model, optimizer)
except:
    print('no model exist')

In [18]:
model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

# Epoch 1: #	

100%|██████████████████████████████████████████████████████████████████████████████| 1927/1927 [04:54<00:00,  6.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 482/482 [00:22<00:00, 21.17it/s]
2022-04-29 19:02:01.351: epoch:1.000000 Avg Training Loss: 0.000093 	Avg Validation Loss: 0.000234
2022-04-29 19:02:03.158: Validation loss decreased (inf --> 0.000234).  Saving model ...


	 Done

# Epoch 2: #	

100%|██████████████████████████████████████████████████████████████████████████████| 1927/1927 [04:58<00:00,  6.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 482/482 [00:25<00:00, 19.08it/s]
2022-04-29 19:07:29.207: epoch:2.000000 Avg Training Loss: 0.000051 	Avg Validation Loss: 0.000175
2022-04-29 19:07:31.368: Validation loss decreased (0.000234 --> 0.000175).  Saving model ...


	 Done

# Epoch 3: #	

100%|██████████████████████████████████████████████████████████████████████████████| 1927/1927 [05:04<00:00,  6.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 482/482 [00:23<00:00, 20.69it/s]
2022-04-29 19:13:01.988: epoch:3.000000 Avg Training Loss: 0.000035 	Avg Validation Loss: 0.000155
2022-04-29 19:13:04.194: Validation loss decreased (0.000175 --> 0.000155).  Saving model ...


	 Done

# Epoch 4: #	

100%|██████████████████████████████████████████████████████████████████████████████| 1927/1927 [04:57<00:00,  6.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 482/482 [00:22<00:00, 21.00it/s]
2022-04-29 19:18:27.794: epoch:4.000000 Avg Training Loss: 0.000025 	Avg Validation Loss: 0.000146
2022-04-29 19:18:29.820: Validation loss decreased (0.000155 --> 0.000146).  Saving model ...


	 Done

# Epoch 5: #	

100%|██████████████████████████████████████████████████████████████████████████████| 1927/1927 [04:54<00:00,  6.54it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 482/482 [00:22<00:00, 21.03it/s]
2022-04-29 19:23:50.316: epoch:5.000000 Avg Training Loss: 0.000018 	Avg Validation Loss: 0.000133
2022-04-29 19:23:52.279: Validation loss decreased (0.000146 --> 0.000133).  Saving model ...


	 Done



In [19]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
val_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=256, shuffle=False, num_workers=0)
train_true,train_pred = predict(train_data_loader,model)
val_true,val_pred = predict(val_data_loader,model)
train_clf = classification_report(train_true,train_pred,zero_division=True,output_dict = True,target_names=target_list)
val_clf = classification_report(val_true,val_pred,zero_division=True,target_names=target_list,output_dict=True)
rw.log(f'{"_"*30}Training Results{"_"*30}\n')
rw.log(classification_report(train_true,train_pred,zero_division=True,target_names=target_list))
rw.log(f'{"_"*30}Val Results{"_"*30}\n')
rw.log(classification_report(val_true,val_pred,zero_division=True,target_names=target_list))

100%|██████████████████████████████████████████████████████████████████████████████████| 61/61 [01:22<00:00,  1.35s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.29s/it]
2022-04-29 19:25:38.248: ______________________________Training Results______________________________

2022-04-29 19:25:38.266:                 precision    recall  f1-score   support

      ANALYSIS       0.99      0.97      0.98      5354
ARG_PETITIONER       0.94      0.96      0.95       745
ARG_RESPONDENT       0.91      0.91      0.91       351
           FAC       0.99      0.98      0.98      2790
         ISSUE       0.89      0.97      0.93       210
          NONE       0.98      0.93      0.96       899
      PREAMBLE       0.98      0.99      0.99      2515
PRE_NOT_RELIED       1.00      0.00      0.00        73
    PRE_RELIED       0.83      0.99      0.90       754
         RATIO       0.85      0.90      0.87       374
           RLC 

In [20]:
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=0)
test_true,test_pred = predict(test_data_loader,model)
test_clf = classification_report(test_true,test_pred,zero_division=True,target_names=target_list,output_dict=True)
rw.log(f'{"_"*30}Test Results{"_"*30}\n')
rw.log(classification_report(test_true,test_pred,zero_division=True,target_names=target_list))

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:15<00:00,  1.28s/it]
2022-04-29 19:25:53.636: ______________________________Test Results______________________________

2022-04-29 19:25:53.641:                 precision    recall  f1-score   support

      ANALYSIS       0.73      0.76      0.75       984
ARG_PETITIONER       0.25      0.23      0.24        70
ARG_RESPONDENT       0.38      0.61      0.47        38
           FAC       0.77      0.74      0.76       580
         ISSUE       0.70      0.76      0.73        50
          NONE       0.95      0.85      0.89       190
      PREAMBLE       0.93      0.86      0.89       508
PRE_NOT_RELIED       1.00      0.00      0.00        12
    PRE_RELIED       0.56      0.58      0.57       142
         RATIO       0.35      0.29      0.31        70
           RLC       0.39      0.47      0.42       116
           RPC       0.81      0.79      0.80        91
           STA       0.41  