https://github.com/theo2023/coco-bert-longformer

constants.py

In [None]:
# !pip install torch
# !pip install pandas
# !pip install transformers
# !pip install datasets
# !pip install scikit-learn

Mounting Google Drive to access the dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset folder referenced by
# dest_path is reachable. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil, os
import torch, time, sys, random, argparse
import pandas as pd
import numpy as np
import torch.distributed as dist
import torch.multiprocessing as mp
import argparse

from torch.nn.utils import clip_grad_norm_
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, LongformerTokenizer, LongformerForSequenceClassification, logging
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Root folder on the mounted Drive. The retrieve_*_data() helpers read
# <dest_path>/{Param,Return,Summary}/{train,valid,test}.json from it and the
# training cell writes checkpoints under <dest_path>/results/.
dest_path = '/content/drive/MyDrive/public-inconsistency-detection-data'

In [None]:
# Upper bound on the number of passes over the training data in train().
MAX_EPOCHS = 5
# Number of consecutive epochs without a validation-F1 improvement before
# train() stops early.
# NOTE(review): TOLERANCE > MAX_EPOCHS, so the patience counter can never
# reach it and early stopping is effectively disabled — confirm intended.
TOLERANCE = 10
# Per-step DataLoader batch size (micro-batch).
BATCH_SIZE = 4
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1E-5
# Number of output labels of the classification head (see get_model()).
NUM_CLASSES = 2
# Seed used by train_main()/eval_main() when the caller passes seed=None.
DEFAULT_SEED = 12
# Maximum number of token ids kept per encoded comment/code pair.
MAX_LEN = 1024
# Micro-batches per optimizer step; train_main() reports
# BATCH_SIZE * ACCUM_ITERS as the effective batch size.
ACCUM_ITERS = 8
# Default for args.gpus in train_main(). The distributed launch code is
# commented out, so this value is stored but not otherwise used.
NUM_GPUS = 8

dataset.py

In [None]:
class CocoDataset(Dataset):
  """Comment/code pairs tokenized for Longformer sequence classification.

  The whole DataFrame is encoded eagerly in ``__init__``. Every item is a
  ``(token_ids, attention_mask, segment_ids, label)`` tuple of tensors, with
  all sequences padded to the length of the longest encoded pair.

  Expected DataFrame columns: ``span_diff_code_subtokens``,
  ``old_comment_raw`` and ``label``.
  """

  def __init__(self, df):
    """Load the pretrained Longformer tokenizer and encode every row of ``df``."""
    logging.set_verbosity_error()
    self.df = df
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.data = self.load_data(self.df)

  def __len__(self):
    """Return the number of encoded examples."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the ``(token_ids, attention_mask, segment_ids, label)`` tuple at ``index``."""
    return self.data[index]

  def load_data(self, df):
    """Encode every row of ``df`` and return the result as a ``TensorDataset``.

    Each pair is laid out as ``<cls> comment <sep> code <sep>`` and cut to at
    most ``MAX_LEN`` token ids.
    """
    cls_id = self.tokenizer.cls_token_id
    sep_id = self.tokenizer.sep_token_id
    pad_id = self.tokenizer.pad_token_id

    token_ids = []
    mask_ids = []
    seg_ids = []
    labels = []

    code_list = df['span_diff_code_subtokens'].to_list()
    comment_list = df['old_comment_raw'].to_list()
    label_list = df['label'].to_list()

    for (code, comment, label) in zip(code_list, comment_list, label_list):
      code_id = self.tokenizer.encode(code, add_special_tokens=False, truncation=True, max_length=MAX_LEN)
      comment_id = self.tokenizer.encode(comment, add_special_tokens=False, truncation=True, max_length=MAX_LEN)

      pair_token_ids = self.truncate([cls_id] + comment_id + [sep_id] + code_id + [sep_id])
      # An untruncated pair always ends with the separator, so a different
      # last token means truncation cut it off; restore it so every sequence
      # stays properly terminated.
      if pair_token_ids[-1] != sep_id:
        pair_token_ids[-1] = sep_id

      # Build the mask / segment tensors from the final (truncated) length so
      # they always line up with the token ids.
      seq_len = len(pair_token_ids)
      token_ids.append(torch.tensor(pair_token_ids))
      mask_ids.append(torch.ones(seq_len, dtype=torch.long))
      seg_ids.append(torch.zeros(seq_len, dtype=torch.long))
      labels.append(label)

    # pad_sequence pads with 0 by default, but in the RoBERTa-style vocabulary
    # used by Longformer id 0 is the <s> token, not <pad>. Pad the token ids
    # with the tokenizer's own pad id; mask and segment ids stay 0-padded.
    token_ids = pad_sequence(token_ids, batch_first=True, padding_value=pad_id)
    mask_ids = pad_sequence(mask_ids, batch_first=True, padding_value=0)
    seg_ids = pad_sequence(seg_ids, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)

    return TensorDataset(token_ids, mask_ids, seg_ids, labels)

  def truncate(self, ids):
    """Return ``ids`` limited to its first ``MAX_LEN`` elements."""
    return ids[:MAX_LEN] if len(ids) > MAX_LEN else ids

def retrieve_train_data():
  """Read the Param, Return and Summary ``train.json`` files under ``dest_path``
  and return them concatenated row-wise, in that order, as one DataFrame."""
  frames = [
    pd.read_json(os.path.join(dest_path, category, "train.json"))
    for category in ("Param", "Return", "Summary")
  ]
  return pd.concat(frames, axis=0)

def retrieve_valid_data():
  """Read the Param, Return and Summary ``valid.json`` files under ``dest_path``
  and return them concatenated row-wise, in that order, as one DataFrame."""
  frames = [
    pd.read_json(os.path.join(dest_path, category, "valid.json"))
    for category in ("Param", "Return", "Summary")
  ]
  return pd.concat(frames, axis=0)

def retrieve_test_data():
  """Read the Param, Return and Summary ``test.json`` files under ``dest_path``
  and return them concatenated row-wise, in that order, as one DataFrame."""
  frames = [
    pd.read_json(os.path.join(dest_path, category, "test.json"))
    for category in ("Param", "Return", "Summary")
  ]
  return pd.concat(frames, axis=0)

metrics.py

In [None]:
def compute_metrics(predicted_labels, true_labels):
  """Score predictions against gold labels.

  Both arguments are iterables whose elements expose ``.item()`` (for example
  0-dim tensors). Returns a dict with the keys ``precision``, ``recall``,
  ``f1`` and ``acc``. Precision, recall and F1 are computed with
  ``zero_division=0``.
  """
  y_pred = [pred.item() for pred in predicted_labels]
  y_true = [gold.item() for gold in true_labels]

  assert len(y_pred) == len(y_true)

  return {
    'precision': precision_score(y_true, y_pred, zero_division=0),
    'recall': recall_score(y_true, y_pred, zero_division=0),
    'f1': f1_score(y_true, y_pred, zero_division=0),
    'acc': accuracy_score(y_true, y_pred),
  }

model.py

In [None]:
def get_model():
  """Return a pretrained ``allenai/longformer-base-4096`` sequence classifier
  with ``NUM_CLASSES`` output labels, after setting the transformers logging
  verbosity to errors only."""
  logging.set_verbosity_error()
  return LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=NUM_CLASSES,
  )

train.py

In [None]:
# check gpu availability
# Prints whether CUDA is usable and how many devices are visible.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# The next two results are discarded; only the value of the cell's last
# expression (the name of device 0) is displayed by the notebook.
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.get_device_name(0)

True
1


'Tesla V100-SXM2-16GB'

In [None]:
def train(gpu, args):
    """Fine-tune ``args.model`` and save the weights with the best validation F1.

    Runs up to ``MAX_EPOCHS`` epochs of training followed by validation.
    Gradients are accumulated over ``ACCUM_ITERS`` micro-batches per optimizer
    step. Whenever the validation F1 improves, the unwrapped model's
    ``state_dict`` is written to ``args.path``.

    Args:
        gpu: CUDA device index the input batches are moved to.
        args: Namespace providing ``seed`` (int), ``model`` (the classifier to
            train) and ``path`` (checkpoint file to write).

    The single-process ``torch.nn.DataParallel`` wrapper is used; the
    multi-process DistributedDataParallel variant is not enabled here.
    """
    print("set hyperparameters")

    torch.manual_seed(args.seed)
    device = torch.device('cuda', gpu)
    classifier = args.model.to("cuda")

    print("set optimizer and model")

    classifier = torch.nn.DataParallel(classifier) # only for Longformer
    optimizer = torch.optim.Adam(classifier.parameters(), lr=LEARNING_RATE)

    print("retrieve training data")

    train_data = CocoDataset(retrieve_train_data())
    valid_data = CocoDataset(retrieve_valid_data())
    train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)
    valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

    patience = 0
    best_valid_f1 = 0.0
    num_train_batches = len(train_loader)

    print("start training")

    for epoch in range(MAX_EPOCHS):
        if patience >= TOLERANCE and gpu == 0:
            print(f"Validation F1 did not improve for {TOLERANCE} epochs. Terminating training.")
            break

        # Always record the start time: it is read unconditionally when the
        # epoch summary is printed below.
        start = time.time()

        classifier.train()
        train_loss = 0.0
        predictions = []
        gold_labels = []

        print("First time training")
        # Start each epoch with clean gradients. They are cleared again only
        # after an optimizer step, so the micro-batches in between accumulate.
        optimizer.zero_grad()
        for batch_idx, (sequence, attention_masks, token_type_ids, labels) in enumerate(train_loader):
            sequence = sequence.to(device, non_blocking=True)
            attention_masks = attention_masks.to(device, non_blocking=True)
            token_type_ids = token_type_ids.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # classifier inherits from nn.Module, so this is a call to forward()
            model_output = classifier(sequence, attention_mask=attention_masks, token_type_ids=token_type_ids, labels=labels)

            # DataParallel gathers one loss per replica when several GPUs are
            # used; mean() reduces that to a scalar and is a no-op otherwise.
            loss = model_output.loss.mean()
            # Track the unscaled loss so train_loss is comparable to valid_loss.
            train_loss += loss.item()
            prediction = torch.argmax(model_output.logits, dim=-1)

            # Scale only the backward pass so the accumulated gradient is the
            # average over the accumulation window.
            (loss / ACCUM_ITERS).backward()

            if ((batch_idx + 1) % ACCUM_ITERS == 0) or (batch_idx + 1 == num_train_batches):
                # Clip once, on the fully accumulated gradient, right before
                # the parameter update.
                clip_grad_norm_(classifier.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

            predictions.extend(prediction)
            gold_labels.extend(labels)

        train_loss /= len(train_loader)
        train_metrics = compute_metrics(predictions, gold_labels)
        train_precision = train_metrics['precision']
        train_recall = train_metrics['recall']
        train_f1 = train_metrics['f1']
        train_acc = train_metrics['acc']

        print("Train f1: ", train_f1)
        print("Train acc: ", train_acc)
        print("Train precision: ", train_precision)
        print("Train recall: ", train_recall)

        classifier.eval()
        valid_loss = 0.0
        predictions = []
        gold_labels = []

        print("Validation training")

        with torch.no_grad():
            for sequence, attention_masks, token_type_ids, labels in valid_loader:
                sequence = sequence.to(device, non_blocking=True)
                attention_masks = attention_masks.to(device, non_blocking=True)
                token_type_ids = token_type_ids.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                model_output = classifier(sequence, attention_mask=attention_masks, token_type_ids=token_type_ids, labels=labels)
                loss = model_output.loss.mean()
                prediction = model_output.logits

                valid_loss += loss.item()
                prediction = torch.argmax(prediction, dim=-1)

                predictions.extend(prediction)
                gold_labels.extend(labels)

        valid_loss /= len(valid_loader)
        valid_metrics = compute_metrics(predictions, gold_labels)
        valid_precision = valid_metrics['precision']
        valid_recall = valid_metrics['recall']
        valid_f1 = valid_metrics['f1']
        valid_acc = valid_metrics['acc']

        print("Valid f1: ", valid_f1)
        print("Valid acc: ", valid_acc)
        print("Valid precision: ", valid_precision)
        print("Valid recall: ", valid_recall)

        if valid_f1 > best_valid_f1:
            best_valid_f1 = valid_f1
            patience = 0 # reset
            print(f"New best validation F1 of {valid_f1:.3f}. Saving model.")
            # Save the wrapped module's weights so the checkpoint keys carry
            # no DataParallel "module." prefix.
            torch.save(classifier.module.state_dict(), args.path)
        else:
            patience += 1

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, sec = divmod(rem, 60)

        print(f"Epoch {epoch + 1}: train_loss: {train_loss:.3f} train_precision: {train_precision:.3f} train_recall: {train_recall:.3f} train_f1: {train_f1:.3f} train_acc: {train_acc:.3f}")
        print(f"\t valid_loss: {valid_loss:.3f} valid_precision: {valid_precision:.3f} valid_recall: {valid_recall:.3f} valid_f1: {valid_f1:.3f} valid_acc: {valid_acc:.3f}")
        print("\t {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), sec))

main program (train.py)

In [None]:
def train_main(seed=None, path=None, nodes=None, gpus=None, nr=None):
  """Seed all RNGs, build the model and run a single training job.

  Arguments left as ``None`` fall back to: ``DEFAULT_SEED`` for ``seed``,
  ``'.'`` for ``path``, ``1`` for ``nodes``, ``NUM_GPUS`` for ``gpus`` and
  ``0`` for ``nr``. All values are stored on the namespace handed to
  ``train``, which is invoked with device index 0.
  """
  torch.cuda.empty_cache()
  print(f"Effective batch size: {BATCH_SIZE * ACCUM_ITERS} Learning rate: {LEARNING_RATE}")

  # Settings container passed on to train().
  args = argparse.Namespace(
    seed=DEFAULT_SEED if seed is None else int(seed),
    nodes=1 if nodes is None else int(nodes),
    gpus=NUM_GPUS if gpus is None else int(gpus),
    nr=0 if nr is None else int(nr),
    path='.' if path is None else str(path),
  )

  # Seed every RNG in use and request deterministic cuDNN kernels.
  run_seed = args.seed
  torch.manual_seed(run_seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(run_seed)
  random.seed(run_seed)
  os.environ['PYTHONHASHSEED'] = str(run_seed)

  rule = "=" * 80
  print(rule, "TRAINING", rule, sep="\n")

  args.model = get_model()
  train(0, args)

eval.py

In [None]:
def test(classifier, test_loader, device):
  """Evaluate ``classifier`` on ``test_loader`` and print loss and metrics.

  Each batch is a ``(sequence, attention_masks, token_type_ids, labels)``
  tuple that is moved to ``device`` before the forward pass. The printed loss
  is the mean of the per-batch losses; precision, recall, F1 and accuracy come
  from ``compute_metrics``. Nothing is returned.
  """
  classifier.eval()
  running_loss = 0.0
  predictions, gold_labels = [], []

  with torch.no_grad():
    for batch in test_loader:
      sequence, attention_masks, token_type_ids, labels = (tensor.to(device) for tensor in batch)

      output = classifier(sequence, attention_mask=attention_masks, token_type_ids=token_type_ids, labels=labels)
      running_loss += output.loss.item()

      predictions.extend(torch.argmax(output.logits, dim=-1))
      gold_labels.extend(labels)

  test_loss = running_loss / len(test_loader)
  test_metrics = compute_metrics(predictions, gold_labels)
  test_acc = test_metrics['acc']
  test_precision = test_metrics['precision']
  test_f1 = test_metrics['f1']
  test_recall = test_metrics['recall']

  print(f"test_loss: {test_loss:.3f} test_precision: {test_precision:.3f} test_recall: {test_recall:.3f} test_f1: {test_f1:.3f} test_acc: {test_acc:.3f}")


In [None]:
def eval_main(seed, path):
  """Restore saved weights and report metrics on the test split.

  Args:
    seed: RNG seed; ``None`` falls back to ``DEFAULT_SEED``.
    path: Checkpoint file holding a model ``state_dict``; ``None`` falls back
      to ``'.'``.

  Evaluation runs on CUDA when available and on CPU otherwise. Results are
  printed by ``test``; nothing is returned.
  """
  seed = DEFAULT_SEED if seed is None else int(seed)
  path = '.' if path is None else str(path)

  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)

  classifier = get_model()

  print(80 * "=")
  print("TESTING")
  print(80 * "=")

  test_df = retrieve_test_data()
  test_data = CocoDataset(test_df)
  # No shuffling: the reported loss is a mean of per-batch means, so a fixed
  # batch composition keeps it reproducible (same as the validation loader).
  test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

  print("Restoring the best model weights")
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  # map_location lets a checkpoint saved from GPU tensors load on a CPU-only
  # machine, which the device fallback above is meant to support.
  state_dict = torch.load(path, map_location=device)
  # strict=False is kept for backward compatibility, but mismatched keys are
  # reported instead of being silently ignored.
  load_result = classifier.load_state_dict(state_dict, strict=False)
  if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Warning: checkpoint keys do not fully match the model. Missing: {load_result.missing_keys} Unexpected: {load_result.unexpected_keys}")
  classifier.to(device)
  print("Final evaluation to test set")
  test(classifier, test_loader, device)

In [None]:
# Random seeds to run; the training cell below trains one model per seed.
seeds = [12, 17, 22]

In [None]:
# Train one model per seed, each into its own results directory on Drive.
# NOTE(review): the directory name says "bs64", but BATCH_SIZE * ACCUM_ITERS
# is 4 * 8 = 32 — confirm which is intended before comparing runs.
for seed in seeds:
  print("Seed", seed)
  path = os.path.join(dest_path, f"results/longformer_jit_cm_bs64lr1e-5_{seed}")
  # Create the output directory up front so the checkpoint can be written.
  os.makedirs(path, exist_ok=True)
  new_path = os.path.join(path, "model.weights")
  print(new_path)
  train_main(seed=seed, path=new_path)
  # eval_main(seed=seed, path=os.path.join(path, "model.weights"))

Seed 12
/content/drive/MyDrive/public-inconsistency-detection-data/results/longformer_jit_cm_bs64lr1e-5_12/model.weights


NameError: name 'train_main' is not defined

In [None]:
# Evaluate the checkpoint trained with the first seed on the test split.
path = os.path.join(dest_path, f"results/longformer_jit_cm_bs64lr1e-5_{seeds[0]}")
new_path = os.path.join(path, "model.weights")
eval_main(seed=seeds[0], path=new_path)

TESTING
Restoring the best model weights
Final evaluation to test set
test_loss: 0.551 test_precision: 0.801 test_recall: 0.633 test_f1: 0.707 test_acc: 0.738
