In [1]:
import comet_ml

In [2]:
import os
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/mnt/Research/peter-research/peter_devine_nlp_models"
os.environ["PYTORCH_TRANSFORMERS_CACHE"] = "/mnt/Research/peter-research/peter_devine_nlp_models"
os.environ['COMET_API_KEY'] = "cQ08Sxaq3jBSiwFrzNdIuOCv5"

In [3]:
from downloader import download_datasets
from dataset_preparation import prepare_datasets

In [4]:
# 'chen_2014', 'ciurumelea_2017', 'di_sorbo_2016', 'guzman_2015', 'maalej_2016', 'scalabrino_2017', 'tizard_2019', 'williams_2017'

dataset_list = ['chen_2014', 'ciurumelea_2017', 'di_sorbo_2016', 'guzman_2015', 'maalej_2016', 'scalabrino_2017', 'tizard_2019', 'williams_2017']

dataset_list = ['di_sorbo_2016', 'guzman_2015', 'maalej_2016', 'scalabrino_2017', 'tizard_2019', 'williams_2017']
label_granularity = "bug_feature_other"
is_multiclass = not (label_granularity == "requirements_relevance")
problem_type = "multi_label_classification" if is_multiclass else "single_label_classification"
model_name = "distilbert-base-uncased"

In [7]:
# download_datasets(dataset_list, label_granularity=label_granularity)

Downloading di_sorbo_2016
Downloading guzman_2015
Downloading maalej_2016
Downloading scalabrino_2017
Downloading tizard_2019
Downloading williams_2017


In [8]:
train_dataset, val_dataset, test_dataset, label_order = prepare_datasets(dataset_list, label_granularity, model_name, is_multiclass)

di_sorbo_2016
guzman_2015
maalej_2016
scalabrino_2017
tizard_2019
williams_2017


In [9]:
label_order = list(label_order)
label_order.remove("text")
label_order

['bug', 'feature', 'other']

In [10]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
import torch
from sklearn.metrics import brier_score_loss, accuracy_score, precision_recall_fscore_support, ndcg_score, roc_auc_score
import numpy as np

def get_aprf1(labels, pred_label):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred_label, average='binary', zero_division=0)
    acc = accuracy_score(labels, pred_label)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def get_cutoff_metrics(labels, preds, cutoff):
    pred_label = preds > cutoff
    metrics = get_aprf1(labels, pred_label)
    return {f"{k}_{cutoff}": v for k, v in metrics.items()}

def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    
    results = {}
    
    if is_multiclass:
        sig = torch.nn.Sigmoid()
        perc_preds = sig(torch.Tensor(logits)).numpy()
        num_pred_classes = logits.shape[1]
        for class_num in range(num_pred_classes):
            class_name = label_order[class_num]

            class_perc_preds = perc_preds[:, class_num]
            class_labels = labels[:, class_num]

            results[f"{class_name}_roc_auc"] = roc_auc_score(class_labels, class_perc_preds)
    #         for i in range(1,10):
    #             cutoff_metrics = get_cutoff_metrics(class_labels, class_perc_preds, i / 10)
    #             cutoff_metrics = {f"{class_name}_{k}":v for k, v in cutoff_metrics.items()}
    #             results.update(cutoff_metrics)

            cutoff_metrics = get_cutoff_metrics(class_labels, class_perc_preds, 0.5)
            cutoff_metrics = {f"{class_name}_{k}":v for k, v in cutoff_metrics.items()}
            results.update(cutoff_metrics)
    else:
        
        predictions = np.argmax(logits, axis=-1)
        results = get_aprf1(labels, predictions)
        
    return results

training_args = TrainingArguments(
    output_dir=f'./results_{label_granularity}',          # output directory
    num_train_epochs=6,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    evaluation_strategy="epoch",
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=f'./logs_{label_granularity}',            # directory for storing logs
    metric_for_best_model="loss",
    logging_steps=10,
    greater_is_better=False,
    load_best_model_at_end=True,
    fp16=True,
)

config = AutoConfig.from_pretrained(model_name, cache_dir="/mnt/Research/peter-research/peter_devine_nlp_models")
num_labels = 3 if label_granularity == "bug_feature_other" else 2
config.num_labels = num_labels
config.problem_type = problem_type
model = AutoModelForSequenceClassification.from_config(config)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model()

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/peter-devine/huggingface/e02bc8a98f0a46888e667f9971a25330



Epoch,Training Loss,Validation Loss,Bug Roc Auc,Bug Accuracy 0.5,Bug F1 0.5,Bug Precision 0.5,Bug Recall 0.5,Feature Roc Auc,Feature Accuracy 0.5,Feature F1 0.5,Feature Precision 0.5,Feature Recall 0.5,Other Roc Auc,Other Accuracy 0.5,Other F1 0.5,Other Precision 0.5,Other Recall 0.5
1,0.4364,0.405107,0.871533,0.84069,0.59298,0.690435,0.519634,0.790164,0.82169,0.2375,0.629139,0.146379,0.858486,0.780181,0.830935,0.834312,0.827586
2,0.3335,0.39859,0.909302,0.86349,0.65687,0.748744,0.585079,0.84549,0.843905,0.371765,0.78607,0.243451,0.877188,0.789535,0.849937,0.794932,0.913121
3,0.2657,0.366095,0.924498,0.87109,0.66103,0.800745,0.562827,0.858028,0.842736,0.568218,0.592965,0.545455,0.884861,0.799474,0.846875,0.844237,0.84953
4,0.231,0.376468,0.925003,0.881906,0.704246,0.799003,0.629581,0.8633,0.839228,0.600291,0.568088,0.636364,0.883556,0.796551,0.846493,0.833985,0.859382
5,0.1991,0.394284,0.923603,0.875183,0.723266,0.716303,0.730366,0.85847,0.845075,0.578696,0.597701,0.560863,0.883792,0.802397,0.846851,0.856946,0.836991
6,0.1777,0.411081,0.922434,0.881029,0.719504,0.759825,0.683246,0.857836,0.841567,0.590015,0.579495,0.600924,0.884806,0.804151,0.849641,0.851552,0.847738




In [9]:
from dataset_downloaders.label_mappings import relevance_dataset_mappings

relevance_dataset_mappings.keys()

dict_keys(['chen_2014', 'ciurumelea_2017', 'di_sorbo_2016', 'guzman_2015', 'maalej_2016', 'scalabrino_2017', 'tizard_2019', 'williams_2017'])

In [12]:
import pandas as pd
import numpy as np

In [14]:
np.argmax(pd.read_csv(os.path.join("./data", f"guzman_2015_requirements_relevance.csv")).drop("text", axis=1).values, axis=1)

array([1, 0, 1, ..., 0, 0, 1])