In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig, AutoTokenizer

comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
# Load the feedback export from a CSV file located relative to the working directory.
# NOTE(review): the file name looks like a local browser-download copy ("BACKUP (4)");
# confirm this is the intended dataset version before relying on the numbers below.
df = pd.read_csv("final_data_BACKUP (4).csv")

In [3]:
# Keep only the columns this notebook uses. The explicit .copy() yields an independent
# frame, so the label columns assigned in later cells are written to it directly rather
# than to a slice of the raw frame (which makes pandas emit SettingWithCopyWarning).
df = df[['feedback_text', 'feedback_date', 'feedback_platform', 'tags']].copy()

In [4]:
# Preview the first rows of the trimmed frame (text, date, platform, tags).
df.head()

Unnamed: 0,feedback_text,feedback_date,feedback_platform,tags
0,Must have app!. My MUST have app!!! My favorit...,2020-09-06 18:01:30+00:00,Apple App Store,Requirements irrelevant
1,Spotify review. I lt is the best music app I h...,2020-09-11 20:47:30+00:00,Apple App Store,Requirements irrelevant
2,Gaona music. I like Gaona music so much.,2020-09-11 20:51:19+00:00,Apple App Store,Requirements irrelevant
3,Spotify.. Fantastic app for old & brand new wo...,2020-09-08 08:03:31+00:00,Apple App Store,Requirements irrelevant
4,"Can’t see lyrics. Don’t know why, iPad doesn’t...",2020-09-10 17:20:43+00:00,Apple App Store,Unexpected behaviour (View lyrics)


In [5]:
# Binary relevance labels: a row is "irrelevant" when its (lower-cased) tags mention
# "requirements irrelevant"; every other row is treated as relevant.
irrelevant_mask = df.tags.str.lower().str.contains("requirements irrelevant")
df["is_irrel"] = irrelevant_mask
df["is_rel"] = ~irrelevant_mask

In [6]:
# Lower-case the tags once and reuse the result for every keyword lookup instead of
# recomputing df.tags.str.lower() for each individual match.
tags_lower = df.tags.str.lower()


def _tags_mention_any(*keywords):
    """Return a Series flagging rows whose lower-cased tags contain any keyword.

    The per-keyword matches are OR-ed together left to right, in the order the
    keywords are given.
    """
    matches = tags_lower.str.contains(keywords[0])
    for keyword in keywords[1:]:
        matches = matches | tags_lower.str.contains(keyword)
    return matches


# Multi-label targets: each flag is set independently from the tag text, so a single
# row may carry more than one of them.
df["is_bug"] = _tags_mention_any("crashing", "network issue", "unexpected behaviour")
df["is_feat"] = _tags_mention_any("feature")
df["is_oth"] = _tags_mention_any(
    "requirements irrelevant",
    "information seeking",
    "informing",
    "non-functional request",
)

In [7]:
# Evaluation configuration.
# label_granularity selects the label scheme; "requirements_relevance" is the only value
# that switches the notebook to the single-label path, anything else is multi-label.
label_granularity = "bug_feature_other"
model_name = "distilbert-base-uncased"
# NOTE: despite its name, is_multiclass selects the *multi-label* problem type below.
is_multiclass = label_granularity != "requirements_relevance"
problem_type = "multi_label_classification" if is_multiclass else "single_label_classification"

In [8]:
# Class names by prediction column index; used to prefix the per-class metric keys.
label_order = [
    "bug",
    "feature",
    "other",
]

In [9]:
from dataset_preparation import FeedbackDataset
import numpy as np
# Tokenise every feedback text in one call: padded, truncated to at most 256 tokens,
# returned as NumPy arrays.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_text = tokenizer(
    text=df.feedback_text.tolist(),
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="np",
)

# Wrap tokens and labels in the project's dataset class. The single-label path collapses
# the two boolean relevance columns into a class index (0 = irrelevant, 1 = relevant);
# the multi-label path passes the three boolean flag columns through unchanged.
if not is_multiclass:
    relevance_ids = np.argmax(df[["is_irrel", "is_rel"]].values, axis=1)
    dataset = FeedbackDataset(tokenized_text, relevance_ids, is_multiclass=False)
else:
    flag_matrix = df[["is_bug", "is_feat", "is_oth"]].values
    dataset = FeedbackDataset(tokenized_text, flag_matrix, is_multiclass=True)

In [11]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig, AutoTokenizer
import torch
from sklearn.metrics import brier_score_loss, accuracy_score, precision_recall_fscore_support, ndcg_score, roc_auc_score

def get_aprf1(labels, pred_label):
    """Return accuracy, F1, precision and recall for binary predictions.

    Args:
        labels: Ground-truth binary labels.
        pred_label: Predicted binary labels, aligned with ``labels``.

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision`` and ``recall`` (in that order).
        Precision/recall/F1 use ``average='binary'`` with ``zero_division=0``.
    """
    prec, rec, f_score, _support = precision_recall_fscore_support(
        labels, pred_label, average='binary', zero_division=0
    )
    accuracy = accuracy_score(labels, pred_label)
    return dict(accuracy=accuracy, f1=f_score, precision=prec, recall=rec)

def get_cutoff_metrics(labels, preds, cutoff):
    """Binarise scores at ``cutoff`` and return the ``get_aprf1`` metrics for them.

    Args:
        labels: Ground-truth binary labels.
        preds: Predicted scores; an entry counts as positive when strictly above ``cutoff``.
        cutoff: Decision threshold; also appended to every metric key.

    Returns:
        dict: The ``get_aprf1`` metrics with ``_<cutoff>`` appended to each key,
        e.g. ``accuracy_0.5``.
    """
    binarised = preds > cutoff
    suffixed = {}
    for metric_name, value in get_aprf1(labels, binarised).items():
        suffixed[f"{metric_name}_{cutoff}"] = value
    return suffixed

def multiclass_compute_metrics(eval_pred):
    """Compute per-class ROC AUC and 0.5-cutoff metrics for multi-label predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both are indexed as 2-D arrays with one
            column per class; column ``i`` is reported under ``label_order[i]``.

    Returns:
        dict: For every class ``<name>``: ``<name>_roc_auc`` plus each metric returned by
        ``get_cutoff_metrics`` at cutoff 0.5, prefixed with ``<name>_``.
        ``<name>_roc_auc`` is NaN when that class's labels hold fewer than two distinct
        values (ROC AUC is undefined there), so one degenerate class, e.g. on a small
        per-platform subset, does not abort the whole evaluation.
    """
    logits, labels = eval_pred

    # Each class gets its own sigmoid (multi-label), so scores are not normalised
    # across classes.
    sig = torch.nn.Sigmoid()
    perc_preds = sig(torch.Tensor(logits)).numpy()

    results = {}

    num_pred_classes = logits.shape[1]
    for class_num in range(num_pred_classes):
        class_name = label_order[class_num]

        class_perc_preds = perc_preds[:, class_num]
        class_labels = labels[:, class_num]

        # ROC AUC needs both a positive and a negative example; report NaN otherwise.
        if len(set(class_labels.tolist())) < 2:
            results[f"{class_name}_roc_auc"] = float("nan")
        else:
            results[f"{class_name}_roc_auc"] = roc_auc_score(class_labels, class_perc_preds)

        cutoff_metrics = get_cutoff_metrics(class_labels, class_perc_preds, 0.5)
        results.update({f"{class_name}_{k}": v for k, v in cutoff_metrics.items()})

    return results

def singleclass_compute_metrics(eval_pred):
    """Compute accuracy/F1/precision/recall for single-label predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row is the
            arg-max over the last axis of ``logits``.

    Returns:
        dict: The metrics produced by ``get_aprf1`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    return get_aprf1(labels, np.argmax(logits, axis=-1))

# Pick the metric function that matches the label scheme chosen in the config cell.
compute_metrics = multiclass_compute_metrics if is_multiclass else singleclass_compute_metrics

# NOTE(review): machine-specific absolute paths. They are kept verbatim so the notebook
# still runs where it was written, but should move to a configurable location.
MODEL_CACHE_DIR = "/mnt/Research/peter-research/peter_devine_nlp_models"
# NOTE(review): this checkpoint path is fixed to the "bug_feature_other" run regardless of
# label_granularity; confirm what should be loaded for other granularities.
FINETUNED_WEIGHTS_PATH = "/home/pdev438/projects/user_feedback_labeller/results_bug_feature_other/pytorch_model.bin"

training_args = TrainingArguments(
    output_dir=f'./results_test_{label_granularity}',          # output directory
    per_device_eval_batch_size=64,   # batch size for evaluation
    logging_dir=f'./logs_test_{label_granularity}',            # directory for storing logs
    # fp16 needs a CUDA device; fall back to full precision on CPU-only machines
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
)

# Build the architecture from the base model's config, resized to the label scheme,
# then overwrite its randomly initialised weights with the fine-tuned checkpoint.
config = AutoConfig.from_pretrained(model_name, cache_dir=MODEL_CACHE_DIR)
num_labels = 3 if label_granularity == "bug_feature_other" else 2
config.num_labels = num_labels
config.problem_type = problem_type
model = AutoModelForSequenceClassification.from_config(config)
# map_location="cpu" lets a checkpoint saved from a GPU load on any machine; the Trainer
# moves the model to its own device afterwards.
# NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
model.load_state_dict(torch.load(FINETUNED_WEIGHTS_PATH, map_location="cpu"))

# This notebook only evaluates, so the same dataset is passed for both roles.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be evaluated
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset,         # training dataset (unused by evaluate/predict)
    eval_dataset=dataset,             # evaluation dataset
    compute_metrics=compute_metrics,
)

In [12]:
# Evaluate on the trainer's eval_dataset (the full tokenised dataset it was built with)
# and display the resulting metrics dict.
trainer.evaluate()



{'eval_loss': 0.5074060559272766,
 'eval_bug_roc_auc': 0.9015106286748077,
 'eval_bug_accuracy_0.5': 0.826,
 'eval_bug_f1_0.5': 0.6813186813186813,
 'eval_bug_precision_0.5': 0.8611111111111112,
 'eval_bug_recall_0.5': 0.5636363636363636,
 'eval_feature_roc_auc': 0.7024460057246943,
 'eval_feature_accuracy_0.5': 0.704,
 'eval_feature_f1_0.5': 0.421875,
 'eval_feature_precision_0.5': 0.40298507462686567,
 'eval_feature_recall_0.5': 0.4426229508196721,
 'eval_other_roc_auc': 0.8021861316253838,
 'eval_other_accuracy_0.5': 0.706,
 'eval_other_f1_0.5': 0.7521079258010117,
 'eval_other_precision_0.5': 0.7263843648208469,
 'eval_other_recall_0.5': 0.7797202797202797,
 'eval_runtime': 2.9797,
 'eval_samples_per_second': 167.804}

In [17]:
# Evaluate the loaded model separately on each feedback platform and record, next to the
# model metrics, the share of positive examples per label on that platform.
platform_metrics = {}

for platform_name in df.feedback_platform.unique():
    platform_df = df[df.feedback_platform == platform_name]

    # Loop-local names: do not rebind the notebook-level `tokenized_text` / `dataset`,
    # otherwise later cells would silently see only the last platform's subset.
    platform_tokens = tokenizer(text=platform_df.feedback_text.tolist(), padding=True, truncation=True, max_length=256, return_tensors="np")

    if is_multiclass:
        platform_dataset = FeedbackDataset(platform_tokens, platform_df[["is_bug", "is_feat", "is_oth"]].values, is_multiclass=True)
    else:
        platform_dataset = FeedbackDataset(platform_tokens, np.argmax(platform_df[["is_irrel", "is_rel"]].values, axis=1), is_multiclass=False)

    metrics = trainer.predict(platform_dataset).metrics
    # The mean of a boolean column is the fraction of rows where the flag is set.
    metrics["perc_pos_irrel"] = platform_df.is_irrel.mean()
    metrics["perc_pos_rel"] = platform_df.is_rel.mean()
    if is_multiclass:
        metrics["perc_pos_is_bug"] = platform_df.is_bug.mean()
        metrics["perc_pos_is_feat"] = platform_df.is_feat.mean()
        metrics["perc_pos_is_oth"] = platform_df.is_oth.mean()
    platform_metrics[platform_name] = metrics



In [18]:
# Tabulate the per-platform results: one column per platform, one row per metric.
pd.DataFrame(platform_metrics)

Unnamed: 0,Apple App Store,Spotify forum,Google Play Store,Reddit,Twitter
test_loss,0.441477,0.768893,0.299863,0.556092,0.470704
test_bug_roc_auc,0.89501,0.891131,0.965618,0.899185,0.91488
test_bug_accuracy_0.5,0.86,0.68,0.89,0.81,0.89
test_bug_f1_0.5,0.65,0.652174,0.702703,0.698413,0.731707
test_bug_precision_0.5,0.928571,0.967742,0.866667,0.846154,0.681818
test_bug_recall_0.5,0.5,0.491803,0.590909,0.594595,0.789474
test_feature_roc_auc,0.796263,0.630937,0.846917,0.541857,0.598238
test_feature_accuracy_0.5,0.76,0.62,0.82,0.57,0.75
test_feature_f1_0.5,0.52,0.472222,0.470588,0.31746,0.324324
test_feature_precision_0.5,0.448276,0.515152,0.470588,0.277778,0.315789


In [11]:
# Spot-check a slice of the dataset (input ids, attention mask, labels).
# NOTE(review): this cell's execution count is out of order and its saved output shows
# 1-D labels, so the output appears to come from an earlier run; re-run after
# Restart & Run All. Also note `dataset` is reassigned inside the per-platform loop
# above, so on a top-to-bottom run this slices whatever `dataset` was bound to last.
dataset[100:110]

{'input_ids': tensor([[  101,  2190,  2189,  ...,     0,     0,     0],
         [  101,  1045,  2293,  ...,     0,     0,     0],
         [  101,  2190,  2377,  ...,     0,     0,     0],
         ...,
         [  101, 10439,  2562,  ...,     0,     0,     0],
         [  101,  3115,  2028,  ...,     0,     0,     0],
         [  101,  2006,  2026,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0, 0, 0, 1, 1, 0, 1, 0, 1])}