In [6]:
import json

# Load the evaluation set. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open('../data/task3.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

test_texts = data['drafts']
test_labels = data['labels']

# Fail fast on a malformed file: every draft needs exactly one label, otherwise
# the metrics computed later would compare misaligned lists.
assert len(test_texts) == len(test_labels), 'drafts and labels must have the same length'

In [7]:
# If using the Together API
from together import Together

import os

your_model_name = 'ServiceNow-AI/Apriel-1.5-15b-Thinker'

# SECURITY: the API key must never be committed with the notebook. It is read
# from the environment instead; any key that was previously stored in this
# file should be treated as leaked and revoked.
your_api_key = os.environ.get('TOGETHER_API_KEY')
if not your_api_key:
    raise RuntimeError(
        'Set the TOGETHER_API_KEY environment variable before running this cell.'
    )
client = Together(api_key=your_api_key)

In [8]:
from tqdm import tqdm
import pandas as pd
import random

def _parse_answer(content):
    """Map the raw model reply to 1 ('yes'/'1'), 0 ('no'/'0') or None if unrecognised."""
    import re

    # The API may return no text at all; treat that as "unrecognised".
    if not content:
        return None
    # Skip leading quotes/markup, then require a whole-word answer so that
    # replies such as "not sure" or "nothing" are not mistaken for "no".
    match = re.match(r"[^a-z0-9]*(yes|no|1|0)\b", content.strip().lower())
    if match is None:
        return None
    return 1 if match.group(1) in ("yes", "1") else 0


def classify_texts(texts, seed=None):
    """Ask the chat model whether each draft resolution will be adopted.

    Args:
        texts: iterable of draft-resolution strings; one API request is sent
            per item via the module-level ``client`` and ``your_model_name``.
        seed: optional seed for the random fallback. When given, unparseable
            replies are resolved reproducibly; when ``None`` (default) the
            global ``random`` module is used, as before.

    Returns:
        A list with one int per input text: 1 for a 'yes'/'1' reply, 0 for a
        'no'/'0' reply, and a random choice of 0 or 1 when the reply cannot
        be interpreted.
    """
    rng = random.Random(seed) if seed is not None else random
    results = []
    for text in tqdm(texts):
        user_prompt = f"""
        The provided document is a United Nations Security Council's draft resolution. Predict whether the draft resolution will be adopted or not. Answer with 'yes' (1) or 'no' (0) without any explanation.

        Text: "{text}"
        Answer:
        """
        # No max_tokens / temperature are passed, so the API defaults apply.
        response = client.chat.completions.create(
            model=your_model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_prompt}
            ],
        )
        label = _parse_answer(response.choices[0].message.content)
        if label is None:
            # Unparseable reply: fall back to a coin flip so the output keeps
            # one prediction per input.
            label = rng.choice([0, 1])
        results.append(label)
    return results


# Query the model once per test draft and keep the resulting 0/1 predictions.
pred = classify_texts(test_texts)

100%|██████████| 30/30 [02:42<00:00,  5.42s/it]


In [9]:
# calculate metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score, matthews_corrcoef
from imblearn.metrics import geometric_mean_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, precision_recall_curve, auc

def calculate_metrics(pred, labels):
    """Print binary-classification metrics for 0/1 predictions against 0/1 labels.

    Both inputs are flipped (0 <-> 1) before scoring, so every "positive-class"
    metric below (precision, recall, F1, PR AUC) refers to the class that is
    encoded as 0 in the inputs.

    Args:
        pred: sequence of predicted labels, each 0 or 1.
        labels: sequence of reference labels, each 0 or 1, aligned with ``pred``.

    Returns:
        None. The metrics are printed one per line, followed by a header row
        and a single space-separated row rounded to four decimals.
    """
    # swap 0 and 1
    pred = [1 - x for x in pred]
    labels = [1 - x for x in labels]
    acc = accuracy_score(labels, pred)
    try:
        roc_auc = roc_auc_score(labels, pred)
    except ValueError:
        # Raised when ROC AUC is undefined for the given labels; report 0.
        roc_auc = 0
    balanced_acc = balanced_accuracy_score(labels, pred)
    # zero_division=0 keeps the 0.0 result for an empty positive prediction
    # set but avoids the UndefinedMetricWarning.
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, pred, average='binary', zero_division=0
    )
    # PR AUC is the trapezoidal area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(labels, pred)
    pr_auc = auc(recall, precision)
    mcc = matthews_corrcoef(labels, pred)
    g_mean = geometric_mean_score(labels, pred)
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs;
    # otherwise ravel() yields a single value and the unpacking fails.
    tn, fp, _, _ = confusion_matrix(labels, pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    # With no negative samples specificity is undefined; report 0.0, in line
    # with the ROC AUC fallback above.
    specificity = tn / negatives if negatives else 0.0

    print(f'Accuracy: {acc}')
    print(f'AUC: {roc_auc}')
    print(f'Balanced Accuracy: {balanced_acc}')
    print(f'Precision: {prec}')
    print(f'Recall: {rec}')
    print(f'F1: {f1}')
    print(f'PR AUC: {pr_auc}')
    print(f'MCC: {mcc}')
    print(f'G-Mean: {g_mean}')
    print(f'Specificity: {specificity}')

    print('Accuracy AUC Balanced_Acc Precision Recall F1 PR_AUC MCC G-Mean Specificity')
    print(f'{acc:.4f} {roc_auc:.4f} {balanced_acc:.4f} {prec:.4f} {rec:.4f} {f1:.4f} {pr_auc:.4f} {mcc:.4f} {g_mean:.4f} {specificity:.4f}')



In [10]:
# Print the evaluation metrics for the model predictions against the test labels.
calculate_metrics(pred, test_labels)

Accuracy: 0.9666666666666667
AUC: 0.5
Balanced Accuracy: 0.5
Precision: 0.0
Recall: 0.0
F1: 0.0
PR AUC: 0.5166666666666667
MCC: 0.0
G-Mean: 0.0
Specificity: 1.0
Accuracy AUC Balanced_Acc Precision Recall F1 PR_AUC MCC G-Mean Specificity
0.9667 0.5000 0.5000 0.0000 0.0000 0.0000 0.5167 0.0000 0.0000 1.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
