In [1]:
import pandas as pd

df = pd.read_csv('../data/task2.csv')

In [2]:
import os
import json

draft_ids = list(df['Original_id'])

# input data
countries = list(df['Country'])
drafts = []
votes = list(df['Voting'])

path = '../data/task2'
for i in draft_ids:
    folder_path = os.path.join(path, str(i))
    files = os.listdir(folder_path)
    json_file = [file for file in files if file.endswith('EN.json')][0]
    with open(os.path.join(folder_path, json_file)) as f:
        draft = json.load(f)
    drafts.append(draft['Content'])

In [3]:
# If use Together API
from together import Together

your_model_name = 'Qwen/Qwen3.5-397B-A17B'
your_api_key = 'fb67b69d63d58bbee07dce39b565c87de8873ef6e1b8abf3f1fd5119e2607923'
client = Together(api_key=your_api_key)

In [9]:
import random
from tqdm import tqdm

pred = []
invalid_responses = []
for i, (draft, country) in tqdm(enumerate(zip(drafts, countries))):
    system_prompt = "You are an experienced diplomat participating in United Nations Security Council sessions. Your task is to read draft resolutions and vote accordingly. You must strictly respond with 'Y', 'N', or 'A' without any additional explanation."
    user_prompt = f"""The following is a United Nations Security Council draft resolution. Assume you are a diplomat from {country} and must cast your vote.

    Please read the draft and vote using one of the following options:
    - 'Y' for Yes (in favor)
    - 'N' for No (against)
    - 'A' for Abstain

    Draft Resolution: {draft}
    Answer:
    """
    response = client.chat.completions.create(
        model=your_model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        # max_tokens=1, 
        # temperature=0.0
    )
    print(response)
    result = response.choices[0].message.content.strip()
    valid_votes = ['Y', 'N', 'A']
    if result not in valid_votes:
        print(f"Invalid response: {result}")
        result = random.choice(valid_votes)
        invalid_responses.append(i)
    pred.append(result)

1it [00:09,  9.93s/it]



2it [00:17,  8.52s/it]



3it [00:27,  9.23s/it]



4it [00:36,  8.95s/it]



5it [00:50, 11.05s/it]



6it [01:04, 11.96s/it]



7it [01:09,  9.75s/it]



8it [01:17,  9.27s/it]



9it [01:21,  7.41s/it]



10it [01:35,  9.46s/it]



11it [01:43,  9.01s/it]



12it [01:49,  8.22s/it]



13it [02:03,  9.97s/it]



14it [02:08,  8.51s/it]



15it [02:20,  9.58s/it]



16it [02:40, 12.68s/it]



17it [02:51, 12.19s/it]



18it [03:01, 11.45s/it]



19it [03:06,  9.46s/it]



20it [03:18, 10.28s/it]



21it [03:31, 10.99s/it]



22it [03:42, 11.06s/it]



23it [04:00, 13.07s/it]



24it [04:09, 11.93s/it]



25it [04:17, 10.81s/it]



26it [04:25,  9.77s/it]



27it [04:40, 11.45s/it]



28it [04:55, 12.46s/it]



29it [04:59,  9.93s/it]



30it [05:04, 10.17s/it]






In [10]:
# calculate metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder, label_binarize
from imblearn.metrics import geometric_mean_score
import numpy as np

def calculate_metrics(pred, labels):
    label_encoder = LabelEncoder()
    all_classes = list(set(labels) | set(pred))  
    label_encoder.fit(all_classes)

    labels = label_encoder.transform(labels) 
    pred = label_encoder.transform(pred)  

    acc = accuracy_score(labels, pred)
    
    num_classes = len(label_encoder.classes_)
    true_labels_bin = label_binarize(labels, classes=list(range(num_classes)))
    pred_bin = label_binarize(pred, classes=list(range(num_classes)))  

    auc = roc_auc_score(true_labels_bin, pred_bin, multi_class='ovr', average='macro')
    pr_auc = average_precision_score(true_labels_bin, pred_bin, average='macro')

    balanced_acc = balanced_accuracy_score(labels, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, pred, average='macro')

    mcc = matthews_corrcoef(labels, pred)
    g_mean = geometric_mean_score(labels, pred, average='macro')

    print(f'Accuracy: {acc}')
    print(f'AUC: {auc}')
    print(f'Balanced Accuracy: {balanced_acc}')
    print(f'Precision: {prec}')
    print(f'Recall: {rec}')
    print(f'F1: {f1}')
    print(f'PR AUC: {pr_auc}')
    print(f'MCC: {mcc}')
    print(f'G-Mean: {g_mean}')

    print('Accuracy AUC Balanced_Acc Precision Recall F1 PR_AUC MCC G-Mean')
    print(f'{acc:.4f} {auc:.4f} {balanced_acc:.4f} {prec:.4f} {rec:.4f} {f1:.4f} {pr_auc:.4f} {mcc:.4f} {g_mean:.4f}')


In [11]:
calculate_metrics(pred, votes)

Accuracy: 0.9333333333333333
AUC: 0.5
Balanced Accuracy: 0.5
Precision: 0.4666666666666667
Recall: 0.5
F1: 0.4827586206896552
PR AUC: 0.9333333333333333
MCC: 0.0
G-Mean: 0.5
Accuracy AUC Balanced_Acc Precision Recall F1 PR_AUC MCC G-Mean
0.9333 0.5000 0.5000 0.4667 0.5000 0.4828 0.9333 0.0000 0.5000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
