# Comparison with DeepSeek API

Out of curiosity, let us compare our results to results obtained through an API

In [None]:
!pip install python-dotenv pandas numpy scikit-learn

In [132]:
import json
from collections import Counter
import pandas as pd
import os
from dotenv import load_dotenv
import requests
import numpy as np
import re
import ast
from sklearn.metrics import f1_score

# First we have to load our data

In [19]:
def load_jsonl(file_path):
    """
    Load a JSONL file and return a list of JSON objects.
    :param file_path: str, path to the JSONL file
    :return: list of dicts, each representing a JSON object
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data

In [20]:
train_file_path = 'data_germeval/train.jsonl'
dev_file_path = 'data_germeval/development.jsonl'
test_file_path = 'data_germeval/test.jsonl'

In [21]:
train_data = load_jsonl(train_file_path)
dev_data = load_jsonl(dev_file_path)
test_data = load_jsonl(test_file_path)

# Second we have to generate the bins as in germeval.ipybn

In [25]:
def assign_bin_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if a majority of annotators assigned a label other than 0-Kein, predicts 0 if a majority assigned 0-Kein. If there was no majority, either label is considered correct for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: label
    """
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        bin_maj_label = 1 if majority_label != '0-Kein' else 0
    else:
        bin_maj_label = None
    return bin_maj_label
    
def assign_bin_one(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if at least one annotator assigned a label other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: label
    """
    if not is_test:
        bin_one_label = 1 if any(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_one_label = None
    return bin_one_label

def assign_bin_all(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and computes 1 if all annotators assigned labels other than 0-Kein, 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: label
    """
    if not is_test:
        bin_all_label = 1 if all(ann['label'] != '0-Kein' for ann in item['annotations']) else 0
    else:
        bin_all_label = None
    return bin_all_label

def assign_multi_maj(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts the majority label if there is one, if there is no majority label, any of the labels assigned is counted as a correct prediction for evaluation.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: label
    """
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        label_counts = Counter(labels)
        majority_label, majority_count = label_counts.most_common(1)[0]
        multi_maj_label = majority_label if majority_count > len(labels) / 2 else labels[0]
        multi_maj_label = int(multi_maj_label.split('-')[0])
    else:
        multi_maj_label = None
    return multi_maj_label

def assign_disagree_bin(item, is_test=False):
    """
    takes a tweet and its annotations (if available) and predicts 1 if there is a disagreement between annotators on 0-Kein versus all other labels and 0 otherwise.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: label
    """
    if not is_test:
        labels = [ann['label'] for ann in item['annotations']]
        unique_labels = set(labels)
        disagree_bin_label = 1 if '0-Kein' in unique_labels and len(unique_labels) > 1 else 0
    else:
        disagree_bin_label = None
    return disagree_bin_label

In [29]:
def total_data(item, is_test=False):
    """
    collects all labels described above for one tweet.
    :param item: dictionary of the form {'id': , 'text': , 'annotators': }
    :param is_test: if False annotations are available. If True not
    :return: dictionary of the form {'id': , 'text': , 'bin_maj_label':, 'bin_one_label': , ... }
    """
    text = item['text']
    text = text.replace('\n', ' ')
    return {'id': item['id'], 'text': text, 'bin_maj_label': assign_bin_maj(item), 'bin_one_label': assign_bin_one(item),
            'bin_all_label': assign_bin_all(item), 'multi_maj_label': assign_multi_maj(item), 
            'disagree_bin_label': assign_disagree_bin(item)}

In [40]:
def combine_data(data, dataframe = False):
    """
    iterates over a list of tweets and annotations
    :param data: list of dictionaries
    :return: list of dictionaries or dataframe
    """
    data_with_labels = [total_data(item) for item in data]
    if dataframe:
        header = data_with_labels[0].keys()
        data_with_labels = pd.DataFrame(data_with_labels, columns=header)
    return data_with_labels
        

In [None]:
train_data_labeled = combine_data(train_data)

In [131]:
dev_data_labeled = combine_data(dev_data)
dev_df = combine_data(dev_data, dataframe = True)

# Now the API part

In [35]:
load_dotenv()
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY_25")
print(f"API Key loaded: {DEEPSEEK_API_KEY is not None}")

API Key loaded: True


# Let's first do it the 'simple' way: We directly provide the AI with a few examples and check what it does.

# Let's do a test

In [182]:
len_train = len(train_data_labeled)
random_indices = np.random.randint(0, len_train, size=100) 
example_texts = [train_data_labeled[i] for i in random_indices]
prompt_english = f"""
**Task:** Predict sexism annotation labels for a new text based on the following label definitions.

**Label Definitions:**
- 'bin_maj_label': A majority of annotators found the text to be sexist.
- 'bin_one_label': At least one annotator found the text to be sexist.
- 'bin_all_label': All annotators found the text to be sexist.
- 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
- 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.

**Examples from the Dataset:**
{chr(10).join(str(example) for example in example_texts)}

**Text to Analyze:**
'{dev_data_labeled[4]['text']}'

**Instructions:**
Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in the following format:
{{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
"""

In [183]:
api_url = "https://api.deepseek.com/v1/chat/completions"  
headers = {
    "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    "Content-Type": "application/json"
}

data = {
    "model": "deepseek-chat", 
    "messages": [
        {"role": "user", "content": prompt_english},
    ],
    "max_tokens": 70
}
response = requests.post(api_url, headers=headers, json=data)

In [184]:
response.json()['choices'][0]['message']['content']

"{'bin_maj_label': 1, 'bin_one_label': 1, 'bin_all_label': 0, 'multi_maj_label': 2, 'disagree_bin_label': 1}"

# Let's generalize

In [343]:
#Do we want to use the API?
generate = False

In [252]:
len_train = len(train_data_labeled)

In [321]:
pattern_1 = r"\{'bin_maj_label':\s*(\d),\s*'bin_one_label':\s*(\d),\s*'bin_all_label':\s*(\d),\s*'multi_maj_label':\s*(\d),\s*'disagree_bin_label':\s*(\d)\}"
pattern_2 = r"\{\\\s*n\s*'bin_maj_label':\s*(\d),\s*\\\s*n\s*'bin_one_label':\s*(\d),\s*\\\s*n\s*'bin_all_label':\s*(\d),\s*\\\s*n\s*'multi_maj_label':\s*(\d),\s*\\\s*n\s*'disagree_bin_label':\s*(\d)\s*\\\s*n\s*\}"

In [322]:
def extract_dict_from_response(response_string):
    match1 = re.search(pattern_1, response_string, re.DOTALL)
    match2 = re.search(pattern_2, response_string, re.DOTALL)
    if match1:
        dict_str = match1.group(0)
        result_dict = ast.literal_eval(dict_str)
        return [True, result_dict]
    elif match2:
        dict_str = match2.group(0)
        result_dict = ast.literal_eval(dict_str)
        return [True, result_dict]   
    else:
        print("No dictionary pattern found in the response.")
        print(response_string)
        return [False]

In [338]:
def compute_metrics(lbls, preds):
    f1 = f1_score(lbls, preds, average='weighted')
    return {'f1': f1}

def compute_f1(real_data, prediction):
    print(f"Dev set F1 score Bin Maj: {compute_metrics(real_data['bin_maj_label'], prediction['bin_maj_label'])['f1']}")
    print(f"Dev set F1 score Bin One: {compute_metrics(real_data['bin_one_label'], prediction['bin_one_label'])['f1']}")
    print(f"Dev set F1 score Bin All: {compute_metrics(real_data['bin_all_label'], prediction['bin_all_label'])['f1']}")
    print(f"Dev set F1 score Multi Maj: {compute_metrics(real_data['multi_maj_label'], prediction['multi_maj_label'])['f1']}")
    print(
        f"Dev set F1 score Disagree Bin: {compute_metrics(real_data['disagree_bin_label'], prediction['disagree_bin_label'])['f1']}")

# Zero Shot:

In [332]:
def get_prompt_zero_shot(text_to_analyze):
    prompt = f"""
    **Task:** Predict sexism annotation labels for a new text based on the following label definitions.
    
    **Label Definitions:**
    - 'bin_maj_label': A majority of annotators found the text to be sexist.
    - 'bin_one_label': At least one annotator found the text to be sexist.
    - 'bin_all_label': All annotators found the text to be sexist.
    - 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
    - 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.
    
    **Text to Analyze:**
    '{text_to_analyze}'
    
    **Instructions:**
    Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in exactly the following format 
    (no spaces or newlines!):
    {{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
    """
    return prompt

In [333]:
def generate_labels_zero_shot(text_to_analyze):
    api_url = "https://api.deepseek.com/v1/chat/completions"  
    headers = {
    "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    "Content-Type": "application/json"
    }
    prompt = get_prompt_zero_shot(text_to_analyze)
    dat = {
    "model": "deepseek-chat", 
    "messages": [
        {"role": "user", "content": prompt},
    ],
    "max_tokens": 70
    }
    respo = requests.post(api_url, headers=headers, json=dat)
    return respo.json()['choices'][0]['message']['content']

In [334]:
def eval_string_zero_shot(item):
    #print(item)
    result = generate_labels_zero_shot(item['text'])
    result = extract_dict_from_response(result)
    while not result[0]:
        result = extract_dict_from_response(generate_labels_zero_shot(item['text']))
    eval_dict = dict()
    eval_dict['id'] = item['id']
    eval_dict['text'] = item['text']
    eval_dict.update(result[1])
    return eval_dict

In [None]:
if generate:
    dev_data_predicted_0 = [eval_string_zero_shot(item) for item in dev_data[:100]]
    dev_df_predicted_0 = pd.DataFrame(dev_data_predicted_0, columns=dev_data_predicted_0[0].keys())

In [339]:
compute_f1(dev_df.iloc[:100], dev_df_predicted_0)

Dev set F1 score Bin Maj: 0.6966680446465482
Dev set F1 score Bin One: 0.749474527074367
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.5660397497239602
Dev set F1 score Disagree Bin: 0.6165782044042915


These results are ok, but maybe we can improve them by giving DeepSeek some examples?

# One Shot

In [229]:
def get_prompt(text_to_analyze, num_examples):
    random_indices = np.random.randint(0, len_train, size=num_examples) 
    example_texts = [train_data_labeled[i] for i in random_indices]
    prompt = f"""
    **Task:** Predict sexism annotation labels for a new text based on the following label definitions.
    
    **Label Definitions:**
    - 'bin_maj_label': A majority of annotators found the text to be sexist.
    - 'bin_one_label': At least one annotator found the text to be sexist.
    - 'bin_all_label': All annotators found the text to be sexist.
    - 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
    - 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.
    
    **Examples from the Dataset:**
    {chr(10).join(str(example) for example in example_texts)}
    
    **Text to Analyze:**
    '{text_to_analyze}'
    
    **Instructions:**
    Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in exactly the following format 
    (no spaces or newlines!):
    {{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
    """
    return prompt

In [230]:
def generate_labels(text_to_analyze, num_examples):
    api_url = "https://api.deepseek.com/v1/chat/completions"  
    headers = {
    "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    "Content-Type": "application/json"
    }
    prompt = get_prompt(text_to_analyze, num_examples)
    dat = {
    "model": "deepseek-chat", 
    "messages": [
        {"role": "user", "content": prompt},
    ],
    "max_tokens": 70
    }
    respo = requests.post(api_url, headers=headers, json=dat)
    return respo.json()['choices'][0]['message']['content']

In [307]:
def eval_string(item, num_examples):
    #print(item)
    result = extract_dict_from_response(generate_labels(item['text'], num_examples))
    while not result[0]:
        result = extract_dict_from_response(generate_labels(item['text'], num_examples))
    eval_dict = dict()
    eval_dict['id'] = item['id']
    eval_dict['text'] = item['text']
    eval_dict.update(result[1])
    return eval_dict

In [None]:
if generate:
    dev_data_predicted = [eval_string(item, 100) for item in dev_data[:100]]
    dev_df_predicted = pd.DataFrame(dev_data_predicted, columns = dev_data_predicted[0].keys())

In [340]:
compute_f1(dev_df.iloc[:100], dev_df_predicted)

Dev set F1 score Bin Maj: 0.8191471215351812
Dev set F1 score Bin One: 0.8200720288115246
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.7006038647342995
Dev set F1 score Disagree Bin: 0.7059975520195839


Pro: Results not bad
Contra: Took quite a while and costs a bit (0.5$). Can we do it with less examples?

In [None]:
if generate:
    dev_data_predicted_5 = [eval_string(item, 5) for item in dev_data[:100]]
    dev_df_predicted_5 = pd.DataFrame(dev_data_predicted_5, columns = dev_data_predicted_5[0].keys())

In [None]:
if generate:
    dev_data_predicted_10 = [eval_string(item, 10) for item in dev_data[:100]]
    dev_df_predicted_10 = pd.DataFrame(dev_data_predicted_10, columns=dev_data_predicted_10[0].keys())

In [341]:
compute_f1(dev_df.iloc[:100], dev_df_predicted_5)

Dev set F1 score Bin Maj: 0.78288
Dev set F1 score Bin One: 0.82991499149915
Dev set F1 score Bin All: 0.813763440860215
Dev set F1 score Multi Maj: 0.6724038713910762
Dev set F1 score Disagree Bin: 0.735386189258312


In [342]:
compute_f1(dev_df.iloc[:100], dev_df_predicted_10)

Dev set F1 score Bin Maj: 0.8109375
Dev set F1 score Bin One: 0.8197839135654261
Dev set F1 score Bin All: 0.8596756756756757
Dev set F1 score Multi Maj: 0.676089427891324
Dev set F1 score Disagree Bin: 0.7157851662404093


The 5 examples version is slightly worse than the 10 examples version. The 10 examples version is slightly worse than the 100 examples version.
10 examples seem to be a good number of examples as performance is almost as good as the 100 example version. The 10 example version cost ~0.1$

Pro: Results are better than the prediction of the fine-tuned BERT models.
Contra: There are probably issues with reproducibility. One would have to average over many more predictions to truly know if the API yields reliable results.