# Comparison with DeepSeek API

Out of curiosity, let us compare our results with results obtained through LLM APIs (DeepSeek and OpenAI).

In [None]:
!pip install python-dotenv pandas numpy scikit-learn

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import requests
import numpy as np
from pathlib import Path

In [2]:
from functions import load_jsonl, combine_data, extract_dict_from_response, compute_f1

# Global Variables

In [34]:
# --- Data locations ---
train_file_path = Path('data_germeval') / 'train.jsonl'
dev_file_path = Path('data_germeval') / 'development.jsonl'

# --- Credentials: load_dotenv() runs first, then the keys are read from the environment ---
load_dotenv()
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY_25")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_25")

# --- Model name and chat-completions endpoint per provider ---
deepseek_model, deepseek_api_url = "deepseek-chat", "https://api.deepseek.com/v1/chat/completions"
openAI_model, openAI_api_url = "gpt-4o", "https://api.openai.com/v1/chat/completions"

# --- Provider bundles in (api_key, model, api_url) order, so they can be star-unpacked into calls ---
deepseek = (DEEPSEEK_API_KEY, deepseek_model, deepseek_api_url)
openAI = (OPENAI_API_KEY, openAI_model, openAI_api_url)

# --- Switch for the guarded cells below: set to True to actually send the API requests ---
run_this = False

In [4]:
# Sanity check: one line per key (DeepSeek first, then OpenAI) telling whether it was found.
for _api_key in (DEEPSEEK_API_KEY, OPENAI_API_KEY):
    print(f"API Key loaded: {_api_key is not None}")

API Key loaded: True
API Key loaded: True


# First we have to load our data

In [5]:
# Read the training split first, then the development split, from their JSONL files.
train_data, dev_data = map(load_jsonl, (train_file_path, dev_file_path))

# Second we have to generate the bins as in germeval.ipynb

In [6]:
# Run combine_data over both splits (training first, then development).
# NOTE(review): combine_data lives in functions.py; presumably it derives the label
# targets named in the prompts below -- confirm there.
train_data_labeled, dev_data_labeled = map(combine_data, (train_data, dev_data))

# Development split once more, this time requested as a DataFrame; it is the
# reference passed to compute_f1 further down.
dev_df = combine_data(dev_data, dataframe=True)

# Now the API part

# Let's do it the 'simple' way: We directly provide the AI with a few examples and check what it does.

# DeepSeek first!

# Let's do a test

In [182]:
# Pick 100 random positions in the labeled training data (the same position may be drawn twice)
# and use the corresponding items as in-context examples.
len_train = len(train_data_labeled)
random_indices = np.random.randint(low=0, high=len_train, size=100)
example_texts = [train_data_labeled[idx] for idx in random_indices]

# One-off trial prompt for a single development item (index 4).
prompt_english = f"""
**Task:** Predict sexism annotation labels for a new text based on the following label definitions.

**Label Definitions:**
- 'bin_maj_label': A majority of annotators found the text to be sexist.
- 'bin_one_label': At least one annotator found the text to be sexist.
- 'bin_all_label': All annotators found the text to be sexist.
- 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
- 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.

**Examples from the Dataset:**
{chr(10).join(map(str, example_texts))}

**Text to Analyze:**
'{dev_data_labeled[4]['text']}'

**Instructions:**
Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in the following format:
{{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
"""

In [183]:
# Single trial request to the DeepSeek chat-completions endpoint using the prompt built above.
headers = {
    "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    "Content-Type": "application/json"
}

data = {
    "model": deepseek_model,
    "messages": [
        {"role": "user", "content": prompt_english},
    ],
    # The expected answer is one short dictionary, so the reply length is capped.
    "max_tokens": 70
}
# requests has no default timeout; without one a stalled connection would block the cell forever.
response = requests.post(deepseek_api_url, headers=headers, json=data, timeout=120)
# Fail right here on an HTTP error status instead of later with a KeyError on 'choices'.
response.raise_for_status()

In [184]:
# Display the message text of the first choice in the API's JSON reply.
response.json()['choices'][0]['message']['content']

"{'bin_maj_label': 1, 'bin_one_label': 1, 'bin_all_label': 0, 'multi_maj_label': 2, 'disagree_bin_label': 1}"

# Let's generalize

# Zero Shot:

In [17]:
def get_prompt_zero_shot(text_to_analyze):
    """
    Build the zero-shot prompt: task description, label definitions, the text
    itself and the required answer format -- no labeled examples are included.
    :param text_to_analyze: str, text whose labels the model should predict
    :return: str, prompt to send to the API
    """
    # The template is returned verbatim; only the quoted text slot is filled in.
    return f"""
    **Task:** Predict sexism annotation labels for a new text based on the following label definitions.
    
    **Label Definitions:**
    - 'bin_maj_label': A majority of annotators found the text to be sexist.
    - 'bin_one_label': At least one annotator found the text to be sexist.
    - 'bin_all_label': All annotators found the text to be sexist.
    - 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
    - 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.
    
    **Text to Analyze:**
    '{text_to_analyze}'
    
    **Instructions:**
    Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in exactly the following format 
    (no spaces or newlines!). <value> must always be an integer::
    {{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
    """

In [8]:
def generate_labels_zero_shot(text_to_analyze, api_key, model, api_url):
    """
    Requests a zero-shot prediction for a given text from a chat-completions API
    (the notebook calls this with both the DeepSeek and the OpenAI settings)
    :param text_to_analyze: str, text that is to be analyzed
    :param api_key: API key, sent as Bearer token
    :param model: str, name of model
    :param api_url: str, link for model
    :return: str, message content of the first choice in the API reply
    :raises requests.exceptions.HTTPError: if the API answers with an HTTP error status
    :raises requests.exceptions.Timeout: if the API does not respond within the timeout
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": get_prompt_zero_shot(text_to_analyze)},
        ],
        # The expected answer is one short dictionary, so the reply length is capped.
        "max_tokens": 70,
    }
    # requests has no default timeout; without one a stalled connection would
    # block the whole evaluation loop.
    respo = requests.post(api_url, headers=headers, json=payload, timeout=120)
    # Report error responses (e.g. busy model, bad key) as HTTP errors instead of
    # an opaque KeyError on 'choices' below.
    respo.raise_for_status()
    return respo.json()['choices'][0]['message']['content']

In [9]:
def eval_string_zero_shot(item, api_key, model, api_url, max_attempts=10):
    """
    Combines the whole zero-shot process for one item: query the API, parse the
    answer and merge the predicted labels with the item's id and text.
    The request is repeated while the answer cannot be parsed, but at most
    max_attempts times so that a persistently malformed answer cannot keep
    issuing API calls forever.
    :param item: dictionary containing our data; its 'id' and 'text' entries are used
    :param api_key: API key
    :param model: str, name of model
    :param api_url: str, link for model
    :param max_attempts: int, maximum number of API calls spent on this item
    :return: dictionary with 'id', 'text' and the parsed prediction entries
    :raises RuntimeError: if no answer could be parsed within max_attempts calls
    """
    for _ in range(max_attempts):
        raw_answer = generate_labels_zero_shot(item['text'], api_key, model, api_url)
        # extract_dict_from_response is used as returning a pair:
        # [0] tells whether parsing succeeded, [1] is the parsed dictionary.
        result = extract_dict_from_response(raw_answer)
        if result[0]:
            break
    else:
        raise RuntimeError(
            f"No parsable answer for item {item['id']!r} after {max_attempts} attempts"
        )
    eval_dict = {'id': item['id'], 'text': item['text']}
    eval_dict.update(result[1])
    return eval_dict

In [None]:
if run_this:
    # Zero-shot DeepSeek predictions for the first 100 development items.
    dev_data_predicted_0 = [
        eval_string_zero_shot(sample, *deepseek) for sample in dev_data[:100]
    ]
    # Column order follows the keys of the first prediction.
    dev_df_predicted_0 = pd.DataFrame(
        dev_data_predicted_0,
        columns=dev_data_predicted_0[0].keys(),
    )

In [339]:
# dev_df_predicted_0 only exists when the guarded cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_0)

Dev set F1 score Bin Maj: 0.6966680446465482
Dev set F1 score Bin One: 0.749474527074367
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.5660397497239602
Dev set F1 score Disagree Bin: 0.6165782044042915


These results are ok, but maybe we can improve them by giving DeepSeek some examples?

# Few Shot

In [10]:
# Number of labeled training items available as few-shot examples.
len_train = len(train_data_labeled)

In [16]:
def get_prompt(text_to_analyze, num_examples):
    """
    Generates a few-shot prompt. Includes a number of examples randomly picked
    from the labeled training data (train_data_labeled).
    The examples are distinct training items whenever enough items exist; only
    if more examples are requested than there are training items are items
    drawn with replacement.
    :param text_to_analyze: str, text that is to be analyzed
    :param num_examples: int, number of examples to include in the prompt
    :return: str, prompt to send to the API
    """
    # Read the size at call time rather than from a separately maintained global.
    num_available = len(train_data_labeled)
    # Distinct indices: a repeated example would only cost prompt tokens without
    # adding information.
    random_indices = np.random.choice(
        num_available, size=num_examples, replace=num_examples > num_available
    )
    example_texts = [train_data_labeled[i] for i in random_indices]
    prompt = f"""
    **Task:** Predict sexism annotation labels for a new text based on the following label definitions.
    
    **Label Definitions:**
    - 'bin_maj_label': A majority of annotators found the text to be sexist.
    - 'bin_one_label': At least one annotator found the text to be sexist.
    - 'bin_all_label': All annotators found the text to be sexist.
    - 'multi_maj_label': The multi-class label (integer from 0 to 4) that the most annotators assigned.
    - 'disagree_bin_label': The annotators disagreed on the binary (sexist/not sexist) classification.
    
    **Examples from the Dataset:**
    {chr(10).join(str(example) for example in example_texts)}
    
    **Text to Analyze:**
    '{text_to_analyze}'
    
    **Instructions:**
    Analyze the text above and predict its labels. Return ONLY a valid Python dictionary in exactly the following format 
    (no spaces or newlines!). <value> must always be an integer::
    {{'bin_maj_label': <value>, 'bin_one_label': <value>, 'bin_all_label': <value>, 'multi_maj_label': <value>, 'disagree_bin_label': <value>}}
    """
    return prompt

In [12]:
def generate_labels(text_to_analyze, num_examples, api_key, model, api_url):
    """
    Requests a few-shot prediction for a given text from a chat-completions API
    (the notebook calls this with both the DeepSeek and the OpenAI settings)
    :param text_to_analyze: str, text that is to be analyzed
    :param num_examples: int, number of examples to include in the prompt
    :param api_key: API key, sent as Bearer token
    :param model: str, name of model
    :param api_url: str, link for model
    :return: str, message content of the first choice in the API reply
    :raises requests.exceptions.HTTPError: if the API answers with an HTTP error status
    :raises requests.exceptions.Timeout: if the API does not respond within the timeout
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": get_prompt(text_to_analyze, num_examples)},
        ],
        # The expected answer is one short dictionary, so the reply length is capped.
        "max_tokens": 70,
    }
    # requests has no default timeout; without one a stalled connection would
    # block the whole evaluation loop.
    respo = requests.post(api_url, headers=headers, json=payload, timeout=120)
    # Report error responses (e.g. busy model, bad key) as HTTP errors instead of
    # an opaque KeyError on 'choices' below.
    respo.raise_for_status()
    return respo.json()['choices'][0]['message']['content']

In [13]:
def eval_string(item, num_examples, api_key, model, api_url, max_attempts=10):
    """
    Combines the whole few-shot process for one item: query the API, parse the
    answer and merge the predicted labels with the item's id and text.
    The request is repeated (with freshly drawn examples) while the answer
    cannot be parsed, but at most max_attempts times so that a persistently
    malformed answer cannot keep issuing API calls forever.
    :param item: dictionary containing our data; its 'id' and 'text' entries are used
    :param num_examples: int, number of examples to include in the prompt
    :param api_key: API key
    :param model: str, name of model
    :param api_url: str, link for model
    :param max_attempts: int, maximum number of API calls spent on this item
    :return: dictionary with 'id', 'text' and the parsed prediction entries
    :raises RuntimeError: if no answer could be parsed within max_attempts calls
    """
    for _ in range(max_attempts):
        raw_answer = generate_labels(item['text'], num_examples, api_key, model, api_url)
        # extract_dict_from_response is used as returning a pair:
        # [0] tells whether parsing succeeded, [1] is the parsed dictionary.
        result = extract_dict_from_response(raw_answer)
        if result[0]:
            break
    else:
        raise RuntimeError(
            f"No parsable answer for item {item['id']!r} after {max_attempts} attempts"
        )
    eval_dict = {'id': item['id'], 'text': item['text']}
    eval_dict.update(result[1])
    return eval_dict

In [None]:
if run_this:
    # DeepSeek predictions with 100 in-context examples for the first 100 development items.
    dev_data_predicted = [
        eval_string(sample, 100, *deepseek) for sample in dev_data[:100]
    ]
    # Column order follows the keys of the first prediction.
    dev_df_predicted = pd.DataFrame(
        dev_data_predicted,
        columns=dev_data_predicted[0].keys(),
    )

In [340]:
# dev_df_predicted only exists when the guarded cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted)

Dev set F1 score Bin Maj: 0.8191471215351812
Dev set F1 score Bin One: 0.8200720288115246
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.7006038647342995
Dev set F1 score Disagree Bin: 0.7059975520195839


Pro: The results are not bad.
Contra: It took quite a while and cost a bit ($0.50). Can we do it with fewer examples?

In [None]:
if run_this:
    # DeepSeek with 5 in-context examples, first 100 development items.
    dev_data_predicted_5 = [
        eval_string(sample, 5, *deepseek) for sample in dev_data[:100]
    ]
    dev_df_predicted_5 = pd.DataFrame(
        dev_data_predicted_5,
        columns=dev_data_predicted_5[0].keys(),
    )

    # Same items again, now with 10 in-context examples.
    dev_data_predicted_10 = [
        eval_string(sample, 10, *deepseek) for sample in dev_data[:100]
    ]
    dev_df_predicted_10 = pd.DataFrame(
        dev_data_predicted_10,
        columns=dev_data_predicted_10[0].keys(),
    )

In [341]:
# dev_df_predicted_5 only exists when the guarded cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_5)

Dev set F1 score Bin Maj: 0.78288
Dev set F1 score Bin One: 0.82991499149915
Dev set F1 score Bin All: 0.813763440860215
Dev set F1 score Multi Maj: 0.6724038713910762
Dev set F1 score Disagree Bin: 0.735386189258312


In [342]:
# dev_df_predicted_10 only exists when the guarded cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_10)

Dev set F1 score Bin Maj: 0.8109375
Dev set F1 score Bin One: 0.8197839135654261
Dev set F1 score Bin All: 0.8596756756756757
Dev set F1 score Multi Maj: 0.676089427891324
Dev set F1 score Disagree Bin: 0.7157851662404093


# Conclusion (DeepSeek)

On most metrics, the 5-example version is slightly worse than the 10-example version, and the 10-example version is slightly worse than the 100-example version (the exceptions are Bin One and Disagree Bin for 5 vs. 10, and Bin All and Disagree Bin for 10 vs. 100).
10 examples seem to be a good choice: performance is almost as good as with 100 examples, but the 10-example run cost only ~$0.10.

Pro: Results are better than the prediction of the fine-tuned BERT models.
Contra: There are probably issues with reproducibility. One would have to average over many more predictions to truly know if the API yields reliable results.

# Let's compare the OpenAI API

NOTE: I first tried "gpt-3.5-turbo" but results were less good, so this is "gpt-4o"

In [35]:
if run_this:
    # OpenAI, zero-shot, first 100 development items.
    dev_data_predicted_0_openAI = [
        eval_string_zero_shot(sample, *openAI) for sample in dev_data[:100]
    ]
    dev_df_predicted_0_openAI = pd.DataFrame(
        dev_data_predicted_0_openAI,
        columns=dev_data_predicted_0_openAI[0].keys(),
    )

    # OpenAI with 5 in-context examples.
    dev_data_predicted_5_openAI = [
        eval_string(sample, 5, *openAI) for sample in dev_data[:100]
    ]
    dev_df_predicted_5_openAI = pd.DataFrame(
        dev_data_predicted_5_openAI,
        columns=dev_data_predicted_5_openAI[0].keys(),
    )

    # OpenAI with 10 in-context examples.
    # This can lead to error messages when the model is busy. In this case, either try again,
    # split the data into smaller batches and pause between running them, or switch to an
    # older model (like gpt-3.5-turbo)
    dev_data_predicted_10_openAI = [
        eval_string(sample, 10, *openAI) for sample in dev_data[:100]
    ]
    dev_df_predicted_10_openAI = pd.DataFrame(
        dev_data_predicted_10_openAI,
        columns=dev_data_predicted_10_openAI[0].keys(),
    )

In [58]:
# dev_df_predicted_0_openAI only exists when the guarded cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_0_openAI)

Dev set F1 score Bin Maj: 0.7168081494057725
Dev set F1 score Bin One: 0.7391666666666665
Dev set F1 score Bin All: 0.8238297872340425
Dev set F1 score Multi Maj: 0.535944055944056
Dev set F1 score Disagree Bin: 0.6632692307692308


In [59]:
# dev_df_predicted_5_openAI only exists when the guarded OpenAI cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_5_openAI)

Dev set F1 score Bin Maj: 0.7356156156156156
Dev set F1 score Bin One: 0.7488721804511278
Dev set F1 score Bin All: 0.8219864995178399
Dev set F1 score Multi Maj: 0.6247619047619047
Dev set F1 score Disagree Bin: 0.6177357339029601


In [60]:
# dev_df_predicted_10_openAI only exists when the guarded OpenAI cell above was executed;
# use the same switch here so a run with run_this = False does not raise a NameError.
if run_this:
    compute_f1(dev_df.iloc[:100], dev_df_predicted_10_openAI)

Dev set F1 score Bin Maj: 0.7519945725915875
Dev set F1 score Bin One: 0.779735894357743
Dev set F1 score Bin All: 0.8389743589743591
Dev set F1 score Multi Maj: 0.6463893766461808
Dev set F1 score Disagree Bin: 0.6666995073891626


# Conclusion

OpenAI, like DeepSeek, benefits from being shown more examples. 

However, with the same number of examples, OpenAI yields worse results than DeepSeek. 

The 10-examples version of DeepSeek seems to be a good compromise, as it both performs well and is low cost.