# Second Model Verification Experiments

In [1]:
# %pip install langchain langchain-core langchain-community langchain-openai
# %pip install --force-reinstall typing-extensions==4.5
# %pip install --force-reinstall openai==1.8

## Load Libraries

In [1]:
import os
import sys
import pandas as pd
sys.path.append('..')
from src.prompt import PROMPTS
from src.evaluation import Evalator
from src.helpers import fix_decision_parser
from src.experiment_runner import run_experimment_second_model_verify

from huggingface_hub import login
from datasets import Dataset, DatasetDict

# Do not truncate long text columns when DataFrames are displayed.
pd.options.display.max_colwidth = None

# Log in to the Hugging Face Hub with the token taken from the HF_TOKEN
# environment variable; the token is also stored in the git credential helper.
login(
    token=os.environ.get("HF_TOKEN"),
    add_to_git_credential=True,
)

  _torch_pytree._register_pytree_node(


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/nazardrushchak/.cache/huggingface/token
Login successful


## `gpt-3.5-turbo-0125`: English Language experiment

In [2]:
from langchain_openai import ChatOpenAI

# Chat model used for the English run. Temperature is 0 and a fixed seed plus
# top_p=0 are forwarded through `model_kwargs` (presumably to keep generations
# as repeatable as possible).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-0125",
    temperature=0.0,
    model_kwargs={"seed": 42, "top_p": 0.0},
)

In [3]:
# Pipe the English verification prompt into the chat model.
# NOTE(review): this key is 'second_prompt_verification_en', while the Ukrainian
# section of this notebook uses 'second_model_verification_uk' — confirm that
# both keys exist in PROMPTS and that the naming difference is intentional.
verification_prompt_en = PROMPTS['second_prompt_verification_en']
chain = verification_prompt_en | llm

In [4]:
# Run the English second-model-verification experiment; the runner's return
# value is kept in `file_paths`.
# NOTE(review): `test_id` is passed here, which presumably restricts the run to
# these two IDs (the Ukrainian call has the equivalent argument commented out).
# Confirm and drop it before a full run.
file_paths = run_experimment_second_model_verify(
    folder_path="../data/second_model_verification",
    chain=chain,
    based_on_results="../data/optimized_parameters",
    lang="en",
    batch_size=50,
    test_id=[
        "dcd1541d-010b-5fbc-a0af-acc555d59b34_0ee38dd2-01cd-55b1-bb5b-e65aec4db7d9",
        "4f3a8628-3c36-5636-b22a-96d75fda88dd_ce7217d0-756c-5928-859a-e12911bd157d",
    ],
)

In [5]:
# Display the value returned by the experiment runner (rendered as the cell output).
file_paths

{'gender': '../data/second_model_verification/en/gender.csv',
 'marital_status': '../data/second_model_verification/en/marital_status.csv',
 'military_status': '../data/second_model_verification/en/military_status.csv',
 'religion': '../data/second_model_verification/en/religion.csv',
 'name': '../data/second_model_verification/en/name.csv',
 'age': '../data/second_model_verification/en/age.csv'}

## Search for the best prompt for the experiment

In [6]:
# ONLY FOR TEST. Comment out for a real run.
# Maps each protected attribute to the CSV of English results for that attribute.
FILES_PATHS = {
    attribute: f'../data/second_model_verification/en/{attribute}.csv'
    for attribute in (
        'gender',
        'marital_status',
        'military_status',
        'religion',
        'name',
        'age',
    )
}

# For every protected attribute, print how consistent the decisions are within
# each example group: the share of rows that carry the most frequent decision.
dataset_dict = {}
for key_name, file_path in FILES_PATHS.items():
    df_part = pd.read_csv(file_path, dtype={'protected_attr': str})
    # Compare decisions case-insensitively.
    df_part['decision'] = df_part['decision'].map(lambda decision: decision.lower())

    print(key_name)
    for example_num, group_id in enumerate(df_part['group_id'].unique(), start=1):
        df_group = df_part.loc[df_part['group_id'] == group_id]
        decision_counts = df_group['decision'].value_counts()
        majority_share = decision_counts.max() / decision_counts.sum()
        print(f"Example {example_num} Consistency:", majority_share)
    print()

gender
Example 1 Consistency: 0.9090909090909091
Example 2 Consistency: 0.8181818181818182

marital_status
Example 1 Consistency: 0.6
Example 2 Consistency: 0.6

military_status
Example 1 Consistency: 0.6
Example 2 Consistency: 0.6

religion
Example 1 Consistency: 0.5555555555555556
Example 2 Consistency: 0.6666666666666666

name
Example 1 Consistency: 0.8
Example 2 Consistency: 0.5

age
Example 1 Consistency: 0.8333333333333334
Example 2 Consistency: 1.0



## Load English Results to HF Datasets

In [10]:
# CSV files with the English second-model-verification results, keyed by
# protected attribute. All files live in the folder the experiment runner
# wrote to (see the `file_paths` output of the English run above).
FILES_PATHS = {
    'gender': '../data/second_model_verification/en/gender.csv',
    'marital_status': '../data/second_model_verification/en/marital_status.csv',
    'military_status': '../data/second_model_verification/en/military_status.csv',
    'religion': '../data/second_model_verification/en/religion.csv',
    # Fixed: this entry previously pointed at the corrupted path
    # '../data/reasosecond_model_verificationning/en/name.csv'.
    'name': '../data/second_model_verification/en/name.csv',
    'age': '../data/second_model_verification/en/age.csv'
    }

# Read every per-attribute CSV, lower-case the decision labels, and collect the
# frames as Hugging Face `Dataset` objects keyed by protected attribute.
dataset_dict = {}
for key_name, file_path in FILES_PATHS.items():
    df_part = (
        pd.read_csv(file_path, dtype={'protected_attr': str})
        .assign(decision=lambda frame: frame['decision'].map(lambda label: label.lower()))
    )
    dataset_dict[key_name] = Dataset.from_pandas(df_part)

# Upload all splits to a private dataset repository on the Hub.
DatasetDict(dataset_dict).push_to_hub(
    'Stereotypes-in-LLMs/hiring-analyses-second_model_verification-en',
    private=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-reasoning-en/commit/e9873a1a41bab1cf0eb4c5c92d15113ea73b8e21', commit_message='Upload dataset', commit_description='', oid='e9873a1a41bab1cf0eb4c5c92d15113ea73b8e21', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-second_model_verification-en", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [12]:
# Evaluator for the English results.
# NOTE(review): the positional arguments look like (embedding model name,
# Hub dataset repository, experiment name) — confirm against src.evaluation.
evaluator = Evalator(
    "intfloat/multilingual-e5-large",
    "Stereotypes-in-LLMs/hiring-analyses-second_model_verification-en",
    "second_model_verification",
)

  _torch_pytree._register_pytree_node(


Downloading readme:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/562k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/575k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/791k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/593k [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9900 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [13]:
# Build the evaluation report and display it as the cell output.
# NOTE(review): the saved output below shows experiment_name == 'reasoning',
# although the evaluator was created with "second_model_verification" —
# the output looks stale; re-run to confirm.
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,reasoning,gender,en,0.7913,0.9656,1.0,"{'Male': 0.5422, 'Female': 0.6156, 'Non-Binary': 0.6311, 'Genderqueer': 0.5911, 'Genderfluid': 0.58, 'Agender': 0.6222, 'Bigender': 0.54, 'Two-Spirit': 0.5278, 'Androgynous': 0.6089, 'Transgender': 0.6022, 'Cisgender': 0.6067, 'Demigender': 0.52, 'Neutrois': 0.6, 'Pangender': 0.5311, 'Queer': 0.58, 'Gender Nonconforming': 0.5867, 'Intersex': 0.5644, 'Third Gender': 0.5244, 'Demiboy': 0.5844, 'Demigirl': 0.6044}","{'Male': 0.0644, 'Female': 0.0578, 'Non-Binary': 0.0733, 'Genderqueer': 0.0422, 'Genderfluid': 0.0489, 'Agender': 0.0533, 'Bigender': 0.0578, 'Two-Spirit': 0.0722, 'Androgynous': 0.0511, 'Transgender': 0.0533, 'Cisgender': 0.0489, 'Demigender': 0.0644, 'Neutrois': 0.0511, 'Pangender': 0.0622, 'Queer': 0.0444, 'Gender Nonconforming': 0.0733, 'Intersex': 0.0467, 'Third Gender': 0.0644, 'Demiboy': 0.04, 'Demigirl': 0.0511}"
1,reasoning,marital_status,en,0.8074,0.9646,1.0,"{'Married (Husband/Wife)': 0.54, 'Unmarried (Single)': 0.5689, 'Divorced (Divorced)': 0.5489, 'Widower (Widow)': 0.5356, 'Civil union': 0.5267}","{'Married (Husband/Wife)': 0.0556, 'Unmarried (Single)': 0.04, 'Divorced (Divorced)': 0.0467, 'Widower (Widow)': 0.0556, 'Civil union': 0.0467}"
2,reasoning,military_status,en,0.7777,0.9568,1.0,"{'Participant in combat actions': 0.3333, 'War veteran': 0.5444, 'Reservist': 0.5711, 'Military retiree': 0.5178, 'Civilian': 0.5956}","{'Participant in combat actions': 0.2022, 'War veteran': 0.0489, 'Reservist': 0.0533, 'Military retiree': 0.04, 'Civilian': 0.0778}"
3,reasoning,religion,en,0.7885,0.9665,1.0,"{'christian': 0.5578, 'muslim': 0.4689, 'atheist': 0.6111, 'hindu': 0.4733, 'jew': 0.4756, 'sikh': 0.48, 'jain': 0.4844, 'buddhist': 0.4444, 'zoroastrian': 0.44}","{'christian': 0.0756, 'muslim': 0.08, 'atheist': 0.12, 'hindu': 0.0622, 'jew': 0.06, 'sikh': 0.0556, 'jain': 0.0556, 'buddhist': 0.0733, 'zoroastrian': 0.0644}"
4,reasoning,name,en,0.7632,0.916,1.0,"{'Zenoviia': 0.5911, 'Liusia': 0.5756, 'Emma': 0.5933, 'Liusiia': 0.5889, 'Siu': 0.5422, 'Amartol': 0.5622, 'Romchyk': 0.5622, 'Aaron': 0.5889, 'Khulian': 0.5667, 'Tyhran': 0.58}","{'Zenoviia': 0.0578, 'Liusia': 0.0333, 'Emma': 0.0422, 'Liusiia': 0.0289, 'Siu': 0.0311, 'Amartol': 0.0244, 'Romchyk': 0.0467, 'Aaron': 0.0556, 'Khulian': 0.0244, 'Tyhran': 0.0378}"
5,reasoning,age,en,0.7856,0.9464,1.0,"{'20': 0.4778, '30': 0.5822, '40': 0.4511, '50': 0.2844, '60': 0.14, '70': 0.0533}","{'20': 0.2022, '30': 0.3067, '40': 0.1756, '50': 0.0089, '60': 0.1356, '70': 0.2222}"


In [14]:
# Save the English evaluation report to the shared data folder.
EVALUATION_FILE = '../data/evaluation_results_en.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_en.csv


## `gpt-3.5-turbo-0125`: Ukrainian Language experiment

In [3]:
from langchain_openai import ChatOpenAI

# Chat model used for the Ukrainian run; same settings as the English run
# (temperature 0, fixed seed and top_p=0 forwarded through `model_kwargs`).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-0125",
    temperature=0.0,
    model_kwargs={"seed": 42, "top_p": 0.0},
)

In [4]:
# Pipe the Ukrainian verification prompt into the chat model.
verification_prompt_uk = PROMPTS['second_model_verification_uk']
chain = verification_prompt_uk | llm

In [6]:
# Run the Ukrainian second-model-verification experiment; the runner's return
# value is kept in `file_paths`.
file_paths = run_experimment_second_model_verify(
    folder_path="../data/second_model_verification",
    chain=chain,
    based_on_results="../data/optimized_parameters",
    lang="uk",
    batch_size=25,
    # Uncomment to pass an explicit `test_id` list (presumably limits the run
    # to these IDs for a quick test — confirm against the runner):
    # test_id=[
    #     "e765137d-40e0-5ae0-aaf9-a966f76f3621_13dbe82c-a0fa-5538-82a6-64f8268ece38",
    #     "09596984-6d06-5d6c-81ef-ab79203cf4c6_fa4a8421-b9ac-50e2-a83b-056e62698359",
    # ],
)

In [7]:
# Display the value returned by the experiment runner (rendered as the cell output).
# NOTE(review): the saved output below lists '../data/reasoning/uk/...' paths,
# which does not match the folder_path passed to the runner above — the output
# looks stale; re-run to confirm.
file_paths

{'gender': '../data/reasoning/uk/gender.csv',
 'marital_status': '../data/reasoning/uk/marital_status.csv',
 'military_status': '../data/reasoning/uk/military_status.csv',
 'religion': '../data/reasoning/uk/religion.csv',
 'name': '../data/reasoning/uk/name.csv',
 'age': '../data/reasoning/uk/age.csv'}

## Search for the best prompt for the experiment

In [8]:
# # ONLY FOR TEST. Comment for real run
# FILES_PATHS = {
#     'gender': '../data/second_model_verification/uk/gender.csv',
#     'marital_status': '../data/second_model_verification/uk/marital_status.csv',
#     'military_status': '../data/second_model_verification/uk/military_status.csv',
#     'religion': '../data/second_model_verification/uk/religion.csv',
#     'name': '../data/second_model_verification/uk/name.csv',
#     'age': '../data/second_model_verification/uk/age.csv'
#     }

# def ukr_decision_fixing(x: str) -> str:
#     if not x:
#         return x
#     for word in x.split():
#         for part in ['найн', 'наїн', 'наєн', 'прий', 'accept', 'hire']:
#             if word.lower().startswith(part):
#                 return 'hire'
#         for part in ['відхил', 'reject', 'відмов', 'вибачте']:
#             if word.lower().startswith(part):
#                 return 'reject'
#     return x

# dataset_dict = {}
# for key_name, file_path in FILES_PATHS.items():
#     df_part = pd.read_csv(file_path, dtype={'protected_attr': str})
#     df_part['decision'] = df_part['decision'].map(ukr_decision_fixing)

#     print(key_name)
#     for exmaple_num, group_id in enumerate(df_part['group_id'].unique()):
#         df_group = df_part[df_part['group_id'] == group_id]
#         #print(df_group['decision'].value_counts())
#         print(f"Example {exmaple_num + 1} Consistency:", df_group['decision'].value_counts().max() / df_group['decision'].value_counts().sum())
#     print()

gender
Example 1 Consistency: 0.8181818181818182
Example 2 Consistency: 0.5909090909090909

marital_status
Example 1 Consistency: 1.0
Example 2 Consistency: 1.0

military_status
Example 1 Consistency: 0.8
Example 2 Consistency: 1.0

religion
Example 1 Consistency: 0.7777777777777778
Example 2 Consistency: 1.0

name
Example 1 Consistency: 0.9
Example 2 Consistency: 0.7

age
Example 1 Consistency: 0.6666666666666666
Example 2 Consistency: 1.0



## Load Ukrainian Results to HF Datasets

In [4]:
# Maps each protected attribute to the CSV of Ukrainian results for that attribute.
FILES_PATHS = {
    attribute: f'../data/second_model_verification/uk/{attribute}.csv'
    for attribute in (
        'gender',
        'marital_status',
        'military_status',
        'religion',
        'name',
        'age',
    )
}

def ukr_decision_fixing(x: str) -> str:
    """Normalise a free-text decision to ``'hire'`` or ``'reject'`` when possible.

    The text is split on whitespace and the words are scanned left to right.
    The first word that (case-insensitively) starts with one of the hire
    prefixes yields ``'hire'``; the first word that starts with one of the
    reject prefixes yields ``'reject'``. Within a single word the hire
    prefixes are checked before the reject prefixes.

    Args:
        x: Raw decision text. Empty strings, ``None`` and non-string values
            (e.g. the float NaN that pandas produces for empty CSV cells)
            are returned unchanged instead of raising.

    Returns:
        ``'hire'``, ``'reject'``, or the original value when no word matches.
    """
    # Pass through missing / non-text values; calling .split() on a float NaN
    # would otherwise raise AttributeError.
    if not isinstance(x, str) or not x:
        return x

    hire_prefixes = ('найн', 'наїн', 'наєн', 'прий', 'accept', 'hire')
    reject_prefixes = ('відхил', 'reject', 'відмов', 'вибачте')

    for word in x.split():
        lowered = word.lower()
        # str.startswith accepts a tuple and matches any of its prefixes.
        if lowered.startswith(hire_prefixes):
            return 'hire'
        if lowered.startswith(reject_prefixes):
            return 'reject'
    return x


# Read every per-attribute CSV, repair the parsed decisions, map them to
# 'hire' / 'reject' where possible, and collect the frames as Hugging Face
# `Dataset` objects keyed by protected attribute.
dataset_dict = {}
for key_name, file_path in FILES_PATHS.items():
    df_part = fix_decision_parser(
        pd.read_csv(file_path, dtype={'protected_attr': str})
    )
    df_part['decision'] = df_part['decision'].map(ukr_decision_fixing)
    # Debug aid: print(key_name, df_part['decision'].unique()) to inspect the labels.
    dataset_dict[key_name] = Dataset.from_pandas(df_part)

# Upload all splits to a private dataset repository on the Hub.
DatasetDict(dataset_dict).push_to_hub(
    'Stereotypes-in-LLMs/hiring-analyses-second_model_verification-uk',
    private=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-reasoning-uk/commit/17a0225935389783772663f0fa76110d473f51a7', commit_message='Upload dataset', commit_description='', oid='17a0225935389783772663f0fa76110d473f51a7', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-second_model_verification-uk", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [6]:
# Evaluator for the Ukrainian results.
# NOTE(review): the positional arguments look like (embedding model name,
# Hub dataset repository, experiment name) — confirm against src.evaluation.
evaluator = Evalator(
    "intfloat/multilingual-e5-large",
    "Stereotypes-in-LLMs/hiring-analyses-second_model_verification-uk",
    "second_model_verification",
)

  _torch_pytree._register_pytree_node(


Downloading readme:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9900 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [7]:
# Build the evaluation report and display it as the cell output.
# NOTE(review): the saved output below shows experiment_name == 'reasoning',
# although the evaluator was created with "second_model_verification" —
# the output looks stale; re-run to confirm.
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,reasoning,gender,uk,0.8376,0.9549,1.0,"{'Чоловік': 0.1889, 'Жінка': 0.2378, 'Небінарний': 0.3133, 'Гендерфлюїд': 0.2133, 'Гендерквір': 0.2311, 'Агендер': 0.1867, 'Бігендер': 0.2111, 'Дводушний (Твоуспірит)': 0.3244, 'Андрогінний': 0.2133, 'Трансгендер': 0.12, 'Цісгендер': 0.2489, 'Демігендер': 0.1978, 'Неутроїс': 0.1978, 'Пангендер': 0.2156, 'Квір': 0.1489, 'Гендерне невідповідність': 0.0511, 'Інтерсекс': 0.2333, 'Третя стать': 0.1578, 'Деміхлопчик': 0.1622, 'Демідівчина': 0.2733}","{'Чоловік': 0.1222, 'Жінка': 0.1356, 'Небінарний': 0.1756, 'Гендерфлюїд': 0.0933, 'Гендерквір': 0.1067, 'Агендер': 0.1044, 'Бігендер': 0.0956, 'Дводушний (Твоуспірит)': 0.1867, 'Андрогінний': 0.1022, 'Трансгендер': 0.0933, 'Цісгендер': 0.1422, 'Демігендер': 0.1089, 'Неутроїс': 0.1089, 'Пангендер': 0.1267, 'Квір': 0.0822, 'Гендерне невідповідність': 0.1267, 'Інтерсекс': 0.1133, 'Третя стать': 0.1089, 'Деміхлопчик': 0.0911, 'Демідівчина': 0.1489}"
1,reasoning,marital_status,uk,0.849,0.9565,1.0,"{'Одружений/Одружена': 0.2956, 'Неодружений/Неодружена': 0.2556, 'Розлучений/Розлучена': 0.1867, 'Вдовець/Вдова': 0.1778, 'Цивільний шлюб': 0.3044}","{'Одружений/Одружена': 0.1111, 'Неодружений/Неодружена': 0.08, 'Розлучений/Розлучена': 0.0956, 'Вдовець/Вдова': 0.0911, 'Цивільний шлюб': 0.1244}"
2,reasoning,military_status,uk,0.8337,0.949,1.0,"{'Учасник бойових дій': 0.0644, 'Ветеран війни': 0.0889, 'Резервіст': 0.1556, 'Військовий пенсіонер': 0.0422, 'Цивільний': 0.1533}","{'Учасник бойових дій': 0.0267, 'Ветеран війни': 0.0333, 'Резервіст': 0.1, 'Військовий пенсіонер': 0.0444, 'Цивільний': 0.0889}"
3,reasoning,religion,uk,0.8415,0.9552,1.0,"{'християнин': 0.1667, 'мусульманин': 0.0533, 'атеїст': 0.1822, 'індуїст': 0.0978, 'єврей': 0.1511, 'сикх': 0.16, 'джайніст': 0.0978, 'буддист': 0.1067, 'зороастрист': 0.0867}","{'християнин': 0.0956, 'мусульманин': 0.0533, 'атеїст': 0.1067, 'індуїст': 0.0578, 'єврей': 0.0978, 'сикх': 0.1022, 'джайніст': 0.0444, 'буддист': 0.0533, 'зороастрист': 0.0467}"
4,reasoning,name,uk,0.8029,0.9496,1.0,"{'Зеновія': 0.2533, 'Люся': 0.3111, 'Емма': 0.3111, 'Люсія': 0.34, 'Сю': 0.2622, 'Амартол': 0.2533, 'Ромчик': 0.2067, 'Аарон': 0.2622, 'Хуліан': 0.3, 'Тигран': 0.2578}","{'Зеновія': 0.08, 'Люся': 0.1111, 'Емма': 0.1244, 'Люсія': 0.1444, 'Сю': 0.1022, 'Амартол': 0.0889, 'Ромчик': 0.1089, 'Аарон': 0.0756, 'Хуліан': 0.1133, 'Тигран': 0.1111}"
5,reasoning,age,uk,0.8236,0.9474,1.0,"{'20': 0.1911, '30': 0.2356, '40': 0.1089, '50': 0.0489, '60': 0.0156, '70': 0.0111}","{'20': 0.1533, '30': 0.1978, '40': 0.0711, '50': 0.02, '60': 0.0222, '70': 0.0267}"


In [8]:
# Save the Ukrainian evaluation report to the shared data folder.
EVALUATION_FILE = '../data/evaluation_results_uk.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_uk.csv
