# Zero-Shot Chain-of-Thought (CoT) Experiments

In [None]:
# %pip install langchain langchain-core langchain-community langchain-openai
# %pip install --force-reinstall typing-extensions==4.5
# %pip install --force-reinstall openai==1.8

## Load Libraries

In [1]:
import os
import sys
import pandas as pd
sys.path.append('..')
from src.prompt import PROMPTS
from src.evaluation import Evalator
from src.helpers import fix_decision_parser
from src.experiment_runner import run_experiment

from huggingface_hub import login
from datasets import Dataset, DatasetDict

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Log in to the Hugging Face Hub with the token from the HF_TOKEN environment
# variable (os.environ.get yields None when it is unset) and also store the
# token in the configured git credential helper.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

  _torch_pytree._register_pytree_node(


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/nazardrushchak/.cache/huggingface/token
Login successful


## Load Data

In [2]:
# Ukrainian and English sample sets; each is passed to run_experiment below
# as the input data for the corresponding language.
df_uk = pd.read_csv('../data/uk_data_samples.csv')
df_en = pd.read_csv('../data/en_data_samples.csv')

## `gpt-3.5-turbo-0125`: English Language experiment

In [3]:
from langchain_openai import ChatOpenAI
# Chat model for the English run: temperature 0, top_p 0 and a fixed seed
# (presumably chosen to make the outputs as repeatable as possible).
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", model_kwargs={"seed": 42, "top_p": 0.0}, temperature=0.0)

In [4]:
# Pipe the English zero-shot CoT prompt into the model; this chain is what
# run_experiment executes and what the military_status repair step re-uses.
chain = PROMPTS['zero-shot-cot_en'] | llm

In [5]:
# Run the English experiment. The returned mapping (displayed in the next
# cell) holds one result CSV path per protected group under folder_path/en.
file_paths = run_experiment(
    folder_path='../data/zero_shot_cot',
    chain=chain,
    data=df_en,
    lang='en',
    batch_size=5,
)

In [6]:
# Display the per-group result paths returned by run_experiment.
file_paths

{'gender': '../data/zero_shot_cot/en/gender.csv',
 'marital_status': '../data/zero_shot_cot/en/marital_status.csv',
 'military_status': '../data/zero_shot_cot/en/military_status.csv',
 'religion': '../data/zero_shot_cot/en/religion.csv',
 'name': '../data/zero_shot_cot/en/name.csv',
 'age': '../data/zero_shot_cot/en/age.csv'}

## Upload English Results to Hugging Face Datasets

In [5]:
# Result CSVs of the English run, keyed by protected group. The paths are
# spelled out here (rather than taken from `file_paths`) so the upload can be
# done without re-running the experiment.
FILES_PATHS = {
    group: f'../data/zero_shot_cot/en/{group}.csv'
    for group in ('gender', 'marital_status', 'military_status', 'religion', 'name', 'age')
}

import json  # used only by the military_status repair below; imported once, not per row

# Load every per-group result CSV, normalise the decisions, collapse duplicate
# samples, and push all groups to the Hugging Face Hub as one DatasetDict
# (one split per protected group).
dataset_dict = {}
for key_name, file_path in FILES_PATHS.items():
    # Read protected_attr as text so numeric-looking attribute values stay strings.
    df_part = pd.read_csv(file_path, dtype={'protected_attr': str})

    # FIX ONE ERROR IN THE DATA. Better to rerun and fix in dataset generation
    if key_name == 'military_status':
        # Re-query the model for every row whose decision is missing.
        # `chain` must be the English chain (PROMPTS['zero-shot-cot_en'] | llm)
        # when this cell runs.
        for ind in df_part[df_part.decision.isna()].index:
            val = df_part.loc[ind].to_dict()
            result = chain.batch([{"job_desc": val["Job Description"], "candidate_cv": val["CV"], "protected_group": "military_status", "protected_attr": val["protected_attr"]}])[0]
            # Treat the text between the last "{" and the first "}" after it
            # as the JSON answer; anything before it in the reply is ignored.
            result = json.loads("{"+ result.content.split("{")[-1].split("}")[0] + "}")
            df_part.loc[ind, 'decision'] = result['decision']
            df_part.loc[ind, 'feedback'] = result['feedback']
            df_part.loc[ind, 'raw_ai_decision'] = json.dumps(result)

    # A missing decision would otherwise surface as an opaque AttributeError
    # from float.lower(); stop with a message that names the split instead.
    n_missing = int(df_part['decision'].isna().sum())
    if n_missing:
        raise ValueError(
            f"{key_name}: {n_missing} row(s) still have no decision; "
            "re-run or repair them before uploading"
        )
    df_part['decision'] = df_part['decision'].map(str.lower)

    # Keep one row per sample: the first decision/feedback/raw answer in each group.
    # NOTE(review): groupby drops rows with NaN in any key column by default — confirm none exist.
    df_part = df_part.groupby(by=['candidate_id', 'job_id','CV','Job Description', 'Job Position', 'lang', 'protected_group', 'protected_attr', 'group_id']).agg({
        "decision": "first",
        "feedback": "first",
        "raw_ai_decision": "first",
    }).reset_index()
    dataset_dict[key_name] = Dataset.from_pandas(df_part)

DatasetDict(dataset_dict).push_to_hub('Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-en', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-en/commit/96e20e9bfdd56ea62f95afcc8ec4c8d773546937', commit_message='Upload dataset', commit_description='', oid='96e20e9bfdd56ea62f95afcc8ec4c8d773546937', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-en", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [7]:
# Build the evaluator for the uploaded English dataset.
# NOTE(review): the arguments look like (embedding model id, Hub dataset repo,
# experiment name) — confirm against src.evaluation.
evaluator = Evalator("intfloat/multilingual-e5-large", "Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-en", "zero_shot_cot")

  _torch_pytree._register_pytree_node(


Downloading readme:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/696k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/845k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/922k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/747k [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [8]:
# Compute the evaluation report (one row per protected group) and display it.
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,zero_shot_cot,gender,en,0.7901,0.9669,1.0,"{'Agender': 0.5778, 'Androgynous': 0.5778, 'Bigender': 0.5222, 'Cisgender': 0.5556, 'Demiboy': 0.54, 'Demigender': 0.48, 'Demigirl': 0.5578, 'Female': 0.56, 'Gender Nonconforming': 0.5378, 'Genderfluid': 0.5467, 'Genderqueer': 0.56, 'Intersex': 0.5178, 'Male': 0.4867, 'Neutrois': 0.5556, 'Non-Binary': 0.5822, 'Pangender': 0.4867, 'Queer': 0.5333, 'Third Gender': 0.48, 'Transgender': 0.5467, 'Two-Spirit': 0.4978}","{'Agender': 0.0556, 'Androgynous': 0.0644, 'Bigender': 0.0444, 'Cisgender': 0.0556, 'Demiboy': 0.0356, 'Demigender': 0.0911, 'Demigirl': 0.0533, 'Female': 0.0556, 'Gender Nonconforming': 0.0467, 'Genderfluid': 0.0467, 'Genderqueer': 0.0378, 'Intersex': 0.0489, 'Male': 0.08, 'Neutrois': 0.0467, 'Non-Binary': 0.06, 'Pangender': 0.0711, 'Queer': 0.0556, 'Third Gender': 0.0778, 'Transgender': 0.0511, 'Two-Spirit': 0.0556}"
1,zero_shot_cot,marital_status,en,0.8057,0.967,1.0,"{'Civil union': 0.4667, 'Divorced (Divorced)': 0.5111, 'Married (Husband/Wife)': 0.5244, 'Unmarried (Single)': 0.5267, 'Widower (Widow)': 0.5067}","{'Civil union': 0.06, 'Divorced (Divorced)': 0.0378, 'Married (Husband/Wife)': 0.0467, 'Unmarried (Single)': 0.0444, 'Widower (Widow)': 0.0511}"
2,zero_shot_cot,military_status,en,0.783,0.955,1.0,"{'Civilian': 0.5533, 'Military retiree': 0.4689, 'Participant in combat actions': 0.2711, 'Reservist': 0.5289, 'War veteran': 0.48}","{'Civilian': 0.0889, 'Military retiree': 0.04, 'Participant in combat actions': 0.2022, 'Reservist': 0.0689, 'War veteran': 0.0378}"
3,zero_shot_cot,religion,en,0.7901,0.9679,1.0,"{'atheist': 0.5489, 'buddhist': 0.4111, 'christian': 0.5111, 'hindu': 0.44, 'jain': 0.44, 'jew': 0.44, 'muslim': 0.4156, 'sikh': 0.4556, 'zoroastrian': 0.4178}","{'atheist': 0.1022, 'buddhist': 0.0533, 'christian': 0.0689, 'hindu': 0.0467, 'jain': 0.0733, 'jew': 0.0556, 'muslim': 0.0667, 'sikh': 0.0311, 'zoroastrian': 0.0644}"
4,zero_shot_cot,name,en,0.771,0.9147,1.0,"{'Aaron': 0.5289, 'Amartol': 0.5244, 'Emma': 0.5667, 'Khulian': 0.5289, 'Liusia': 0.54, 'Liusiia': 0.5489, 'Romchyk': 0.5156, 'Siu': 0.5133, 'Tyhran': 0.5422, 'Zenoviia': 0.5289}","{'Aaron': 0.0511, 'Amartol': 0.0333, 'Emma': 0.0533, 'Khulian': 0.0289, 'Liusia': 0.0444, 'Liusiia': 0.0444, 'Romchyk': 0.0556, 'Siu': 0.0533, 'Tyhran': 0.0378, 'Zenoviia': 0.0333}"
5,zero_shot_cot,age,en,0.7798,0.9433,1.0,"{'20': 0.4356, '30': 0.5489, '40': 0.4244, '50': 0.2911, '60': 0.16, '70': 0.0533}","{'20': 0.16, '30': 0.2733, '40': 0.1489, '50': 0.0156, '60': 0.1156, '70': 0.2222}"


In [9]:
# Destination CSV for the English evaluation report.
EVALUATION_FILE  = '../data/evaluation_results_en.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_en.csv


## `gpt-3.5-turbo-0125`: Ukrainian Language experiment

In [22]:
from langchain_openai import ChatOpenAI
# Same model configuration as the English experiment (temperature 0, top_p 0,
# fixed seed); re-created here so this section can be run on its own.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", model_kwargs={"seed": 42, "top_p": 0.0}, temperature=0.0)

In [23]:
# Pipe the Ukrainian zero-shot CoT prompt into the model. This rebinds `chain`,
# which until now referred to the English chain.
chain = PROMPTS['zero-shot-cot_uk'] | llm

In [24]:
# Run the Ukrainian experiment. The returned mapping (displayed in the next
# cell) holds one result CSV path per protected group under folder_path/uk.
# batch_size is 2 here versus 5 for the English run; the reason is not
# recorded in this notebook.
file_paths = run_experiment(
    folder_path='../data/zero_shot_cot',
    chain=chain,
    data=df_uk,
    lang='uk',
    batch_size=2,
)

In [25]:
# Display the per-group result paths returned by run_experiment.
file_paths

{'gender': '../data/zero_shot_cot/uk/gender.csv',
 'marital_status': '../data/zero_shot_cot/uk/marital_status.csv',
 'military_status': '../data/zero_shot_cot/uk/military_status.csv',
 'religion': '../data/zero_shot_cot/uk/religion.csv',
 'name': '../data/zero_shot_cot/uk/name.csv',
 'age': '../data/zero_shot_cot/uk/age.csv'}

## Upload Ukrainian Results to Hugging Face Datasets

In [10]:
# Result CSVs of the Ukrainian run, keyed by protected group. The paths are
# spelled out here (rather than taken from `file_paths`) so the upload can be
# done without re-running the experiment.
FILES_PATHS = {
    group: f'../data/zero_shot_cot/uk/{group}.csv'
    for group in ('gender', 'marital_status', 'military_status', 'religion', 'name', 'age')
}

def ukr_decision_fixing(x: str) -> str:
    """Map a free-text Ukrainian/English decision onto 'hire' or 'reject'.

    The text is split on whitespace and the words are scanned left to right.
    Each word is lower-cased and compared against the known prefixes; hire
    prefixes are checked before reject prefixes, and the first word that
    matches decides the result.

    Args:
        x: Raw decision text. Non-string values (for example the float NaN
            pandas uses for a missing cell) and empty strings are accepted.

    Returns:
        'hire' or 'reject' when a word matches a known prefix; otherwise the
        input value unchanged (including non-string and empty inputs).

    NOTE(review): negation is not detected — in "не найняти" the word
    "найняти" matches a hire prefix, so the result is 'hire'.
    """
    # Guard on type, not truthiness: float NaN is truthy and has no .split(),
    # so `if not x` alone lets a missing decision crash the whole map().
    if not isinstance(x, str) or not x:
        return x

    # str.startswith accepts a tuple and returns True if any prefix matches.
    hire_prefixes = ('найн', 'наїн', 'наєн', 'прий', 'accept', 'hire')
    reject_prefixes = ('відхил', 'reject', 'відмов', 'вибачте')

    for word in x.split():
        token = word.lower()  # lower-case once per word, not once per prefix
        if token.startswith(hire_prefixes):
            return 'hire'
        if token.startswith(reject_prefixes):
            return 'reject'
    return x


# Read each per-group result file, normalise the decisions, collapse duplicate
# samples, and upload everything to the Hugging Face Hub in one DatasetDict.
group_cols = ['candidate_id', 'job_id','CV','Job Description', 'Job Position', 'lang', 'protected_group', 'protected_attr', 'group_id']
# Within each group keep the first decision / feedback / raw model answer.
first_of = dict.fromkeys(["decision", "feedback", "raw_ai_decision"], "first")

dataset_dict = {}
for key_name, file_path in FILES_PATHS.items():
    # protected_attr is read as text so numeric-looking values stay strings.
    df_part = fix_decision_parser(pd.read_csv(file_path, dtype={'protected_attr': str}))
    # Fold Ukrainian/English free-text decisions into 'hire' / 'reject'.
    df_part['decision'] = df_part['decision'].map(ukr_decision_fixing)
    df_part = df_part.groupby(by=group_cols).agg(first_of).reset_index()
    dataset_dict[key_name] = Dataset.from_pandas(df_part)

DatasetDict(dataset_dict).push_to_hub('Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-uk', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-uk/commit/ef5dacd5449807c5f915c0032a2267a61e24dcfc', commit_message='Upload dataset', commit_description='', oid='ef5dacd5449807c5f915c0032a2267a61e24dcfc', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-uk", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [12]:
# Build the evaluator for the uploaded Ukrainian dataset.
# NOTE(review): the arguments look like (embedding model id, Hub dataset repo,
# experiment name) — confirm against src.evaluation.
evaluator = Evalator("intfloat/multilingual-e5-large", "Stereotypes-in-LLMs/hiring-analyses-zero_shot_cot-uk", "zero_shot_cot")

Downloading readme:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/958k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/984k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [13]:
# Compute the evaluation report (one row per protected group) and display it.
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,zero_shot_cot,gender,uk,0.8347,0.9654,1.0,"{'Інтерсекс': 0.0467, 'Агендер': 0.0333, 'Андрогінний': 0.0444, 'Бігендер': 0.0444, 'Гендерквір': 0.0378, 'Гендерне невідповідність': 0.0244, 'Гендерфлюїд': 0.0556, 'Дводушний (Твоуспірит)': 0.0978, 'Демігендер': 0.0489, 'Демідівчина': 0.0778, 'Деміхлопчик': 0.0356, 'Квір': 0.04, 'Небінарний': 0.0733, 'Неутроїс': 0.0667, 'Пангендер': 0.0467, 'Трансгендер': 0.0356, 'Третя стать': 0.0333, 'Цісгендер': 0.0533, 'Чоловік': 0.0378, 'Жінка': 0.0444}","{'Інтерсекс': 0.0178, 'Агендер': 0.0222, 'Андрогінний': 0.0156, 'Бігендер': 0.02, 'Гендерквір': 0.0133, 'Гендерне невідповідність': 0.0089, 'Гендерфлюїд': 0.0311, 'Дводушний (Твоуспірит)': 0.0689, 'Демігендер': 0.02, 'Демідівчина': 0.0533, 'Деміхлопчик': 0.0156, 'Квір': 0.0156, 'Небінарний': 0.0489, 'Неутроїс': 0.0378, 'Пангендер': 0.0222, 'Трансгендер': 0.0156, 'Третя стать': 0.0222, 'Цісгендер': 0.0244, 'Чоловік': 0.0133, 'Жінка': 0.0244}"
1,zero_shot_cot,marital_status,uk,0.8688,0.9649,1.0,"{'Вдовець/Вдова': 0.0489, 'Неодружений/Неодружена': 0.0644, 'Одружений/Одружена': 0.0533, 'Розлучений/Розлучена': 0.04, 'Цивільний шлюб': 0.0822}","{'Вдовець/Вдова': 0.02, 'Неодружений/Неодружена': 0.0267, 'Одружений/Одружена': 0.02, 'Розлучений/Розлучена': 0.0289, 'Цивільний шлюб': 0.0356}"
2,zero_shot_cot,military_status,uk,0.8509,0.9601,1.0,"{'Ветеран війни': 0.0356, 'Військовий пенсіонер': 0.0222, 'Резервіст': 0.0378, 'Учасник бойових дій': 0.0267, 'Цивільний': 0.0444}","{'Ветеран війни': 0.0022, 'Військовий пенсіонер': 0.0156, 'Резервіст': 0.0178, 'Учасник бойових дій': 0.0067, 'Цивільний': 0.0156}"
3,zero_shot_cot,religion,uk,0.8586,0.963,1.0,"{'атеїст': 0.0378, 'буддист': 0.0333, 'джайніст': 0.0267, 'зороастрист': 0.0222, 'мусульманин': 0.02, 'сикх': 0.0311, 'християнин': 0.0356, 'єврей': 0.04, 'індуїст': 0.0378}","{'атеїст': 0.0111, 'буддист': 0.0111, 'джайніст': 0.0044, 'зороастрист': 0.0044, 'мусульманин': 0.0067, 'сикх': 0.0133, 'християнин': 0.0178, 'єврей': 0.0178, 'індуїст': 0.0111}"
4,zero_shot_cot,name,uk,0.793,0.9617,1.0,"{'Аарон': 0.0822, 'Амартол': 0.0711, 'Емма': 0.0867, 'Зеновія': 0.0778, 'Люся': 0.0822, 'Люсія': 0.0978, 'Ромчик': 0.0578, 'Сю': 0.08, 'Тигран': 0.0689, 'Хуліан': 0.0889}","{'Аарон': 0.04, 'Амартол': 0.0378, 'Емма': 0.04, 'Зеновія': 0.0311, 'Люся': 0.0444, 'Люсія': 0.0556, 'Ромчик': 0.0244, 'Сю': 0.0289, 'Тигран': 0.04, 'Хуліан': 0.0422}"
5,zero_shot_cot,age,uk,0.8516,0.9574,1.0,"{'20': 0.0556, '30': 0.0622, '40': 0.0244, '50': 0.0156, '60': 0.0044, '70': 0.0044}","{'20': 0.0444, '30': 0.0511, '40': 0.0133, '50': 0.0044, '60': 0.0067, '70': 0.0067}"


In [14]:
# Destination CSV for the Ukrainian evaluation report.
EVALUATION_FILE  = '../data/evaluation_results_uk.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_uk.csv


: 