# Optimized Parameters Experiments

In [9]:
# %pip install langchain langchain-core langchain-community langchain-openai
# %pip install --force-reinstall typing-extensions==4.5
# %pip install --force-reinstall openai==1.8

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
Collecting typing-extensions<5,>=4.7 (from openai<2.0.0,>=1.10.0->langchain-openai)
  Using cached typing_extensions-4.10.0-py3-none-any.whl.metadata (3.0 kB)
Using cached typing_extensions-4.10.0-py3-none-any.whl (33 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issue

## Load Libraries

In [1]:
import os
import sys
import pandas as pd
sys.path.append('..')
from src.prompt import PROMPTS
from src.evaluation import Evalator
from src.helpers import fix_decision_parser
from src.experiment_runner import run_experiment

from huggingface_hub import login
from datasets import Dataset, DatasetDict

# Show full column contents (e.g. CV and feedback text) instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

# Authenticate with the Hugging Face Hub using the token from the HF_TOKEN environment
# variable; add_to_git_credential=True also stores it in the configured git credential helper.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

  _torch_pytree._register_pytree_node(


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/nazardrushchak/.cache/huggingface/token
Login successful


## Load Data

In [2]:
# Sample inputs for the two experiment languages: Ukrainian (uk) and English (en).
df_uk = pd.read_csv('../data/uk_data_samples.csv')
df_en = pd.read_csv('../data/en_data_samples.csv')

## `gpt-3.5-turbo-0125`: English Language experiment

In [3]:
from langchain_openai import ChatOpenAI
# "Optimized parameters": fixed seed, temperature 0 and top_p 0 to make the model's
# output as repeatable as possible. seed/top_p are supplied through model_kwargs,
# i.e. as extra parameters for the underlying API call.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", model_kwargs={"seed": 42, "top_p": 0.0}, temperature=0.0)

In [4]:
# Pipe the English baseline prompt into the model configured above.
chain = PROMPTS['baseline_prompt_en'] | llm

In [5]:
# Run the English experiment over df_en; the returned mapping (shown in the next
# cell) gives one result CSV per protected attribute under folder_path/<lang>/.
file_paths = run_experiment(
    folder_path='../data/optimized_parameters',
    chain=chain,
    data=df_en,
    lang='en',
    batch_size=50,
)

In [6]:
# Display the result-file mapping returned by run_experiment.
file_paths

{'gender': '../data/optimized_parameters/en/gender.csv',
 'marital_status': '../data/optimized_parameters/en/marital_status.csv',
 'military_status': '../data/optimized_parameters/en/military_status.csv',
 'religion': '../data/optimized_parameters/en/religion.csv',
 'name': '../data/optimized_parameters/en/name.csv',
 'age': '../data/optimized_parameters/en/age.csv'}

## Load English Results to HF Datasets

In [2]:
# Result CSVs of the English experiment, keyed by protected attribute.
FILES_PATHS = {
    attr: f'../data/optimized_parameters/en/{attr}.csv'
    for attr in ('gender', 'marital_status', 'military_status', 'religion', 'name', 'age')
}

# Build one Hugging Face Dataset per protected attribute and publish them together.
group_keys = [
    'candidate_id', 'job_id', 'CV', 'Job Description', 'Job Position',
    'lang', 'protected_group', 'protected_attr', 'group_id',
]
first_value = dict.fromkeys(['decision', 'feedback', 'raw_ai_decision'], 'first')

dataset_dict = {}
for split_name, csv_path in FILES_PATHS.items():
    # protected_attr is read as str so numeric-looking values (e.g. ages) stay text.
    raw_df = pd.read_csv(csv_path, dtype={'protected_attr': str})
    # Lower-case the decision labels so they compare consistently.
    raw_df['decision'] = raw_df['decision'].map(lambda decision: decision.lower())
    # Collapse to one row per key combination, keeping the first non-null value of
    # each result column. Note: pandas groupby drops rows whose key columns are NaN.
    split_df = raw_df.groupby(group_keys).agg(first_value).reset_index()
    dataset_dict[split_name] = Dataset.from_pandas(split_df)

DatasetDict(dataset_dict).push_to_hub('Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-en', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-en/commit/4414b68e7cf0e0cbfafb34993909c8233fe9bb54', commit_message='Upload dataset', commit_description='', oid='4414b68e7cf0e0cbfafb34993909c8233fe9bb54', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-en", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [4]:
# Evaluate the English results dataset pushed above. The last argument appears as
# `experiment_name` in the report; the first is presumably the embedding model
# used for the feedback-similarity scores — confirm in src/evaluation.py.
evaluator = Evalator("intfloat/multilingual-e5-large", "Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-en", "optimized_parameters")

  _torch_pytree._register_pytree_node(


Downloading readme:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/693k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/709k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/748k [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [5]:
# Build and display the English report (one row per protected group).
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,optimized_parameters,gender,en,0.7971,0.9622,1.0,"{'Agender': 0.6111, 'Androgynous': 0.6044, 'Bigender': 0.5311, 'Cisgender': 0.5956, 'Demiboy': 0.5778, 'Demigender': 0.52, 'Demigirl': 0.58, 'Female': 0.5978, 'Gender Nonconforming': 0.5822, 'Genderfluid': 0.5756, 'Genderqueer': 0.5911, 'Intersex': 0.5689, 'Male': 0.5356, 'Neutrois': 0.58, 'Non-Binary': 0.6311, 'Pangender': 0.5089, 'Queer': 0.5533, 'Third Gender': 0.5, 'Transgender': 0.5756, 'Two-Spirit': 0.5267}","{'Agender': 0.0578, 'Androgynous': 0.0422, 'Bigender': 0.0489, 'Cisgender': 0.0511, 'Demiboy': 0.0378, 'Demigender': 0.06, 'Demigirl': 0.0533, 'Female': 0.0622, 'Gender Nonconforming': 0.0689, 'Genderfluid': 0.0444, 'Genderqueer': 0.0422, 'Intersex': 0.0378, 'Male': 0.0578, 'Neutrois': 0.0311, 'Non-Binary': 0.0733, 'Pangender': 0.0667, 'Queer': 0.0311, 'Third Gender': 0.0711, 'Transgender': 0.0622, 'Two-Spirit': 0.0667}"
1,optimized_parameters,marital_status,en,0.8074,0.964,1.0,"{'Civil union': 0.4978, 'Divorced (Divorced)': 0.5356, 'Married (Husband/Wife)': 0.5489, 'Unmarried (Single)': 0.5578, 'Widower (Widow)': 0.5267}","{'Civil union': 0.0622, 'Divorced (Divorced)': 0.0333, 'Married (Husband/Wife)': 0.0467, 'Unmarried (Single)': 0.0556, 'Widower (Widow)': 0.0467}"
2,optimized_parameters,military_status,en,0.773,0.9501,1.0,"{'Civilian': 0.5867, 'Military retiree': 0.5067, 'Participant in combat actions': 0.28, 'Reservist': 0.5378, 'War veteran': 0.52}","{'Civilian': 0.0911, 'Military retiree': 0.0378, 'Participant in combat actions': 0.2289, 'Reservist': 0.0556, 'War veteran': 0.0467}"
3,optimized_parameters,religion,en,0.8173,0.9632,1.0,"{'atheist': 0.6022, 'buddhist': 0.4378, 'christian': 0.54, 'hindu': 0.4867, 'jain': 0.4844, 'jew': 0.48, 'muslim': 0.4667, 'sikh': 0.4756, 'zoroastrian': 0.4333}","{'atheist': 0.1267, 'buddhist': 0.0644, 'christian': 0.0778, 'hindu': 0.0689, 'jain': 0.0489, 'jew': 0.0622, 'muslim': 0.0667, 'sikh': 0.0311, 'zoroastrian': 0.0689}"
4,optimized_parameters,name,en,0.7767,0.9061,1.0,"{'Aaron': 0.5644, 'Amartol': 0.5422, 'Emma': 0.5889, 'Khulian': 0.5511, 'Liusia': 0.5689, 'Liusiia': 0.5822, 'Romchyk': 0.5489, 'Siu': 0.5467, 'Tyhran': 0.5556, 'Zenoviia': 0.5667}","{'Aaron': 0.0422, 'Amartol': 0.0378, 'Emma': 0.0489, 'Khulian': 0.0289, 'Liusia': 0.0556, 'Liusiia': 0.0422, 'Romchyk': 0.04, 'Siu': 0.0511, 'Tyhran': 0.0333, 'Zenoviia': 0.0444}"
5,optimized_parameters,age,en,0.7889,0.9361,1.0,"{'20': 0.4511, '30': 0.5689, '40': 0.4156, '50': 0.2689, '60': 0.1333, '70': 0.0578}","{'20': 0.1867, '30': 0.3089, '40': 0.1511, '50': 0.0044, '60': 0.1356, '70': 0.2067}"


In [6]:
# Save the English report to CSV.
EVALUATION_FILE  = '../data/evaluation_results_en.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_en.csv


## `gpt-3.5-turbo-0125`: Ukrainian Language experiment

In [16]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", model_kwargs={"seed": 42, "top_p": 0.0}, temperature=0.0)
# Note: same model settings as the English run (fixed seed, temperature=0, top_p=0).
# NOTE(review): the original note read "param because it is logical for companies /
# all next optimized param" — presumably these settings were chosen as what a company
# would realistically deploy, and later experiments reuse them; confirm with the author.

In [17]:
# Pipe the Ukrainian baseline prompt into the model configured above.
chain = PROMPTS['baseline_prompt_uk'] | llm

In [18]:
# Run the Ukrainian experiment over df_uk. Uses batch_size=25, half of the English
# run's 50 (the reason is not recorded in this notebook).
file_paths = run_experiment(
    folder_path='../data/optimized_parameters',
    chain=chain,
    data=df_uk,
    lang='uk',
    batch_size=25,
)

In [19]:
# Display the result-file mapping returned by run_experiment.
# NOTE(review): the saved output lists only the 'age' file, while the upload cell
# reads all six attribute files — presumably the others came from earlier runs; confirm.
file_paths

{'age': '../data/optimized_parameters/uk/age.csv'}

## Load Ukrainian Results to HF Datasets

In [7]:
# TODO: check reasoning for the decision
# Result CSVs of the Ukrainian experiment, keyed by protected attribute.
FILES_PATHS = {
    attr: f'../data/optimized_parameters/uk/{attr}.csv'
    for attr in ('gender', 'marital_status', 'military_status', 'religion', 'name', 'age')
}

def ukr_decision_fixing(x: str) -> str:
    """Normalize a free-form decision string to 'hire' or 'reject'.

    The text is split on whitespace and scanned word by word. The first word
    that starts (case-insensitively) with a known hire prefix yields 'hire';
    the first that starts with a known reject prefix yields 'reject'. Within a
    single word, hire prefixes are checked before reject prefixes.

    Args:
        x: Raw decision value. Expected to be a string, but missing values
            (None, or float NaN when mapped over a pandas column) are tolerated.

    Returns:
        'hire' or 'reject' when a prefix matches; otherwise ``x`` unchanged
        (including empty strings and non-string values).
    """
    hire_prefixes = ('найн', 'наїн', 'наєн', 'прий', 'accept', 'hire')
    reject_prefixes = ('відхил', 'reject', 'відмов', 'вибачте')

    # pandas represents a missing decision as float NaN, which is truthy and has
    # no .split(); pass every non-string (and the empty string) through untouched.
    if not isinstance(x, str) or not x:
        return x

    for word in x.split():
        lowered = word.lower()
        if lowered.startswith(hire_prefixes):
            return 'hire'
        if lowered.startswith(reject_prefixes):
            return 'reject'
    return x


# Build one Hugging Face Dataset per protected attribute and publish them together.
group_keys = [
    'candidate_id', 'job_id', 'CV', 'Job Description', 'Job Position',
    'lang', 'protected_group', 'protected_attr', 'group_id',
]
first_value = dict.fromkeys(['decision', 'feedback', 'raw_ai_decision'], 'first')

dataset_dict = {}
for split_name, csv_path in FILES_PATHS.items():
    # protected_attr is read as str so numeric-looking values (e.g. ages) stay text.
    parsed_df = fix_decision_parser(pd.read_csv(csv_path, dtype={'protected_attr': str}))
    # Map free-form Ukrainian/English decision wording onto 'hire' / 'reject'.
    parsed_df['decision'] = parsed_df['decision'].map(ukr_decision_fixing)
    # Collapse to one row per key combination, keeping the first non-null value of
    # each result column. Note: pandas groupby drops rows whose key columns are NaN.
    split_df = parsed_df.groupby(group_keys).agg(first_value).reset_index()
    dataset_dict[split_name] = Dataset.from_pandas(split_df)

DatasetDict(dataset_dict).push_to_hub('Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-uk', private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-uk/commit/7360607205176abc9efbc0b8e6ec233e940419a8', commit_message='Upload dataset', commit_description='', oid='7360607205176abc9efbc0b8e6ec233e940419a8', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# # load data from huggingface
# from datasets import load_dataset
# dataset = load_dataset("Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-uk", split="gender")
# dataset.to_pandas().head()

## Evaluation Scores

In [9]:
# Evaluate the Ukrainian results dataset pushed above. The last argument appears as
# `experiment_name` in the report; the first is presumably the embedding model
# used for the feedback-similarity scores — confirm in src/evaluation.py.
evaluator = Evalator("intfloat/multilingual-e5-large", "Stereotypes-in-LLMs/hiring-analyses-optimized_parameters-uk", "optimized_parameters")

Downloading readme:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/969k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Generating gender split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating marital_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating military_status split:   0%|          | 0/2250 [00:00<?, ? examples/s]

Generating religion split:   0%|          | 0/4050 [00:00<?, ? examples/s]

Generating name split:   0%|          | 0/4500 [00:00<?, ? examples/s]

Generating age split:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [10]:
# Build and display the Ukrainian report (one row per protected group).
df_report = evaluator.get_report()
df_report

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,optimized_parameters,gender,uk,0.8354,0.9571,1.0,"{'Інтерсекс': 0.0956, 'Агендер': 0.0889, 'Андрогінний': 0.1222, 'Бігендер': 0.1111, 'Гендерквір': 0.1, 'Гендерне невідповідність': 0.0467, 'Гендерфлюїд': 0.0867, 'Дводушний (Твоуспірит)': 0.1689, 'Демігендер': 0.0978, 'Демідівчина': 0.14, 'Деміхлопчик': 0.0756, 'Квір': 0.06, 'Небінарний': 0.14, 'Неутроїс': 0.1156, 'Пангендер': 0.1111, 'Трансгендер': 0.0422, 'Третя стать': 0.0667, 'Цісгендер': 0.1244, 'Чоловік': 0.0956, 'Жінка': 0.1067}","{'Інтерсекс': 0.0289, 'Агендер': 0.0444, 'Андрогінний': 0.06, 'Бігендер': 0.0578, 'Гендерквір': 0.0556, 'Гендерне невідповідність': 0.0378, 'Гендерфлюїд': 0.0378, 'Дводушний (Твоуспірит)': 0.0978, 'Демігендер': 0.0444, 'Демідівчина': 0.0733, 'Деміхлопчик': 0.04, 'Квір': 0.0244, 'Небінарний': 0.0733, 'Неутроїс': 0.0533, 'Пангендер': 0.0489, 'Трансгендер': 0.0333, 'Третя стать': 0.04, 'Цісгендер': 0.0533, 'Чоловік': 0.0378, 'Жінка': 0.0533}"
1,optimized_parameters,marital_status,uk,0.8562,0.9561,1.0,"{'Вдовець/Вдова': 0.0711, 'Неодружений/Неодружена': 0.1333, 'Одружений/Одружена': 0.1622, 'Розлучений/Розлучена': 0.0778, 'Цивільний шлюб': 0.1622}","{'Вдовець/Вдова': 0.0422, 'Неодружений/Неодружена': 0.0467, 'Одружений/Одружена': 0.0667, 'Розлучений/Розлучена': 0.0489, 'Цивільний шлюб': 0.0667}"
2,optimized_parameters,military_status,uk,0.8239,0.9518,1.0,"{'Ветеран війни': 0.0489, 'Військовий пенсіонер': 0.0356, 'Резервіст': 0.0733, 'Учасник бойових дій': 0.0289, 'Цивільний': 0.0778}","{'Ветеран війни': 0.0044, 'Військовий пенсіонер': 0.0133, 'Резервіст': 0.0333, 'Учасник бойових дій': 0.0156, 'Цивільний': 0.0378}"
3,optimized_parameters,religion,uk,0.8452,0.9556,1.0,"{'атеїст': 0.0911, 'буддист': 0.06, 'джайніст': 0.0378, 'зороастрист': 0.0422, 'мусульманин': 0.0244, 'сикх': 0.0556, 'християнин': 0.0711, 'єврей': 0.0756, 'індуїст': 0.0556}","{'атеїст': 0.04, 'буддист': 0.0133, 'джайніст': 0.0133, 'зороастрист': 0.0222, 'мусульманин': 0.0267, 'сикх': 0.0311, 'християнин': 0.0378, 'єврей': 0.0333, 'індуїст': 0.0222}"
4,optimized_parameters,name,uk,0.8323,0.9502,1.0,"{'Аарон': 0.1622, 'Амартол': 0.1556, 'Емма': 0.1867, 'Зеновія': 0.1733, 'Люся': 0.1911, 'Люсія': 0.2178, 'Ромчик': 0.1156, 'Сю': 0.1556, 'Тигран': 0.1711, 'Хуліан': 0.18}","{'Аарон': 0.0622, 'Амартол': 0.0422, 'Емма': 0.0733, 'Зеновія': 0.0644, 'Люся': 0.0689, 'Люсія': 0.0822, 'Ромчик': 0.0644, 'Сю': 0.0644, 'Тигран': 0.0622, 'Хуліан': 0.0667}"
5,optimized_parameters,age,uk,0.8454,0.9484,1.0,"{'20': 0.1133, '30': 0.1156, '40': 0.0556, '50': 0.0267, '60': 0.0067, '70': 0.0067}","{'20': 0.0867, '30': 0.0889, '40': 0.0333, '50': 0.0044, '60': 0.02, '70': 0.02}"


In [11]:
# Save the Ukrainian report to CSV.
EVALUATION_FILE  = '../data/evaluation_results_uk.csv'
evaluator.save_report(df_report, EVALUATION_FILE)

Report saved to ../data/evaluation_results_uk.csv
