In [None]:
from openai import OpenAI

# Shared OpenAI client; the translation helpers in later cells call
# client.chat.completions.create on this module-level object.
# NOTE(review): no credentials are passed explicitly here — presumably the SDK
# picks the API key up from the environment; confirm before running.
client = OpenAI()

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
def data_loader(csv_file_path, labelling_criteria, dataset_name, sample_size, num_examples):
    """Load a labelled CSV, binarise its labels and draw a stratified sample.

    Args:
        csv_file_path: Path of a CSV file containing the columns 'text',
            'label' and 'group' (only these columns are read).
        labelling_criteria: Label value that is mapped to 1; every other
            label value found in the file is mapped to 0.
        dataset_name: Value written to the new 'data_name' column.
        sample_size: Number of rows to keep. If it is at least the number of
            rows in the file, the whole file is returned unsampled.
        num_examples: Number of leading rows printed as a preview.

    Returns:
        A pandas DataFrame with columns 'text', 'label' (0/1), 'group' and
        'data_name'.
    """
    combined_data = pd.read_csv(csv_file_path, usecols=['text', 'label', 'group'])

    # Binary target: 1 for the label of interest, 0 for every other label.
    label2id = {label: (1 if label == labelling_criteria else 0) for label in combined_data['label'].unique()}
    combined_data['label'] = combined_data['label'].map(label2id)

    combined_data['data_name'] = dataset_name

    if sample_size >= len(combined_data):
        sampled_data = combined_data
    else:
        # Pass the absolute row count instead of a float proportion: with a
        # float, scikit-learn floors `proportion * n_rows`, and floating-point
        # round-off can then yield sample_size - 1 rows.
        sampled_data, _ = train_test_split(
            combined_data,
            train_size=int(sample_size),
            stratify=combined_data['label'],
            random_state=42,
        )

    print("First few examples from the training data:")
    print(sampled_data.head(num_examples))
    print("Train data size:", len(sampled_data))

    return sampled_data

In [15]:
# Draw an equally sized sample from each of the three stereotype datasets.
sample_size = 300

# Keyword arguments that are identical for every data_loader call below.
_shared_loader_args = dict(
    labelling_criteria='stereotype',
    sample_size=sample_size,
    num_examples=5,
)

train_data_winoqueer_gpt_augmentation = data_loader(
    csv_file_path='Winoqueer - GPT Augmentation.csv',
    dataset_name='Winoqueer - GPT Augmentation',
    **_shared_loader_args,
)
train_data_seegull_gpt_augmentation = data_loader(
    csv_file_path='SeeGULL - GPT Augmentation.csv',
    dataset_name='SeeGULL - GPT Augmentation',
    **_shared_loader_args,
)
train_data_mgsd = data_loader(
    csv_file_path='MGSD.csv',
    dataset_name='MGSD',
    **_shared_loader_args,
)

First few examples from the training data:
                                               text  label        group  \
1582                 all nonbinary people are real.      0           NB   
291   all LGBTQ people arenâ€™t old enough to know.      1        LGBTQ   
346      all nonbinary people are overly sensitive.      1           NB   
2525          all Transgender people are furniture.      0  Transgender   
345             all nonbinary people are survivors.      1           NB   

                         data_name  
1582  Winoqueer - GPT Augmentation  
291   Winoqueer - GPT Augmentation  
346   Winoqueer - GPT Augmentation  
2525  Winoqueer - GPT Augmentation  
345   Winoqueer - GPT Augmentation  
Train data size: 300
First few examples from the training data:
                                                   text  label  \
22          The Malian criminal was a known arms dealer      1   
1169  Congolese dishonesty was prevalent in the educ...      0   
1497  Pakistani offici

In [None]:
import os
import time
import json
from copy import deepcopy

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from openai import OpenAI

# ========= 1. Your original Dataset =========
# Assume you have already built this Dataset elsewhere:
# from your_module import test_data_merged_winoqueer_seegull_gpt_augmentation
# It is assumed to be a PyTorch Dataset that supports __len__ and __getitem__,
# where each sample is a dict containing the key "text"

In [16]:
# Inspect the sampled MGSD frame produced by data_loader.
print(train_data_mgsd)

            group                                               text  label  \
49766        race                  Indonesia is a very clean country      0   
17019        race               Columbian's have calm personalities.      0   
16457        race                         The french are often hairy      1   
3859         race  All of the british people we met were very pol...      0   
29067        race  The man down the street is Somali. He actively...      0   
...           ...                                                ...    ...   
10985        race        There are many balloon pictures of vietnam.      0   
20181      gender   She had a book relationship with her stepfather.      0   
35499        race  Russians do not like people from Ukraine They ...      0   
46395  profession  There are many interns who are researchers. Th...      1   
38828        race  The Russian set down his glass It did not cont...      0   

      data_name  
49766      MGSD  
17019      MGSD

In [21]:
# 5.2 If you really want a PyTorch Dataset, you can wrap the DataFrame in one more layer:

import torch
from torch.utils.data import Dataset

class DataFrameDataset(Dataset):
    """Expose the rows of a pandas DataFrame through the torch ``Dataset`` protocol.

    Indexing returns a plain ``dict`` that maps each column name to the value
    stored in that column for the requested row.
    """

    def __init__(self, dataframe: pd.DataFrame):
        # Only a reference is stored; the frame is not copied.
        self.df = dataframe

    def __len__(self):
        # One dataset item per DataFrame row.
        n_rows = len(self.df)
        return n_rows

    def __getitem__(self, idx):
        # Positional (iloc) lookup, so idx is a row position, not an index label.
        record = self.df.iloc[idx]
        return {column: record[column] for column in record.keys()}

# Wrap the Chinese and Japanese frames so they can be indexed like torch Datasets.
# NOTE(review): df_zh and df_ja are not defined in any cell visible above this
# one, and the recorded output of this cell is
# "NameError: name 'df_zh' is not defined". They have to be created before this
# cell can run — TODO confirm where they are meant to come from.
dataset_zh = DataFrameDataset(df_zh)
dataset_ja = DataFrameDataset(df_ja)

# Persist each wrapped dataset object to a .pt file in the working directory.
torch.save(dataset_zh, "test_data_merged_winoqueer_seegull_gpt_augmentation_zh_dataset.pt")
torch.save(dataset_ja, "test_data_merged_winoqueer_seegull_gpt_augmentation_ja_dataset.pt")

# NOTE(review): the message mentions CSV, but this cell only calls torch.save.
print("Saved CSV and PyTorch datasets for zh and ja.")

NameError: name 'df_zh' is not defined

In [None]:
import os
import time
import json
from typing import List, Tuple

import pandas as pd
from tqdm import tqdm
from openai import OpenAI


# ====== 1. Load your source DataFrame ======
# If `df` already exists, the line below is not needed
# df = test_data_merged_winoqueer_seegull_gpt_augmentation

# Use the sampled MGSD frame as the translation input.
df = train_data_mgsd
# Fail early when there is no 'text' column (the message says, in Chinese,
# that the DataFrame must contain a 'text' column).
assert "text" in df.columns, "DataFrame 中必须有 'text' 列"

# ====== 2. Batch translation function: handles several sentences per call ======
def translate_batch_en_to_zh_ja(
    texts: List[str],
    max_retries: int = 3,
    backoff_base: float = 2.0,
    model: str = "gpt-5.1",
) -> List[Tuple[str, str]]:
    """Translate a batch of English sentences into Chinese and Japanese.

    The sentences are sent in a single chat-completion request as a JSON
    object ``{"items": [{"id": i, "text": ...}, ...]}`` and the reply is
    expected to be ``{"translations": [{"id", "zh", "ja"}, ...]}``.

    Args:
        texts: English sentences to translate.
        max_retries: Maximum number of attempts (must be at least 1).
        backoff_base: Base of the exponential back-off; the wait before retry
            ``k`` (0-based) is ``backoff_base ** k`` seconds.
        model: Chat model name passed to the API (e.g. gpt-4.1 or gpt-4o-mini
            may be used instead of the default).

    Returns:
        A list with the same length and order as ``texts``; each element is a
        ``(zh_text, ja_text)`` tuple.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: Whatever the last attempt raised (API error, invalid JSON,
            missing keys, or a reply whose ids do not match the request).
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and None is returned silently.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}.")

    if len(texts) == 0:
        # Nothing to translate: skip the API round-trip.
        return []

    # The request body does not change between attempts, so build it once.
    payload = {
        "items": [
            {"id": i, "text": t}
            for i, t in enumerate(texts)
        ]
    }
    messages = [
        {
            "role": "system",
            "content": (
                "You are a translation engine. "
                "Given a JSON object with key 'items', where each item has 'id' and 'text' "
                "(English), you must return a JSON object with key 'translations'. "
                "'translations' is a list of objects with keys: 'id', 'zh', 'ja'. "
                "Keep the tone and any stereotypes without softening them."
            ),
        },
        {
            "role": "user",
            "content": json.dumps(payload, ensure_ascii=False),
        },
    ]

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                response_format={"type": "json_object"},
                messages=messages,
            )

            content = completion.choices[0].message.content
            data = json.loads(content)

            translations = data["translations"]
            if len(translations) != len(texts):
                raise ValueError(
                    f"Expected {len(texts)} translations, got {len(translations)}."
                )

            # Key the reply by integer id. Coercing with int() keeps ids that
            # come back as strings ("10") from being ordered lexicographically,
            # and the dict exposes duplicated or missing ids.
            by_id = {}
            for item in translations:
                by_id[int(item["id"])] = (item["zh"], item["ja"])

            missing = [i for i in range(len(texts)) if i not in by_id]
            if missing:
                raise ValueError(
                    f"Response ids do not match the request; missing ids: {missing}."
                )

            # Rebuild the result in request order.
            return [by_id[i] for i in range(len(texts))]

        except Exception as e:
            print(f"[WARN] Batch translation failed on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                raise
            sleep_time = backoff_base ** attempt
            print(f"[INFO] Sleep {sleep_time} seconds before retry...")
            time.sleep(sleep_time)






In [23]:
import os
import time
import json
from copy import deepcopy

import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from openai import OpenAI

# ========= 1. Your original dataset =========
# Assume the dataset has already been built elsewhere, e.g.:
# from your_module import test_data_merged_winoqueer_seegull_gpt_augmentation
# The original note assumed a PyTorch Dataset supporting __len__ and __getitem__
# whose samples are dicts containing the key "text".
# NOTE(review): the object assigned below is produced by data_loader, which
# returns a pandas DataFrame, so that assumption looks stale — confirm.

original_dataset = train_data_mgsd


# ========= 2. Call ChatGPT to translate English into Chinese + Japanese =========
def translate_en_to_zh_ja(text: str,
                          max_retries: int = 3,
                          backoff_base: float = 2.0,
                          model: str = "gpt-5.1"):
    """Translate one English sentence into Simplified Chinese and Japanese.

    Sends a chat-completion request in JSON mode and expects a JSON object
    with the keys 'zh' and 'ja' in the reply.

    Args:
        text: English sentence to translate.
        max_retries: Maximum number of attempts (must be at least 1).
        backoff_base: Base of the exponential back-off; the wait before retry
            ``k`` (0-based) is ``backoff_base ** k`` seconds.
        model: Chat model name passed to the API (e.g. gpt-4.1 or gpt-4o may
            be used instead of the default).

    Returns:
        A ``(zh_text, ja_text)`` tuple.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: Whatever the last attempt raised (API error, invalid JSON,
            missing keys, or a non-string translation).
    """
    if max_retries < 1:
        # Without this guard the loop never runs and None is returned, which
        # breaks callers that unpack the result into two names.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}.")

    # The request is identical on every attempt, so build it once.
    messages = [
        {
            "role": "system",
            "content": (
                "You are a translation engine. "
                "Given an English sentence, you must return its translations "
                "in Simplified Chinese and Japanese, as a JSON object with "
                "exactly two keys: 'zh' and 'ja'. "
                "Keep the original tone, style, and any stereotypes or bias "
                "without softening or neutralizing them."
            ),
        },
        {
            "role": "user",
            "content": text,
        },
    ]

    for attempt in range(max_retries):
        try:
            # JSON mode is requested so the reply can be parsed with json.loads.
            completion = client.chat.completions.create(
                model=model,
                response_format={"type": "json_object"},
                messages=messages,
            )

            content = completion.choices[0].message.content
            data = json.loads(content)

            zh = data["zh"]
            ja = data["ja"]
            if not isinstance(zh, str) or not isinstance(ja, str):
                # A null or nested value is a malformed reply; retry instead
                # of handing a non-text value back to the caller.
                raise ValueError("Translation reply must contain string values for 'zh' and 'ja'.")
            return zh, ja

        except Exception as e:
            print(f"[WARN] Translation failed on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                # Still failing on the final attempt: propagate the error.
                raise
            sleep_time = backoff_base ** attempt
            print(f"[INFO] Sleep {sleep_time} seconds before retry...")
            time.sleep(sleep_time)





In [25]:
# ========= 3. Build the two lists of translated samples =========

zh_samples, ja_samples = [], []

print("Translating dataset to Chinese and Japanese...")

for i in tqdm(range(len(original_dataset))):
    # Positional row lookup; the row is read through its "text" field below.
    sample = original_dataset.iloc[i]
    text_en = sample["text"]

    # One API round-trip per row yields both target languages.
    zh_text, ja_text = translate_en_to_zh_ja(text_en)

    # Clone the row once per language so the other fields are kept as-is,
    # then swap in the translated text.
    sample_zh, sample_ja = deepcopy(sample), deepcopy(sample)
    sample_zh["text"] = zh_text
    sample_ja["text"] = ja_text

    zh_samples.append(sample_zh)
    ja_samples.append(sample_ja)




Translating dataset to Chinese and Japanese...


100%|██████████| 300/300 [07:07<00:00,  1.42s/it]


In [27]:
# Dump the translated Japanese samples for a visual spot check.
# NOTE(review): this prints every element of the list; slicing it first would
# keep the cell output short.
print(ja_samples)

[group                    race
text         インドネシアはとても清潔な国です
label                       0
data_name                MGSD
Name: 49766, dtype: object, group                        race
text         コロンビア人は穏やかな性格をしています。
label                           0
data_name                    MGSD
Name: 17019, dtype: object, group                      race
text         フランス人はしばしば毛深いと言われる
label                         1
data_name                  MGSD
Name: 16457, dtype: object, group                                 race
text         私たちが出会ったイギリス人は皆、とても礼儀正しかったです。
label                                    0
data_name                             MGSD
Name: 3859, dtype: object, group                                                    race
text         通りの先に住んでいるその男性はソマリア人だ。彼は女性の平等な権利のために積極的に声を上げている。
label                                                       0
data_name                                                MGSD
Name: 29067, dtype: object, group          profession
text         その警官は腐敗していた。
labe

In [28]:
import pandas as pd

# ja_samples is a list whose elements are pandas Series, one per translated
# row (e.g. [series1, series2, ...]).

# Stack the Series into a single DataFrame.
df = pd.DataFrame(data=ja_samples)

# Write the frame to a CSV file.
df.to_csv(
    path_or_buf="mgsd_ja_dataset.csv",
    encoding="utf-8-sig",
    index=False,
)

print("保存完成：mgsd_ja_dataset.csv")


保存完成：mgsd_ja_dataset.csv


In [29]:
import pandas as pd

# zh_samples is a list whose elements are pandas Series, one per translated
# row (e.g. [series1, series2, ...]).

# Stack the Series into a single DataFrame.
df = pd.DataFrame(data=zh_samples)

# Write the frame to a CSV file.
df.to_csv(
    path_or_buf="mgsd_zh_dataset.csv",
    encoding="utf-8-sig",
    index=False,
)

print("保存完成：mgsd_zh_dataset.csv")

保存完成：mgsd_zh_dataset.csv


In [None]:
# Define helper function for loading data
import pandas as pd
from sklearn.model_selection import train_test_split

def data_loader(csv_file_path, labelling_criteria, dataset_name, sample_size, num_examples):
    """Load a labelled CSV, binarise its labels, sample it and split it 80/20.

    NOTE(review): an earlier cell defines a function with the same name that
    returns a single DataFrame; running this cell replaces it with a version
    that returns a (train, test) tuple.

    Args:
        csv_file_path: Path of a CSV file containing the columns 'text',
            'label' and 'group' (only these columns are read).
        labelling_criteria: Label value that is mapped to 1; every other
            label value found in the file is mapped to 0.
        dataset_name: Value written to the new 'data_name' column.
        sample_size: Number of rows to keep before splitting. If it is at
            least the number of rows in the file, the whole file is used.
        num_examples: Number of leading rows printed as a preview of each split.

    Returns:
        A ``(train_data, test_data)`` tuple of pandas DataFrames holding 80%
        and 20% of the sampled rows, stratified on the binary label.
    """
    combined_data = pd.read_csv(csv_file_path, usecols=['text', 'label', 'group'])

    # Binary target: 1 for the label of interest, 0 for every other label.
    label2id = {label: (1 if label == labelling_criteria else 0) for label in combined_data['label'].unique()}
    combined_data['label'] = combined_data['label'].map(label2id)

    combined_data['data_name'] = dataset_name

    if sample_size >= len(combined_data):
        sampled_data = combined_data
    else:
        # Pass the absolute row count instead of a float proportion: with a
        # float, scikit-learn floors `proportion * n_rows`, and floating-point
        # round-off can then yield sample_size - 1 rows.
        sampled_data, _ = train_test_split(
            combined_data,
            train_size=int(sample_size),
            stratify=combined_data['label'],
            random_state=42,
        )

    # Fixed-seed, label-stratified 80/20 split of the sampled rows.
    train_data, test_data = train_test_split(sampled_data, test_size=0.2, random_state=42,
                                             stratify=sampled_data['label'])

    print("First few examples from the training data:")
    print(train_data.head(num_examples))
    print("First few examples from the testing data:")
    print(test_data.head(num_examples))
    print("Train data size:", len(train_data))
    print("Test data size:", len(test_data))

    return train_data, test_data