In [None]:
# データ量のカウント
import os
import pandas as pd
# Walk the benchmark tree and add up the number of data rows in every CSV file.
base_dir = "../Multilingual_safety_benchmark"
total_rows = 0
for subdir, _, files in os.walk(base_dir):
    # Full paths of the CSV files that live directly in this directory.
    csv_paths = [os.path.join(subdir, name) for name in files if name.endswith('.csv')]
    for file_path in csv_paths:
        print(file_path)
        try:
            row_count = len(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: report the unreadable file and keep counting the rest.
            print(f"Error reading {file_path}: {e}")
            continue
        total_rows += row_count
        print(f"Processed {file_path}: {row_count} rows")
print(f"Total rows across all CSV files: {total_rows}")


In [2]:
import pandas as pd
import os
# Sample rows for one language from the multilingual safety benchmark and save them.
def create_balanced_dataset(dataset_dir, language, csv_names, total_samples, output_file):
    """Build a CSV of ``input`` rows sampled evenly from several source CSVs.

    The sources are ``<dataset_dir>/<language>/<csv_name>.csv``. ``total_samples``
    is split as evenly as possible across ``csv_names``; when it does not divide
    evenly, the first ``total_samples % len(csv_names)`` files contribute one
    extra row each. Sampling uses ``random_state=42``. The output has a single
    ``input`` column, with rows grouped in the order of ``csv_names``.

    Args:
        dataset_dir: Root directory containing one sub-directory per language.
        language: Name of the language sub-directory to read from.
        csv_names: CSV base names (without the ``.csv`` extension).
        total_samples: Total number of rows to write.
        output_file: Destination passed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If ``csv_names`` is empty, a source has no ``input`` column,
            or a source has fewer rows than its share of the sample.
        FileNotFoundError: If a source CSV does not exist.
    """
    if not csv_names:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("csv_names must contain at least one CSV name")

    csv_dir = os.path.join(dataset_dir, language)
    samples_per_csv, remainder = divmod(total_samples, len(csv_names))

    # Collect the per-file samples and concatenate once at the end rather than
    # growing a DataFrame inside the loop.
    sampled_frames = []
    for i, csv_name in enumerate(csv_names):
        csv_path = os.path.join(csv_dir, f"{csv_name}.csv")

        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"404 ??? {csv_path} not found")
        data = pd.read_csv(csv_path)

        if "input" not in data.columns:
            raise ValueError(f"The column 'input' does not exist in {csv_path}")

        sample_size = samples_per_csv + (1 if i < remainder else 0)
        if sample_size > len(data):
            # Name the offending file; the pandas error alone does not say which one.
            raise ValueError(
                f"{csv_path} has only {len(data)} rows but {sample_size} samples were requested"
            )
        sampled_data = data.sample(n=sample_size, random_state=42)

        # Keep only the column we need.
        sampled_frames.append(sampled_data[["input"]])

    new_dataset = pd.concat(sampled_frames, ignore_index=True)
    new_dataset.to_csv(output_file, index=False)
    print(f"File was saved to -> {output_file}")
    

# Main: write one balanced sample file per language.
dataset_dir = "Multilingual_safety_benchmark"
csv_name_list = ["Unsafe_Instruction_Topic", "Crimes_And_Illegal_Activities", "insult"]
total_sample_count = 300
languages = ["en", "de", "hi", "ja","fr","ru","sp","zh"]
# Save into the folder the rest of the notebook uses for its generated datasets
# instead of the current working directory, and make sure it exists first.
output_dir = "dataset_for_sahara"
os.makedirs(output_dir, exist_ok=True)

for language in languages:
    output_csv = f"Multilingual_{language}_{total_sample_count}.csv"
    output_path = os.path.join(output_dir, output_csv)
    create_balanced_dataset(dataset_dir, language, csv_name_list, total_sample_count, output_path)


File was saved to -> dataset_for_sahara\Multilingual_en_300.csv
File was saved to -> dataset_for_sahara\Multilingual_de_300.csv
File was saved to -> dataset_for_sahara\Multilingual_hi_300.csv
File was saved to -> dataset_for_sahara\Multilingual_ja_300.csv
File was saved to -> dataset_for_sahara\Multilingual_fr_300.csv
File was saved to -> dataset_for_sahara\Multilingual_ru_300.csv
File was saved to -> dataset_for_sahara\Multilingual_sp_300.csv
File was saved to -> dataset_for_sahara\Multilingual_zh_300.csv


In [None]:
import pandas as pd
import os
# Build the per-language sample dataset from the multilingual benchmark.
def create_balanced_dataset(dataset_dir, language, csv_names, total_samples, output_file):
    """Build a CSV of ``input`` rows sampled evenly from several source CSVs.

    The sources are ``<dataset_dir>/<language>/<csv_name>.csv``. ``total_samples``
    is split as evenly as possible across ``csv_names``; when it does not divide
    evenly, the first ``total_samples % len(csv_names)`` files contribute one
    extra row each. Sampling uses ``random_state=42``. The output has a single
    ``input`` column, with rows grouped in the order of ``csv_names``.

    Args:
        dataset_dir: Root directory containing one sub-directory per language.
        language: Name of the language sub-directory to read from.
        csv_names: CSV base names (without the ``.csv`` extension).
        total_samples: Total number of rows to write.
        output_file: Destination passed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If ``csv_names`` is empty, a source has no ``input`` column,
            or a source has fewer rows than its share of the sample.
        FileNotFoundError: If a source CSV does not exist.
    """
    if not csv_names:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("csv_names must contain at least one CSV name")

    csv_dir = os.path.join(dataset_dir, language)
    samples_per_csv, remainder = divmod(total_samples, len(csv_names))

    # Collect the per-file samples and concatenate once at the end rather than
    # growing a DataFrame inside the loop.
    sampled_frames = []
    for i, csv_name in enumerate(csv_names):
        csv_path = os.path.join(csv_dir, f"{csv_name}.csv")

        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"404 ??? {csv_path} not found")
        data = pd.read_csv(csv_path)

        if "input" not in data.columns:
            raise ValueError(f"The column 'input' does not exist in {csv_path}")

        sample_size = samples_per_csv + (1 if i < remainder else 0)
        if sample_size > len(data):
            # Name the offending file; the pandas error alone does not say which one.
            raise ValueError(
                f"{csv_path} has only {len(data)} rows but {sample_size} samples were requested"
            )
        sampled_data = data.sample(n=sample_size, random_state=42)

        # Keep only the column we need.
        sampled_frames.append(sampled_data[["input"]])

    new_dataset = pd.concat(sampled_frames, ignore_index=True)
    new_dataset.to_csv(output_file, index=False)
    print(f"File was saved to -> {output_file}")
    

# Main: generate one balanced sample CSV per language inside the output folder.
dataset_dir = "Multilingual_safety_benchmark"
csv_name_list = [
    "Unsafe_Instruction_Topic",
    "Crimes_And_Illegal_Activities",
    "insult",
]
total_sample_count = 300
languages = ["en", "de", "hi", "ja", "fr", "ru", "sp", "zh"]

# Create the destination folder up front so every per-language write succeeds.
output_dir = "dataset_for_sahara"
os.makedirs(output_dir, exist_ok=True)

for language in languages:
    output_csv = "Multilingual_{}_{}.csv".format(language, total_sample_count)
    output_path = os.path.join(output_dir, output_csv)
    create_balanced_dataset(
        dataset_dir,
        language,
        csv_name_list,
        total_sample_count,
        output_path,
    )


In [3]:
from datasets import load_dataset
import json

# Load SQuAD and convert it to a JSON file.
def convert_dataset_to_json(dataset_name="squad", output_file="squad_dataset.json"):
    """Dump every split of a dataset to a single JSON file.

    The dataset is obtained with ``load_dataset(dataset_name)``. For each split,
    every example is reduced to its ``id``, ``title``, ``context``, ``question``
    and ``answers`` fields. The result is a dict mapping split name to a list of
    those records, written to ``output_file`` as UTF-8 with ``indent=4`` and
    non-ASCII characters kept as-is.

    NOTE(review): the default ``output_file`` is relative to the working
    directory, while ``json_to_csv_with_input`` reads
    ``dataset_for_sahara/squad_dataset.json`` by default - confirm which
    location is intended.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        output_file: Path of the JSON file to write.
    """
    field_names = ("id", "title", "context", "question", "answers")
    splits = load_dataset(dataset_name)

    dataset_json = {}
    for split_name in splits.keys():
        records = []
        for example in splits[split_name]:
            records.append({field: example[field] for field in field_names})
        dataset_json[split_name] = records

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(dataset_json, out, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    import os

    # Write the JSON where json_to_csv_with_input() looks for it by default
    # ("dataset_for_sahara/squad_dataset.json") rather than into the working
    # directory, and make sure that folder exists first.
    squad_json_path = "dataset_for_sahara/squad_dataset.json"
    os.makedirs(os.path.dirname(squad_json_path), exist_ok=True)
    convert_dataset_to_json(output_file=squad_json_path)
    print(f"データセットが '{squad_json_path}' に保存されました。")


データセットが 'dataset_for_sahara/squad_dataset.json' に保存されました。


In [7]:
import json
import csv
import random
# Read the JSON file, join each context with its question, and save the result as CSV.
def json_to_csv_with_input(json_file="dataset_for_sahara/squad_dataset.json", output_csv="squad_inputs.csv"):
    """Flatten a split-keyed JSON dataset into a one-column ``input`` CSV.

    ``json_file`` must hold a JSON object mapping split names to lists of
    examples. Each example becomes one CSV row whose only cell is
    ``context + " " + question``; a missing key is treated as an empty string.
    Any exception is caught and reported with ``print`` instead of being raised.

    Args:
        json_file: Path of the JSON file to read.
        output_csv: Path of the CSV file to write.
    """
    try:
        with open(json_file, "r", encoding="utf-8") as src:
            splits = json.load(src)

        with open(output_csv, "w", encoding="utf-8", newline="") as dst:
            out = csv.writer(dst)
            out.writerow(["input"])  # header row

            # Every split (train, validation, ...) goes into the same file.
            for _split_name, records in splits.items():
                out.writerows(
                    [record.get("context", "") + " " + record.get("question", "")]
                    for record in records
                )

        print(f"CSVファイルが '{output_csv}' に保存されました。")
    except Exception as e:
        print(f"エラーが発生しました: {e}")
def create_random_sample_csv(input_csv="dataset_for_sahara/squad_inputs.csv", output_csv="dataset_for_sahara/squad300.csv", sample_size=300, seed=None):
    """Write a random sample of the data rows of a CSV to a new CSV.

    The first row of ``input_csv`` is treated as the header and copied
    unchanged. ``min(sample_size, number_of_data_rows)`` rows are drawn without
    replacement from the remaining rows. Any exception is caught and reported
    with ``print`` instead of being raised.

    Args:
        input_csv: Path of the CSV file to sample from.
        output_csv: Path of the CSV file to write.
        sample_size: Maximum number of data rows to keep.
        seed: Optional seed for a reproducible sample. With the default
            ``None`` the module-level ``random`` generator is used.
    """
    try:
        # newline="" lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are read back unchanged.
        with open(input_csv, "r", encoding="utf-8", newline="") as csvfile:
            rows = list(csv.reader(csvfile))
        if not rows:
            raise ValueError(f"{input_csv} is empty (no header row)")
        header = rows[0]
        data = rows[1:]

        # A private generator keeps a seeded draw from touching global random state.
        sampler = random if seed is None else random.Random(seed)
        sample = sampler.sample(data, min(sample_size, len(data)))

        with open(output_csv, "w", encoding="utf-8", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(sample)

        print(f"ランダムサンプルが '{output_csv}' に保存されました。")
    except Exception as e:
        print(f"エラーが発生しました: {e}")

# Main: flatten the SQuAD JSON to CSV, then draw the random sample from it.
if __name__ == "__main__":
    # The two functions have different default paths for the intermediate CSV
    # ("squad_inputs.csv" vs "dataset_for_sahara/squad_inputs.csv"), so pass the
    # path explicitly to make the sampler read the file that was just written.
    squad_inputs_csv = "squad_inputs.csv"
    json_to_csv_with_input(output_csv=squad_inputs_csv)
    create_random_sample_csv(input_csv=squad_inputs_csv)

CSVファイルが 'squad_inputs.csv' に保存されました。
ランダムサンプルが 'dataset_for_sahara/squad300.csv' に保存されました。
