In [None]:
import pandas as pd

# Read the spreadsheet and keep only the rows whose
# 'Bloom Train Data Percentage' value is strictly greater than zero.
df = pd.read_excel('SIB-200 languages - ACL.xlsx')
has_bloom_data = df['Bloom Train Data Percentage'].gt(0)
filtered_languages = df.loc[has_bloom_data]
# Last expression of the cell: rendered as the cell output.
filtered_languages

In [None]:
import gc
import os
import subprocess
import sys

import torch

# Run configuration (plain Python constants, not OS environment variables).
# Numeric settings are kept as strings because they are passed verbatim as
# command-line arguments to the training script further down.

# Model selection
MODEL_TYPE = "bloom"
MODEL = "bigscience/bloom-1b1"

# Training hyperparameters
SEED = "42"
NUM_EPOCHS = "10"
BATCH_SIZE = "16"
MAX_LENGTH = "164"
SAVE_STEPS = "500000"

# Input data root and output root; the output root is created if missing.
dir_name = "sib-200/data/annotated"
output_directory = "output_sib200"
os.makedirs(output_directory, exist_ok=True)

# Fine-tune, evaluate and predict with MODEL once per entry of the
# 'Folder Name' column of `filtered_languages`, then delete the bulky
# checkpoint/tokenizer artifacts from each run's output directory.
# NOTE(review): `[:1]` limits the run to the first filtered language only --
# presumably a smoke-test limit; drop the slice to process every language.

# Artifacts deleted from OUTPUT_DIR after each run (loop-invariant, so built once).
files_to_remove = [
    "pytorch_model.bin",
    "sentencepiece.bpe.model",
    "tokenizer.json",
    "tokenizer_config.json",
    "config.json",
    "training_args.bin",
    "special_tokens_map.json",
    "sentencepiece.model",
]

# Languages whose training subprocess exited with a non-zero status.
failed_languages = []

for SRC_LANG_DIR in filtered_languages['Folder Name'][:1]:
    # Training runs in a child process, so this only frees memory held by
    # the notebook kernel itself before the next run is launched.
    gc.collect()
    torch.cuda.empty_cache()
    # Extract SRC_LANG from directory name
    SRC_LANG = os.path.basename(SRC_LANG_DIR)
    print(SRC_LANG)

    OUTPUT_FILE = f"test_result_{SRC_LANG}"
    OUTPUT_PREDICTION = f"test_predictions_{SRC_LANG}"
    SRC_DATA_DIR = os.path.join(dir_name, SRC_LANG)
    # NOTE(review): the "_bert" suffix is kept as-is even though MODEL_TYPE
    # is "bloom"; renaming it would change where results are written.
    OUTPUT_DIR = f"{output_directory}/{MODEL}/{SRC_LANG}_bert"

    # Use the kernel's own interpreter rather than whatever `python` happens
    # to be first on PATH, so the script sees the same installed packages.
    completed = subprocess.run([
        sys.executable, 'sib-200/code/train_textclass.py',
        "--data_dir", SRC_DATA_DIR,
        "--model_type", MODEL_TYPE,
        "--model_name_or_path", MODEL,
        "--output_dir", OUTPUT_DIR,
        "--output_result", OUTPUT_FILE,
        "--output_prediction_file", OUTPUT_PREDICTION,
        "--max_seq_length", MAX_LENGTH,
        "--num_train_epochs", NUM_EPOCHS,
        "--learning_rate", "1e-5",
        "--per_gpu_train_batch_size", BATCH_SIZE,
        "--per_gpu_eval_batch_size", BATCH_SIZE,
        "--save_steps", SAVE_STEPS,
        "--seed", SEED,
        "--gradient_accumulation_steps", "2",
        "--labels", os.path.join(SRC_DATA_DIR, 'labels.txt'),
        "--do_train",
        "--do_eval",
        "--do_predict",
        "--overwrite_output_dir"
    ])

    # Record failures instead of silently ignoring the exit status; keep
    # going so one bad language does not block the remaining ones.
    if completed.returncode != 0:
        failed_languages.append(SRC_LANG)
        print(
            f"Training failed for {SRC_LANG} (exit code {completed.returncode})",
            file=sys.stderr,
        )

    # Remove unnecessary files (done even after a failed run, so partial
    # artifacts do not accumulate on disk).
    for file_name in files_to_remove:
        file_path = os.path.join(OUTPUT_DIR, file_name)
        if os.path.exists(file_path):
            os.remove(file_path)

# Surface failures loudly once every language has been attempted.
if failed_languages:
    raise RuntimeError(
        "Training subprocess failed for: " + ", ".join(failed_languages)
    )