# Togo Active Learning Experiment Runner

This notebook manages the execution of sampling experiments on the Togo soil fertility dataset.

Author: Livia Betti  
Date: July 2025

### To run this notebook, the following tasks should be completed:
1. Generate relevant groups in Togo. I have generated group assignments based on regions, but if there are other representative groups that might be useful, we can generate those as well.
2. Make initial samples representing Cluster Sampling and Convenience Sampling.
3. Assign (distance-based) costs for the Convenience Sampling setting.
4. (Optional) Run regressions on the initial sample (with no augmentation). This gives a better understanding of which initial samples are worth augmenting (ideally, initial samples that yield a small, positive R² score). If this step is skipped, the user should simply choose an initial sample.

## Imports

In [34]:
import os
import sys

# Directory containing the sampling tools; the notebook is executed from here.
RUN_DIR = "/home/libe2152/optimizedsampling/3_sampling/tools"

# Parent of RUN_DIR, placed at the front of sys.path (only once) so that
# modules living there can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(RUN_DIR, os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

# Switch the working directory to tools/ so relative paths (e.g. the
# experiment log files) resolve against it.
os.chdir(RUN_DIR)

# Echo the resulting environment for a quick sanity check.
print("CWD now:", os.getcwd())
print("sys.path[0]:", sys.path[0])


CWD now: /home/libe2152/optimizedsampling/3_sampling/tools
sys.path[0]: /home/libe2152/optimizedsampling/3_sampling


## Core config values

In [None]:
from pathlib import Path
import subprocess

# === Shared Config ===
# Repository root; every other path below is built from it.
base_dir = Path("/home/libe2152/optimizedsampling")
# Training entry point that run_setting1 launches as a subprocess.
script = base_dir / "train.py"
# YAML config handed to the training script via --cfg.
cfg = base_dir / "3_sampling/configs/togo/RIDGE.yaml"

# Matrix file passed as --similarity_matrix_path for the "similarity" method.
sim_matrix_path = base_dir / "0_data/cosine_similarity/togo/cosine_similarity_train_test.npz"
# Distance matrix path; currently unset (empty string).
dist_matrix_path = "" 

# Group assignment file and group label, passed as --group_assignment_path /
# --group_type for the group-aware methods ("poprisk",
# "match_population_proportion").
group_path = base_dir / "0_data/groups/togo/region_assignment.pkl"
group_type = "regions"

# Plain-text logs with one experiment name per line. Both are relative paths,
# so they resolve against the current working directory.
logfile = "completed_experiments_togo.log"
failed_log = "failed_experiments_togo.log"

# Experiment grid: each seed is run with every method and every budget.
seeds = [1, 42, 123, 456, 789, 1234, 5678, 9101, 1213, 1415]
methods = ["random", "greedycost", "poprisk", "similarity"]
budgets = [500, 1000, 5000, 10000]

# Initial-set identifier (passed as --initial_set_str) and dataset tag; both
# are also embedded in every experiment name.
init_name = "empty_initial_set"
dataset = "togo"


In [None]:
def is_completed(exp_name):
    """Tell whether an experiment is already recorded in the completed log.

    The log at ``logfile`` holds one experiment name per line. Surrounding
    whitespace is ignored on both the log entries and ``exp_name``.

    Args:
        exp_name: Name of the experiment to look up.

    Returns:
        True if the name appears in the log; False if it does not or if the
        log file does not exist yet.
    """
    try:
        handle = open(logfile)
    except FileNotFoundError:
        # No log yet means nothing has completed.
        return False

    with handle:
        wanted = exp_name.strip()
        recorded = [entry.strip() for entry in handle]
    return wanted in recorded

def log_result(exp_name, success):
    """Append an experiment name to the completed or the failed log.

    Args:
        exp_name: Name of the experiment that just finished.
        success: Truthy to record it in ``logfile``; falsy to record it in
            ``failed_log``.
    """
    if success:
        destination = logfile
    else:
        destination = failed_log

    # Append mode keeps earlier entries and creates the file on first use.
    with open(destination, "a") as sink:
        sink.write("{}\n".format(exp_name))


## Cost-related arguments

In [36]:
# Module-level cost settings for a cluster-based, region-aware cost.
# NOTE(review): run_setting1 does not read these values; it assigns its own
# cost_func / cost_name locals for the cost functions it sweeps.
cost_func = "cluster_based"
cost_name = "region_aware_unit_cost"

# Optional: the settings below are specific to the region-aware cost.
# Pickle file with the unit assignments, the unit label, and the number of
# points per unit.
unit_assignment_path = "/home/libe2152/optimizedsampling/0_data/groups/togo/ea_assignments_dict.pkl"
unit_type = "cluster"
points_per_unit = 10

# Pickle file with the region assignments, plus the per-unit costs for the
# in-region and out-of-region cases.
region_assignment_path = "/home/libe2152/optimizedsampling/0_data/groups/togo/prefecture_assignment.pkl"
in_region_unit_cost = 10
out_of_region_unit_cost = 15


## Method-related arguments

In [None]:
def append_method_flags(method):
    """Return the extra command-line flags required by a sampling method.

    Flags produced per method:
      * "similarity": ``--similarity_matrix_path`` (from ``sim_matrix_path``).
      * "diversity": ``--distance_matrix_path`` (from ``dist_matrix_path``).
      * "poprisk": ``--util_lambda 0.1`` plus the group flags.
      * "match_population_proportion": the group flags only.
    Any other method needs no extra flags.

    Args:
        method: Name of the sampling method.

    Returns:
        The flags joined by single spaces, or an empty string when the
        method needs none.

    Raises:
        ValueError: If ``method`` is "diversity" but ``dist_matrix_path`` is
            empty, since there is then no distance matrix to pass.
    """
    flags = []

    if method == "similarity":
        flags.append(f"--similarity_matrix_path {sim_matrix_path}")
    elif method == "diversity":
        # The distance flag must point at the distance matrix, not at the
        # similarity matrix; fail loudly rather than emit a flag with no value.
        if not dist_matrix_path:
            raise ValueError(
                "dist_matrix_path must be set to use the 'diversity' method"
            )
        flags.append(f"--distance_matrix_path {dist_matrix_path}")

    if method in ("poprisk", "match_population_proportion"):
        if method == "poprisk":
            flags.append("--util_lambda 0.1")
        # Both group-aware methods need the group assignment and its label.
        flags.append(f"--group_assignment_path {group_path}")
        flags.append(f"--group_type {group_type}")

    return " ".join(flags)


## Run

In [None]:
def run_setting1(seed):
    """Run every (cost function, method, budget) experiment for one seed.

    For each cost function ("uniform", "convenience_based"), each entry of
    ``methods`` and each entry of ``budgets``, this builds an experiment
    name, skips it if ``is_completed`` reports it as done, and otherwise
    launches the training script as a subprocess. The experiment name is
    then appended to the completed or failed log depending on the
    subprocess exit code.

    The command is passed to ``subprocess.call`` as an argument list (no
    shell). Options whose value is empty are left off the command line
    entirely: rendering them as a bare flag would leave the flag without a
    value, directly followed by the next flag.
    NOTE(review): this assumes train.py falls back to its defaults for
    absent options -- confirm against its argument parser.

    Args:
        seed: Random seed forwarded to the training script and embedded in
            the experiment name.
    """
    import shlex  # only needed here, to tokenize append_method_flags() output

    for cost_fn in ["uniform", "convenience_based"]:
        print(f"Running with COST_FN={cost_fn}, SEED={seed}")

        # These settings depend only on cost_fn, so resolve them once per
        # cost function instead of once per (method, budget) pair.
        cost_name = cost_fn
        if cost_fn == "convenience_based":
            cost_func = "pointwise_by_array"
            cost_array_path = base_dir / "0_data/costs/togo/convenience_costs/distance_based_costs_top1_urban.pkl"
        else:
            cost_func = "uniform"
            cost_array_path = ""
        util_lambda = 0.5  # default; method-specific flags may override it

        # Unit/region options are not used in this setting. They stay listed
        # here (empty) so the full option set is visible, and are skipped
        # below because they carry no value.
        optional_flags = {
            "--unit_assignment_path": "",
            "--unit_type": "",
            "--points_per_unit": "",
            "--region_assignment_path": "",
            "--in_region_unit_cost": "",
            "--out_of_region_unit_cost": "",
        }

        for method in methods:
            for budget in budgets:
                # === Build experiment name
                if method in ["poprisk", "match_population_proportion"]:
                    exp_name = f"{dataset}_{init_name}_cost_{cost_name}_method_{method}_{group_type}_budget_{budget}_seed_{seed}"
                else:
                    exp_name = f"{dataset}_{init_name}_cost_{cost_name}_method_{method}_budget_{budget}_seed_{seed}"

                if is_completed(exp_name):
                    print(f"Skipping already completed: {exp_name}")
                    continue

                argv = [
                    "python", str(script),
                    "--cfg", str(cfg),
                    "--exp-name", exp_name,
                    "--sampling_fn", method,
                    "--budget", str(budget),
                    "--initial_set_str", init_name,
                    "--id_path", "",
                    "--seed", str(seed),
                    "--cost_func", cost_func,
                    "--cost_name", cost_name,
                ]
                for flag, value in optional_flags.items():
                    if value != "":
                        argv += [flag, str(value)]
                argv += [
                    "--group_assignment_path", "",
                    "--group_type", group_type,
                    "--util_lambda", str(util_lambda),
                ]
                if cost_array_path:
                    argv += ["--cost_array_path", str(cost_array_path)]
                # Method-specific flags come last, so a repeated option
                # (e.g. --util_lambda) appears after the default given above.
                argv += shlex.split(append_method_flags(method))

                print(f"▶️ Running: {exp_name}")
                ret = subprocess.call(argv)

                if ret != 0:
                    print(f"Failed: {exp_name}")
                    log_result(exp_name, success=False)
                else:
                    print(f"Completed: {exp_name}")
                    log_result(exp_name, success=True)


In [None]:
# Launch the full experiment sweep: one run_setting1 pass per configured seed.
for seed in seeds:
    run_setting1(seed)