In [4]:
# --- Corrupt only a percentage of rows (e.g., 5%) ---
import os, json
import pandas as pd
import numpy as np
import random

# CONFIG
# CSV to read; the code below uses its "sequence" and "episode_id" columns.
input_file = "sequence_of_sets_formatted_Won.csv"
output_tag = "medium"          # preset name: "light", "medium", or "heavy" (keys of `presets`)
corruption_rate = 0.2        # fraction of rows to corrupt (0.2 = 20% of rows)
seed = 42                     # seeds both the row selection and the corruption RNG, for reproducibility

# Load the input CSV into a DataFrame
df = pd.read_csv(input_file)
import ast
def parse_sequence_cell(cell: str):
    """Turn one CSV cell holding a Python-literal sequence into a list.

    The cell text is evaluated with ``ast.literal_eval`` (literals only, no
    arbitrary code execution) and the resulting iterable is materialized as
    a list.
    """
    literal = ast.literal_eval(cell)
    return list(literal)
# Parse every raw "sequence" cell and keep the result next to the raw text.
parsed_column = df["sequence"].apply(parse_sequence_cell)
df["sequence_parsed"] = parsed_column

# Universe of known elements: every distinct item seen in any parsed
# sequence, in sorted order.
all_known = sorted(set().union(*df["sequence_parsed"]))
def ei_index(ei: str) -> int:
    """Return the integer formed by all digit characters of *ei*, in order.

    Every character for which ``str.isdigit`` is true is kept and the kept
    characters are concatenated before conversion, so ``"e1x2"`` yields 12.
    Returns -1 when no integer can be produced (no digits, digits ``int``
    cannot parse, or an input that cannot be scanned at all).
    """
    try:
        digits = "".join(filter(str.isdigit, ei))
        return int(digits)
    except Exception:
        # Deliberately broad: any failure means "no usable index".
        return -1
# Largest numeric index among the known elements; 0 when none has one.
valid_indices = [i for i in map(ei_index, all_known) if i >= 0]
max_idx = max(valid_indices, default=0)
# Ten synthetic "noise" names numbered just past the largest known index.
noise_pool = [f"e{max_idx + offset}" for offset in range(1, 11)]

# RNG wrapper (from previous cells)
class RNGWrapper(random.Random):
    """``random.Random`` extended with a Poisson sampler.

    The stdlib generator (inherited) and a NumPy ``Generator`` are both
    created from the same ``seed``; ``poisson`` draws from the NumPy one.
    """

    def __init__(self, seed=None):
        random.Random.__init__(self, seed)
        self._np_rng = np.random.default_rng(seed)

    def poisson(self, lam):
        """Draw one Poisson(lam) sample and return it as a plain ``int``."""
        draw = self._np_rng.poisson(lam)
        return int(draw)

# Corruption ops (defined below so this cell is self-contained)
def random_swap(seq, rng, num_swaps=1):
    """Return a copy of *seq* after up to *num_swaps* random position swaps.

    Each attempt draws two positions from ``rng.randrange``; an attempt that
    draws the same position twice is a no-op but still counts. Sequences
    shorter than two elements are returned as an unchanged copy without
    consuming any random draws. The input is never mutated.
    """
    shuffled = seq.copy()
    size = len(shuffled)
    if size >= 2:
        for _ in range(num_swaps):
            first = rng.randrange(size)
            second = rng.randrange(size)
            if first != second:
                shuffled[first], shuffled[second] = shuffled[second], shuffled[first]
    return shuffled

def random_deletions(seq, rng, p_delete_each=0.1):
    """Return a new list with elements of *seq* independently dropped.

    One ``rng.random()`` draw is made per element, in order; the element is
    kept only when the draw is strictly greater than *p_delete_each*.
    """
    kept = []
    for item in seq:
        if rng.random() > p_delete_each:
            kept.append(item)
    return kept

def random_replacements(seq, rng, p_replace_each=0.1, universe=None):
    """Return a new list where elements of *seq* are randomly substituted.

    For each element, one ``rng.random()`` draw is made; when it is below
    *p_replace_each* the element is replaced by ``rng.choice`` over the
    members of *universe* that differ from it (the element is kept if no
    such member exists).

    Args:
        seq: Input sequence; never mutated.
        rng: Object providing ``random()`` and ``choice()``.
        p_replace_each: Per-element replacement probability.
        universe: Candidate replacement values. When empty or ``None`` no
            replacement is possible and no random draws are consumed.

    Returns:
        A new list in every case, so callers can mutate the result without
        touching *seq*.
    """
    if not universe:
        # Return a copy rather than the caller's own object, consistent with
        # the other corruption ops, which always return a fresh list.
        return list(seq)
    out = []
    for x in seq:
        if rng.random() < p_replace_each:
            candidates = [u for u in universe if u != x]
            out.append(rng.choice(candidates) if candidates else x)
        else:
            out.append(x)
    return out

def random_insertions(seq, rng, k_inserts=1, insert_pool=None):
    """Return a new list with *k_inserts* random elements inserted into *seq*.

    Each insertion draws the element with ``rng.choice(insert_pool)`` and
    then the position with ``rng.randrange`` over every slot of the current
    (already grown) list, including the end.

    Args:
        seq: Input sequence; never mutated.
        rng: Object providing ``choice()`` and ``randrange()``.
        k_inserts: Number of insertions to perform.
        insert_pool: Values to insert. When empty or ``None`` nothing is
            inserted and no random draws are consumed.

    Returns:
        A new list in every case, so callers can mutate the result without
        touching *seq*.
    """
    # Copy before the early return so the empty-pool path does not hand the
    # caller's own object back.
    result = list(seq)
    if not insert_pool:
        return result
    for _ in range(k_inserts):
        ins = rng.choice(insert_pool)
        pos = rng.randrange(len(result) + 1)
        result.insert(pos, ins)
    return result

def corrupt_sequence(seq, rng, params, known_universe, noise_pool):
    """Run the full corruption pipeline on *seq* and return the result.

    Stages, in order: per-element deletions, per-element replacements from
    *known_universe*, a Poisson-distributed number of swaps, then a
    Poisson-distributed number of insertions from *known_universe* followed
    by insertions from *noise_pool*.

    Optional post-processing controlled by *params*:
      * ``as_set_then_sorted`` (default False): de-duplicate and sort by
        ``(ei_index(x), x)``.
      * ``ensure_nonempty`` (default True): if the result is empty, return a
        single element chosen from *seq*, or from *known_universe* when
        *seq* itself is empty.
    """
    survivors = random_deletions(seq, rng, params["p_delete_each"])
    replaced = random_replacements(
        survivors, rng, params["p_replace_each"], known_universe
    )

    swap_count = rng.poisson(params["swap_lambda"])
    result = random_swap(replaced, rng, swap_count)

    known_count = rng.poisson(params["add_lambda_known"])
    result = random_insertions(result, rng, known_count, known_universe)

    noise_count = rng.poisson(params["add_lambda_noise"])
    result = random_insertions(result, rng, noise_count, noise_pool)

    if params.get("as_set_then_sorted", False):
        result = sorted(set(result), key=lambda item: (ei_index(item), item))

    if params.get("ensure_nonempty", True) and not result:
        fallback_source = seq if seq else known_universe
        result = [rng.choice(fallback_source)]
    return result

# Presets (same as before; edit if needed)
# Each preset maps to the parameter dict consumed by corrupt_sequence:
# per-element delete/replace probabilities, Poisson means for the number of
# swaps and insertions, and the two post-processing flags.
presets = {
    "light": {
        "p_delete_each": 0.05,
        "p_replace_each": 0.05,
        "swap_lambda": 0.5,
        "add_lambda_known": 0.3,
        "add_lambda_noise": 0.1,
        "ensure_nonempty": True,
        "as_set_then_sorted": False,
    },
    "medium": {
        "p_delete_each": 0.15,
        "p_replace_each": 0.15,
        "swap_lambda": 1.5,
        "add_lambda_known": 0.8,
        "add_lambda_noise": 0.5,
        "ensure_nonempty": True,
        "as_set_then_sorted": False,
    },
    "heavy": {
        "p_delete_each": 0.30,
        "p_replace_each": 0.30,
        "swap_lambda": 3.0,
        "add_lambda_known": 1.5,
        "add_lambda_noise": 1.2,
        "ensure_nonempty": True,
        "as_set_then_sorted": False,
    },
}

# Select a random subset of rows to corrupt
# Fail early with a clear message; an out-of-range rate would otherwise only
# surface as an error from DataFrame.sample.
if not 0.0 <= corruption_rate <= 1.0:
    raise ValueError(f"corruption_rate must be within [0, 1], got {corruption_rate!r}")

rng = RNGWrapper(seed=seed)
n_total = len(df)
n_to_corrupt = int(round(corruption_rate * n_total))
# Index labels of the rows chosen for corruption (sampled without replacement).
to_corrupt_idx = set(df.sample(n=n_to_corrupt, random_state=seed).index)

# Apply corruption only to that subset
params = presets[output_tag]
out_rows = []
for idx, row in df.iterrows():
    original = row["sequence_parsed"]
    if idx in to_corrupt_idx:
        # NOTE(review): every corruption step is probabilistic, so a selected
        # row can come out identical to the original; it is still flagged
        # is_corrupted=True here. Confirm that this is the intended labelling.
        corrupted = corrupt_sequence(original, rng, params, all_known, noise_pool)
        is_corrupted = True
    else:
        corrupted = original[:]  # keep as-is
        is_corrupted = False
    out_rows.append({
        "episode_id": row["episode_id"],
        "original_sequence": json.dumps(original),
        "corrupted_sequence": json.dumps(corrupted),
        "is_corrupted": is_corrupted,
        "corruption_level": output_tag if is_corrupted else "none"
    })

out_df = pd.DataFrame(out_rows)

# Save to same folder as input
out_dir = os.path.dirname(os.path.abspath(input_file))
# round(), not int(): int() truncates binary float error, e.g.
# int(0.29 * 100) == 28, which would mislabel the output file.
pct_label = round(corruption_rate * 100)
out_path = os.path.join(out_dir, f"corrupted_{output_tag}_{pct_label}pct.csv")
out_df.to_csv(out_path, index=False)
print(f"Saved {n_to_corrupt}/{n_total} rows corrupted ({corruption_rate*100:.1f}%) -> {out_path}")


Saved 16/80 rows corrupted (20.0%) -> c:\Users\nimam\Desktop\desktop\floyds alg\game_data\dataProcessingV2\corrupted_medium_20pct.csv
