In [None]:
data_dir = "/home/omadbek/projects/Sherlock/custom_data"

In [13]:
import os
import json
import pandas as pd
import shutil

In [1]:
## Test - table statistics
def create_finetuning_json(
    labels_path: str,
    data_path: str,
    label_set: dict,
    output_json_path: str,
    sample_size: int = 5,
    table_src: bool = False
):
    """
    Reads a labels.parquet and data.parquet, merges them by column index,
    then constructs a list of { "instruction": ..., "input": ..., "output": ... } dicts 
    for LLM fine-tuning. Writes that list to output_json_path as a single JSON array.
    Now also computes: std, average, mode, median, max, min (rounded per spec).

    Rules for statistics:
    - If all sampled values are numeric, compute stats on those numeric values.
    - Otherwise, compute stats on the lengths of the sampled values.
    - Round floating‐point stats to two decimal places. If a stat is an integer, show it without decimals.
    """

    def _format_stat(val: float) -> str:
        """
        Round floats to two decimals. If the float is integer‐equivalent, return as int string.
        """
        if isinstance(val, float) and val.is_integer():
            return str(int(val))
        return f"{val:.2f}"

    # 1) Load both Parquet files and reset_index so the column index becomes a real column
    labels_df = pd.read_parquet(labels_path).reset_index()
    data_df   = pd.read_parquet(data_path).reset_index()

    # 2) Detect which column holds the index in each DataFrame
    if "__index_level_0__" in labels_df.columns:
        labels_idx_col = "__index_level_0__"
    else:
        labels_idx_col = "index"

    if "__index_level_0__" in data_df.columns:
        data_idx_col = "__index_level_0__"
    else:
        data_idx_col = "index"

    # 3) Rename those columns for clarity: index → col_idx, type → label
    labels_df = labels_df.rename(columns={labels_idx_col: "col_idx", "type": "label"})
    data_df   = data_df.rename(columns={data_idx_col: "col_idx", "values": "values"})

    # 4) Drop any rows with label "__none__"
    labels_df = labels_df[labels_df["label"] != "__none__"]

    # 5) Merge on col_idx (inner join)
    merged = pd.merge(labels_df, data_df, on="col_idx", how="inner")

    # 6) Prepare containers
    instances = []
    all_labels_list = label_set["label_set"]  # e.g. ["age","gender","date",...]

    # 7) Iterate over every row (no tqdm)
    for idx, row in enumerate(merged.itertuples(index=False)):
        orig_label = row.label
        raw_vals   = row.values

        # 7a) Parse raw_vals (string or list)
        if isinstance(raw_vals, str):
            vals = raw_vals.split(",")
        else:
            vals = list(raw_vals)

        # 7b) Convert all vals to strings (strip whitespace), deduplicate & sample up to sample_size
        vals = [str(x).strip() for x in vals]
        unique_vals = pd.unique(vals)
        sampled = unique_vals.tolist()[:sample_size]

        # 7c) Optionally prepend “source” context if you have insert_source()
        if table_src:
            # e.g. context = insert_source(sampled, str(row.col_idx))
            context = sampled
        else:
            context = sampled

        # --- NEW: decide whether to compute stats on numeric values or on lengths ---
        # Try converting each sampled value to float
        numeric_conversion = []
        for x in context:
            try:
                numeric_conversion.append(float(x))
            except ValueError:
                numeric_conversion = []
                break  # if any conversion fails, we will switch to lengths

        if len(numeric_conversion) == len(context):
            # All sampled values are numeric
            stats_series = pd.Series(numeric_conversion)
        else:
            # At least one sampled value is non‐numeric → compute on string lengths
            length_list = [len(x) for x in context]
            stats_series = pd.Series(length_list)

        # Compute required summary statistics
        std_val    = stats_series.std()     # sample standard deviation (ddof=1)
        avg_val    = stats_series.mean()
        median_val = stats_series.median()
        mode_vals  = stats_series.mode()
        mode_val   = mode_vals.iloc[0] if not mode_vals.empty else stats_series.iloc[0]
        max_val    = stats_series.max()
        min_val    = stats_series.min()

        # Format each statistic per spec
        fmt_std    = _format_stat(std_val)
        fmt_avg    = _format_stat(avg_val)
        fmt_mode   = _format_stat(mode_val)
        fmt_median = _format_stat(median_val)
        fmt_max    = _format_stat(max_val)
        fmt_min    = _format_stat(min_val)

        stats_lines = (
            f"std: {fmt_std}, "
            f"average: {fmt_avg}, "
            f"mode: {fmt_mode}, "
            f"median: {fmt_median}, "
            f"max: {fmt_max}, "
            f"min: {fmt_min}"
        )

        # 8) Build the “OPTIONS” sub-block (each label on its own line, prefixed by "  - ")
        options_block = ""
        for lbl in all_labels_list:
            options_block += f"  - {lbl}\n"

        # 9) Build the full 'input' field as one multi-line string:
        #    We include:
        #      "<context>.\n"
        #      "<stats_lines>\n"
        #      "OPTIONS:\n"
        #      "<options_block>"
        input_text = f"{context}\n"
        input_text += f"{stats_lines}]\n"
        input_text += "CATEGORY:\n"
        #input_text += options_block

        # 10) Build the dict with lowercase keys "instruction", "input", and "output"
        inst = {
            "instruction": "Select the category which best matches the input.",
            "input": input_text,
            "output": f"{orig_label}\n"
        }
        instances.append(inst)

    # 11) Ensure the output folder exists, then dump the entire list as a single JSON array
    os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(instances, f, ensure_ascii=False, indent=2)

    print(f"[✓] Wrote {len(instances)} instances to {output_json_path}")


In [2]:
# Example usage for your training set:
if __name__ == "__main__":

    output_path = "./fine_tuned_data"

    if os.path.isdir(output_path):
        shutil.rmtree(output_path)

    os.makedirs(output_path)

    # Replace the paths below with your real filepaths:
    train_labels_file = f"{data_dir}/raw/train_labels.parquet"
    train_data_file   = f"{data_dir}/raw/train_data.parquet"
    val_labels_file   = f"{data_dir}/raw/validation_labels.parquet"
    val_data_file     = f"{data_dir}/raw/validation_data.parquet"

    # Your label_set dict, containing at least "label_set":
    label_set = {
        "name": "WHO CTA Labels",
        "label_set": ['age', 'case_status', 'contact_setting', 'date', 'gender', 'id',
       'location', 'medical_boolean', 'occupation', 'outcome', 'symptoms']
    }

    
    # Create train.json:
    create_finetuning_json(
        labels_path=train_labels_file,
        data_path=train_data_file,
        label_set=label_set,
        output_json_path=f"{output_path}/train_finetune.json",
        sample_size=5,
        table_src=False
    )
    

    # Create val.json:
    create_finetuning_json(
        labels_path=val_labels_file,
        data_path=val_data_file,
        label_set=label_set,
        output_json_path=f"{output_path}/val_finetune.json",
        sample_size=5,
        table_src=False
    )
    

[✓] Wrote 433 instances to output/train_finetune.json
[✓] Wrote 54 instances to output/val_finetune.json


  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique_vals = pd.unique(vals)
  unique

In [3]:
# paths to your fine-tuned JSONs
train_json = f"{output_path}/train_finetune.json"
val_json   = f"{output_path}/val_finetune.json"
full_json  = f"{output_path}/full_train_finetune.json"

# 1) Load
with open(train_json, "r", encoding="utf-8") as f:
    train_data = json.load(f)

with open(val_json, "r", encoding="utf-8") as f:
    val_data = json.load(f)

# 2) Merge (assuming each is a top-level list)
full_data = train_data + val_data

# 3) Write out
os.makedirs(os.path.dirname(full_json), exist_ok=True)
with open(full_json, "w", encoding="utf-8") as f:
    json.dump(full_data, f, ensure_ascii=False, indent=2)

print(f"Merged {len(train_data)} train + {len(val_data)} val → {len(full_data)} total instances")

Merged 433 train + 54 val → 487 total instances
