### Import packages

In [1]:
import json
import random
import pandas as pd
from collections import defaultdict
import shutil
import os

### Define the function that parses JSON-formatted replies

In [None]:
def clean_and_parse_json(raw):
    """Parse a reply that should contain JSON, tolerating Markdown code fences.

    Parameters
    ----------
    raw : str or missing value
        Reply text. It may be wrapped in a leading ```` ```json ```` / ```` ``` ````
        fence and a trailing ```` ``` ```` fence.

    Returns
    -------
    object
        The decoded JSON value. ``[]`` is returned when the reply is missing or
        empty, including a fenced block with nothing inside. ``None`` is
        returned when the reply cannot be parsed.
    """
    # Texts that are treated as "no content" rather than as a parse failure.
    empty_markers = ('""', "''", '[]', '')
    try:
        if pd.isna(raw) or raw.strip() in empty_markers:
            return []

        cleaned = raw.strip()

        # Remove the Markdown code fence around the payload, if present.
        if cleaned.startswith("```json"):
            cleaned = cleaned[len("```json"):].strip()
        if cleaned.startswith("```"):
            cleaned = cleaned[len("```"):].strip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3].strip()

        # A fence with nothing inside is an empty reply too; without this check
        # json.loads("") would raise and the reply would be reported as a failure.
        if cleaned in empty_markers:
            return []

        return json.loads(cleaned)
    except Exception:
        # Deliberate best effort: any reply that cannot be handled (invalid JSON,
        # non-string input, ...) is signalled to the caller as None.
        return None

### Parse the replies from the distilled data

In [None]:
# === Input: CSV log of the distilled replies ===
file_path = "gpt_reply_log.csv"

# === Read the log and decode every 'reply' cell into a JSON value ===
df = pd.read_csv(file_path)
df["reply_json"] = df["reply"].apply(clean_and_parse_json)

# === Basic statistics ===
# clean_and_parse_json returns None for replies it cannot parse, so a null
# 'reply_json' marks a failed row. The mask is computed once and reused below.
parsed_mask = df["reply_json"].notnull()

total = len(df)
success = parsed_mask.sum()
failure = (~parsed_mask).sum()
failed_indices = df.index[~parsed_mask].tolist()

print(f"successfully parse: {success}")
print(f"failly parse: {failure}")
print(f"Failed indices (first 10): {failed_indices[:10]}")

# === Keep only the rows whose reply parsed ===
df_success = df[parsed_mask]
success_data = df_success["reply_json"].tolist()

# Persist the parsed replies as one JSON list
with open("success.json", "w", encoding="utf-8") as f:
    json.dump(success_data, f, indent=2, ensure_ascii=False)

# # === Extract failed entries for manual debugging ===
# df_failure = df[df["reply_json"].isnull()]
# failure_data = df_failure["reply"].tolist()

# # # Save them into a text file for manual inspection
# with open("failure.txt", "w", encoding="utf-8") as f:
#     for entry in failure_data:
#         f.write(str(entry) + "\n\n---\n\n")


### Split the train/val datasets by sub-image group

In [2]:
# Split parameters
val_fraction = 0.15  # share of images that goes to the validation set
split_seed = 42      # fixed seed so the split is reproducible

# 1. Load the parsed QA dataset from success.json
with open("success.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Group QA pairs by sub_image (one entry per image)
image_to_qas = {}
for qa_list in data:
    if not isinstance(qa_list, list):
        continue
    for qa in qa_list:
        # Skip malformed entries (non-dict items, null or non-string sub_image)
        # instead of crashing on them.
        if not isinstance(qa, dict):
            continue
        image = qa.get("sub_image")
        if not isinstance(image, str):
            continue
        image = image.strip()
        if image:
            image_to_qas.setdefault(image, []).append(qa)

# 3. Seed the RNG and select the validation images.
#    At least one image is selected when any exist; the size is clamped to the
#    population so an empty dataset does not make random.sample raise ValueError.
random.seed(split_seed)
all_images = list(image_to_qas.keys())
val_size = min(len(all_images), max(1, int(val_fraction * len(all_images))))
val_images = set(random.sample(all_images, val_size))

# 4. Split QA pairs into training and validation sets based on image grouping,
#    so all QA pairs of one image land in the same set.
train_data, val_data = [], []
for image, qas in image_to_qas.items():
    if image in val_images:
        val_data.append(qas)
    else:
        train_data.append(qas)

# 5. Save the training and validation sets to JSON files
with open("train_set.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=2, ensure_ascii=False)

with open("val_set.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, indent=2, ensure_ascii=False)

print(f"✅ Split complete: train = {len(train_data)} images, val = {len(val_data)} images")


✅ Split complete: train = 33813 images, val = 5966 images


In [5]:
# 1. Read the nested validation QA data from val_set.json
with open("val_set.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)


# 2. Accumulator for the flattened samples
flattened_data = []

# 3. Emit one independent LLaVA-style sample per QA pair
for qa_list in raw_data:
    if isinstance(qa_list, list):
        for qa in qa_list:
            image_path = qa.get("sub_image", "").strip()
            question = qa.get("question", "").strip()
            answer = qa.get("answer", "").strip()
            level = qa.get("level")

            # The level is stored on both turns of the conversation.
            human_turn = {
                "from": "human",
                "value": "<image>\n " + question,
                "level": level,
            }
            gpt_turn = {
                "from": "gpt",
                "value": answer,
                "level": level,
            }

            flattened_data.append(
                {"image": image_path, "conversations": [human_turn, gpt_turn]}
            )

# 4. Write the flattened samples (one per QA pair) to disk
with open("val_llava_flat.json", "w", encoding="utf-8") as f:
    json.dump(flattened_data, f, indent=2, ensure_ascii=False)

print("✅ Converted and saved to val_llava_flat.json. Total samples:", len(flattened_data))


✅ Converted and saved to val_llava_flat.json. Total samples: 38103


In [6]:
# === Configuration ===
input_path = "val_llava_flat.json"
output_prefix = "val_"
output_dir = "val"

# Difficulty level mapping from numeric level to string label
level_mapping = {1: "Classification", 2: "Recognition", 3: "Reasoning", 4: "Summary"}

# === Step 1: Load the original dataset ===
# The file is written as UTF-8 with ensure_ascii=False, so it must be read as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# === Step 2: Organize data by different types and reformat ===
standard_by_level = defaultdict(list)
id_counters = defaultdict(int)
skipped_unmapped = 0  # entries whose level has no label in level_mapping

for item in data:
    try:
        # Extract the level number and convert it to a difficulty label
        level_num = item["conversations"][0].get("level")
        level_str = level_mapping.get(level_num)
        if not level_str:
            skipped_unmapped += 1
            continue

        # Extract the image filename (no path)
        image_name = item["image"].split("/")[-1]

        # Reformat the item into standard format with a custom ID;
        # the per-turn "level" field is dropped here.
        standard_item = {
            "id": f"{level_str.upper()}_{id_counters[level_str]:06d}",
            "image": image_name,
            "conversations": [
                {
                    "from": "human",
                    "value": item["conversations"][0]["value"]
                },
                {
                    "from": "gpt",
                    "value": item["conversations"][1]["value"]
                }
            ]
        }

        # Append to corresponding level list
        standard_by_level[level_str].append(standard_item)
        id_counters[level_str] += 1

    except Exception as e:
        print("Skipped problematic entry:", e)
        continue

# Report dropped entries instead of discarding them silently.
if skipped_unmapped:
    print(f"Skipped {skipped_unmapped} entries whose level is not in level_mapping")

# === Step 3: Save the reformatted data by difficulty level ===
# Make sure the output directory exists so a fresh run does not fail on open().
os.makedirs(output_dir, exist_ok=True)
for level_str, items in standard_by_level.items():
    output_path = f"{output_dir}/{output_prefix}{level_str}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=2)
    print(f"[✓] Exported: {output_path} ({len(items)} samples)")


[✓] Exported: val/val_Classification.json (9131 samples)
[✓] Exported: val/val_Recognition.json (11652 samples)
[✓] Exported: val/val_Reasoning.json (11586 samples)
[✓] Exported: val/val_Summary.json (5685 samples)


In [None]:
# === Path Configuration ===
json_path = "success_filtered.json"  # Input JSON file with nested QA format
source_dir = "/home/ne6131039/Desktop/TEM_DATAS/LLaVA Dataset/TEM_images"
target_dir = "/home/ne6131039/Desktop/LLaVA_train"

# False: only check which source images exist (no files are copied).
# True:  copy every image that exists in source_dir into target_dir.
copy_images = False

# Create the output directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

# === Step 1: Load the nested QA JSON given by json_path ===
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# === Step 2: Collect all unique image filenames from 'sub_image'
image_names = set()
for qa_list in data:
    if not isinstance(qa_list, list):
        continue
    for qa in qa_list:
        # Skip malformed entries (non-dict items, null or non-string sub_image)
        # instead of crashing on them.
        if not isinstance(qa, dict):
            continue
        image = qa.get("sub_image")
        if not isinstance(image, str):
            continue
        filename = image.strip().split("/")[-1]
        if filename:
            image_names.add(filename)

print(f"Total unique images to copy: {len(image_names)}")

# === Step 3: Check (and optionally copy) images from source_dir to target_dir
missing = []
copied = 0
# Sorted iteration keeps the order of `missing` stable between runs.
for img in sorted(image_names):
    src = os.path.join(source_dir, img)
    dst = os.path.join(target_dir, img)
    if not os.path.isfile(src):
        missing.append(img)
        continue
    if copy_images:
        shutil.copyfile(src, dst)
        copied += 1

# === Step 4: Summary (reports what actually happened)
found = len(image_names) - len(missing)
if copy_images:
    print(f"Copy completed: {copied} images copied successfully.")
else:
    print(f"Dry run: {found} source images found, nothing copied (set copy_images = True to copy).")
if missing:
    print(f"Missing {len(missing)} images. Examples: {missing[:5]}")

In [None]:
# # === Step 4: Filter out QA groups with any missing sub_image inside ===
# missing_set = set(missing)
# filtered_data = []

# for group in data:
#     if not isinstance(group, list) or not group:
#         continue

#     # Make sure none of the group's sub_images are in missing_set
#     group_images = {qa.get("sub_image", "").strip().split("/")[-1] for qa in group}
#     if not group_images & missing_set:
#         filtered_data.append(group)

# # === Step 5: Save filtered JSON ===
# with open("success_filtered.json", "w", encoding="utf-8") as f:
#     json.dump(filtered_data, f, indent=2, ensure_ascii=False)

# # === Final Summary ===
# print(f"Copy completed: {len(image_names) - len(missing)} images copied successfully.")
# if missing:
#     print(f"Missing {len(missing)} images. Examples: {missing[:5]}")
# print(f"Filtered JSON saved with {len(filtered_data)} image groups")