In [None]:
import tensorflow as tf
import pandas as pd
import os
import gdown
import zipfile
import random
from itertools import combinations
# Environment report: TensorFlow version, interpreter version, visible GPUs.
print(tf.__version__)
# IPython shell escape: this line is only valid inside a notebook/IPython
# session and reports the `python` found on PATH.
!python --version
print("TensorFlow version:", tf.__version__)
# Prints the GPU devices TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))

In [None]:
# Fetch the dataset archive from Google Drive, then unpack it.
# NOTE(review): "file_id" looks like a placeholder for the real Drive id.
file_id = "file_id"
archive_name = "Data_V9_ViT.zip"
download_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(download_url, archive_name, quiet=False)

# An empty target path makes members extract relative to the working directory.
with zipfile.ZipFile(archive_name, mode='r') as archive:
    archive.extractall("")

## Parquet Creation Testing - Updated

In [None]:
BASE_PATH = "Data_V9_ViT/texture_uni"
random.seed(42)

# Step 1: index every sample per writer, grouped by writing speed.
# Layout walked below: BASE_PATH/<writer_id>/<sample>_<speed>/<name>.pt
# Result: writer_data[writer_id] == {"N": [paths...], "F": [paths...]},
# where each path has its ".pt" extension stripped.
writer_data = {}

# os.listdir returns entries in arbitrary order. Every listing is sorted so
# that the lists built here -- and therefore the seeded random draws taken
# from them later -- are identical from run to run and machine to machine.
for writer_id in sorted(os.listdir(BASE_PATH)):
    writer_path = os.path.join(BASE_PATH, writer_id)
    if not os.path.isdir(writer_path):
        continue

    writer_data[writer_id] = {"N": [], "F": []}

    for sample in sorted(os.listdir(writer_path)):
        # sample = e.g., S01_F
        sample_parts = sample.split("_")
        if len(sample_parts) != 2:
            continue
        speed = sample_parts[1]  # N or F
        # Skip unexpected suffixes instead of failing with KeyError below.
        if speed not in writer_data[writer_id]:
            continue

        sample_path = os.path.join(writer_path, sample)
        if not os.path.isdir(sample_path):
            continue

        for img in sorted(os.listdir(sample_path)):
            if img.endswith(".pt"):
                # Strip only the trailing extension; str.replace would also
                # remove any ".pt" occurring earlier in the file name.
                stem = img[: -len(".pt")]
                rel_path = os.path.join(BASE_PATH, writer_id, sample, stem)
                writer_data[writer_id][speed].append(rel_path)

# Step 2: Split writers
# The split is fixed by hand: 70 train, 15 test and 15 validation writer ids.
# Nothing here checks these ids against the keys of writer_data.
# NOTE(review): all_writers is not referenced again in the visible code.
all_writers = list(writer_data.keys())
train_writers = ['W001', 'W002', 'W003', 'W004', 'W005', 'W006', 'W007', 'W009', 'W010', 'W011', 'W012', 'W015', 'W017', 'W018', 'W019', 'W020', 'W021', 'W022', 'W023', 'W024', 'W025', 'W028', 'W029', 'W030', 'W031', 'W033', 'W034', 'W035', 'W038', 'W039', 'W040', 'W041', 'W042', 'W043', 'W044', 'W045', 'W046', 'W047', 'W049', 'W050', 'W051', 'W052', 'W053', 'W054', 'W058', 'W059', 'W062', 'W063', 'W067', 'W068', 'W069', 'W071', 'W073', 'W074', 'W076', 'W077', 'W081', 'W085', 'W086', 'W087', 'W088', 'W089', 'W090', 'W091', 'W092', 'W093', 'W095', 'W096', 'W097', 'W100']
test_writers = ['W013', 'W016', 'W027', 'W032', 'W048', 'W057', 'W064', 'W065', 'W078', 'W082', 'W083', 'W084', 'W094', 'W098', 'W101']
val_writers = ['W008', 'W014', 'W026', 'W036', 'W037', 'W055', 'W060', 'W061', 'W066', 'W070', 'W072', 'W075', 'W079', 'W080', 'W099']

def _has_both_speeds(writer):
    """Return True when *writer* has at least one "N" and one "F" sample."""
    samples = writer_data[writer]
    return bool(samples["N"]) and bool(samples["F"])


def _sample_genuine_pairs(writers, pairs_per_writer):
    """Draw same-writer rows (label 0), *pairs_per_writer* per usable writer."""
    genuine_pairs = []
    for writer in writers:
        normal = writer_data[writer]["N"]
        fast = writer_data[writer]["F"]
        # random.sample needs two distinct items from each speed.
        if len(normal) < 2 or len(fast) < 2:
            continue

        for _ in range(pairs_per_writer):
            s1_N, s2_N = random.sample(normal, 2)
            s1_F, s2_F = random.sample(fast, 2)
            genuine_pairs.append((s1_N, s1_F, s2_N, s2_F, 0))
    return genuine_pairs


def _order_writer_pairs(writers):
    """Return shuffled eligible writer pairs, coverage pairs first."""
    usable = [w for w in writers if _has_both_speeds(w)]
    all_writer_pairs = list(combinations(usable, 2))
    random.shuffle(all_writer_pairs)

    # A pair goes into the coverage group when it introduces at least one
    # writer not seen yet; placing that group first means every usable writer
    # appears before any pair is drawn purely as filler.
    covered = set()
    coverage_pairs = []
    remaining_pairs = []
    for pair in all_writer_pairs:
        w1, w2 = pair
        if w1 not in covered or w2 not in covered:
            coverage_pairs.append(pair)
            covered.update(pair)
        else:
            remaining_pairs.append(pair)
    return coverage_pairs + remaining_pairs


def _sample_impostor_pairs(writers, target_count):
    """Draw *target_count* different-writer rows (label 1)."""
    if target_count <= 0:
        return []

    ordered_pairs = _order_writer_pairs(writers)
    if not ordered_pairs:
        # Without this guard the sampling loop could never reach its target.
        raise ValueError(
            "Cannot build impostor pairs: need at least two writers that "
            "each have both N and F samples."
        )

    impostor_pairs = []
    for i in range(target_count):
        # Wrap around when more rows are needed than there are writer pairs.
        w1, w2 = ordered_pairs[i % len(ordered_pairs)]
        s1_N = random.choice(writer_data[w1]["N"])
        s1_F = random.choice(writer_data[w1]["F"])
        s2_N = random.choice(writer_data[w2]["N"])
        s2_F = random.choice(writer_data[w2]["F"])
        impostor_pairs.append((s1_N, s1_F, s2_N, s2_F, 1))
    return impostor_pairs


def generate_pairs(writers, mode="train", pairs_per_writer=20):
    """Build a balanced genuine/impostor pair table and write it to disk.

    Each row is ``(sample_1_N, sample_1_F, sample_2_N, sample_2_F, label)``.
    Genuine rows (label 0) take both samples from the same writer; impostor
    rows (label 1) take them from two different writers. As many impostor
    rows are produced as genuine rows.

    The shuffled table is written to ``uni_{mode}.csv`` and
    ``uni_{mode}.parquet`` (snappy-compressed) in the working directory.
    Sample paths are read from the module-level ``writer_data`` mapping, and
    draws use the global ``random`` state.

    Args:
        writers: Writer ids to draw from; each must be a key of
            ``writer_data``.
        mode: Split name, used in the output file names and the summary line.
        pairs_per_writer: Genuine rows drawn per writer that has at least two
            "N" and two "F" samples.

    Returns:
        Tuple ``(genuine_pairs, impostor_pairs)`` of row lists, unshuffled.

    Raises:
        KeyError: If a writer id is missing from ``writer_data``.
        ValueError: If genuine rows exist but fewer than two writers have
            both "N" and "F" samples, so no impostor row can be formed.
    """
    genuine_pairs = _sample_genuine_pairs(writers, pairs_per_writer)
    impostor_pairs = _sample_impostor_pairs(writers, len(genuine_pairs))

    print(f"{mode.title()} Set — Genuine: {len(genuine_pairs)}, Impostor: {len(impostor_pairs)}")

    all_pairs = genuine_pairs + impostor_pairs
    random.shuffle(all_pairs)

    df_all = pd.DataFrame(all_pairs, columns=["sample_1_N", "sample_1_F", "sample_2_N", "sample_2_F", "label"])
    df_all.to_csv(f"uni_{mode}.csv", index=False)
    df_all.to_parquet(
        f"uni_{mode}.parquet",
        index=False,
        compression="snappy"
    )
    return genuine_pairs, impostor_pairs

# Build one pair table per split. The validation split uses val_writers;
# previously the test split was generated twice and no validation table
# was ever written.
generate_pairs(train_writers, mode="train")
generate_pairs(test_writers, mode="test")
generate_pairs(val_writers, mode="val")
