In [None]:
import tensorflow as tf
import gdown
import os
import random
import pandas as pd
from itertools import combinations
import zipfile
print(tf.__version__)
!python --version
print("TensorFlow version:", tf.__version__)
print(tf.config.list_physical_devices('GPU'))

In [None]:
# Google Drive ID of the dataset archive.
# NOTE(review): "file_id" looks like a placeholder; the recorded cell output
# shows a real Drive ID being fetched. Confirm before running.
file_id = "file_id"
ZIP_PATH = "texture_uni.zip"

# Skip the download when the archive is already on disk, so that
# Restart & Run All does not fetch it again.
if not os.path.exists(ZIP_PATH):
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", ZIP_PATH, quiet=False)
    # Some gdown versions signal failure by returning None rather than
    # raising; fail here with a clear message instead of a later zipfile error.
    if downloaded is None:
        raise RuntimeError(f"Download of Google Drive file '{file_id}' to {ZIP_PATH} failed")

# Unpack into the current working directory (an empty path is relative to cwd).
with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall("")

Downloading...
From (original): https://drive.google.com/uc?id=17wriWuCNfNfSDmR94jWScO03ItmWPmnv
From (redirected): https://drive.google.com/uc?id=17wriWuCNfNfSDmR94jWScO03ItmWPmnv&confirm=t&uuid=f6c2d543-0fe2-49eb-afd6-cb8d7116a76a
To: /content/texture_uni.zip
100%|██████████| 279M/279M [00:04<00:00, 66.5MB/s]


## Parquet Creation Testing - Updated

In [None]:
BASE_PATH = "texture_uni"
# Seed the global RNG so the pair sampling done later is repeatable.
random.seed(42)

# Maps writer_id -> {"N": [...], "F": [...]}, where each list holds the paths
# of that writer's sample folders whose names end in "_N" / "_F".
writer_data = {}

# os.listdir() returns entries in arbitrary order. Sorting keeps the list
# contents in a stable order, so the seeded random draws that index into
# these lists give the same pairs on every machine.
for writer_id in sorted(os.listdir(BASE_PATH)):
    writer_path = os.path.join(BASE_PATH, writer_id)
    if not os.path.isdir(writer_path):
        continue

    writer_data[writer_id] = {"N": [], "F": []}

    for sample_folder in sorted(os.listdir(writer_path)):
        # Normalise to forward slashes so stored paths are OS-independent.
        full_path = os.path.join(writer_path, sample_folder).replace("\\", "/")
        if sample_folder.endswith("_N"):
            writer_data[writer_id]["N"].append(full_path)
        elif sample_folder.endswith("_F"):
            writer_data[writer_id]["F"].append(full_path)

# Every writer directory found under BASE_PATH, in sorted order.
all_writers = list(writer_data.keys())
# Fixed writer-level split: 70 writer IDs for training and 14 for testing.
# The two lists share no IDs, so no writer contributes to both sets.
train_writers = ['W001', 'W002', 'W003', 'W004', 'W005', 'W006', 'W007', 'W009', 'W010', 'W011', 'W012', 'W015', 'W017', 'W018', 'W019', 'W020', 'W021', 'W022', 'W023', 'W024', 'W025', 'W028', 'W029', 'W030', 'W031', 'W033', 'W034', 'W035', 'W038', 'W039', 'W040', 'W041', 'W042', 'W043', 'W044', 'W045', 'W046', 'W047', 'W049', 'W050', 'W051', 'W052', 'W053', 'W054', 'W058', 'W059', 'W062', 'W063', 'W067', 'W068', 'W069', 'W071', 'W073', 'W074', 'W076', 'W077', 'W081', 'W085', 'W086', 'W087', 'W088', 'W089', 'W090', 'W091', 'W092', 'W093', 'W095', 'W096', 'W097', 'W100']
test_writers = ['W013', 'W016', 'W027', 'W032', 'W048', 'W064', 'W065', 'W078', 'W082', 'W083', 'W084', 'W094', 'W098', 'W101']

def generate_pairs(writers, mode="train", genuine_per_writer=13, impostor_per_pair=2):
    """Build a balanced set of genuine/impostor sample pairs and save it.

    Each pair is a tuple ``(sample_1_N, sample_1_F, sample_2_N, sample_2_F,
    label)`` of paths taken from the module-level ``writer_data``. A genuine
    pair (label 0) draws both samples from the same writer; an impostor pair
    (label 1) draws them from two different writers.

    The shuffled, balanced table (all genuine pairs plus at most the same
    number of impostor pairs) is written to ``uni_{mode}.csv`` and
    ``uni_{mode}.parquet`` in the current working directory.

    Args:
        writers: Iterable of writer IDs to draw from. IDs that are not keys of
            ``writer_data`` are ignored.
        mode: Tag used in the output file names and progress messages.
        genuine_per_writer: Number of genuine pairs drawn per writer that has
            at least two "N" and two "F" samples.
        impostor_per_pair: Number of impostor pairs drawn per writer
            combination.

    Returns:
        ``(genuine_pairs, impostor_pairs)``. ``impostor_pairs`` is the full
        generated list and may be longer than the part written to disk.
    """
    # Keep only writers that were actually found, preserving the given order.
    selected_data = {w: writer_data[w] for w in writers if w in writer_data}

    # Genuine pairs: two distinct "N" and two distinct "F" samples per draw.
    genuine_pairs = []
    for samples in selected_data.values():
        if len(samples["N"]) < 2 or len(samples["F"]) < 2:
            continue
        for _ in range(genuine_per_writer):
            s1_N, s2_N = random.sample(samples["N"], 2)
            s1_F, s2_F = random.sample(samples["F"], 2)
            genuine_pairs.append((s1_N, s1_F, s2_N, s2_F, 0))

    print(f"{mode.title()} - Genuine pairs: {len(genuine_pairs)}")

    # Impostor pairs. Combine only the writers present in selected_data;
    # combining the raw `writers` argument would raise KeyError below for any
    # ID that is missing from writer_data.
    impostor_pairs = []
    all_writer_pairs = list(combinations(selected_data, 2))
    random.shuffle(all_writer_pairs)

    for w1, w2 in all_writer_pairs:
        first, second = selected_data[w1], selected_data[w2]
        if first["N"] and first["F"] and second["N"] and second["F"]:
            for _ in range(impostor_per_pair):
                s1_N = random.choice(first["N"])
                s1_F = random.choice(first["F"])
                s2_N = random.choice(second["N"])
                s2_F = random.choice(second["F"])
                impostor_pairs.append((s1_N, s1_F, s2_N, s2_F, 1))
        # Stop once there are enough impostors to balance the genuine pairs.
        if len(impostor_pairs) > len(genuine_pairs):
            break

    print(f"{mode.title()} - Impostor pairs: {len(impostor_pairs)}")

    # Balance the classes by truncating the impostors, then mix the rows.
    all_pairs = genuine_pairs + impostor_pairs[:len(genuine_pairs)]
    random.shuffle(all_pairs)

    df = pd.DataFrame(all_pairs, columns=["sample_1_N", "sample_1_F", "sample_2_N", "sample_2_F", "label"])
    df.to_csv(f"uni_{mode}.csv", index=False)
    df.to_parquet(f"uni_{mode}.parquet", index=False, compression="snappy")

    return genuine_pairs, impostor_pairs

# Bind the returned pair lists to names; a bare call on the last line would
# make the notebook echo the entire (large) tuple of pairs as cell output.
train_genuine, train_impostor = generate_pairs(train_writers, mode="train")
test_genuine, test_impostor = generate_pairs(test_writers, mode="test")