In [None]:
import tensorflow as tf
import pandas as pd
import gdown
import zipfile
import os
import itertools
import random
from itertools import combinations
print(tf.__version__)
!python --version
print("TensorFlow version:", tf.__version__)
print(tf.config.list_physical_devices('GPU'))

In [None]:
# Fetch the dataset archive from Google Drive and unpack it into the working directory.
file_id = "file_id"  # NOTE(review): looks like a placeholder — set the real Drive file id
archive_path = "binary.zip"

# Skip the download when the archive is already on disk so that
# Restart & Run All stays cheap; delete the file to force a fresh download.
if not os.path.exists(archive_path):
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", archive_path, quiet=False)
    if downloaded is None:
        # Fail here with a clear message instead of a confusing error from zipfile below.
        raise RuntimeError(f"Download of {archive_path} failed; check file_id and sharing permissions")

with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

## Parquet Creation Testing

In [3]:
def path_from_name(writer, img_name):
    """Return ``"<writer>/<p1>_<p2>"`` built from an underscore-separated image name.

    ``img_name`` is split on ``'_'``; the second and third pieces form the
    sub-folder component (e.g. ``'W083_S02_F'`` -> ``'S02_F'``). The first piece
    and any pieces after the third are ignored.

    Raises:
        IndexError: if ``img_name`` has fewer than three underscore-separated pieces.
    """
    tokens = img_name.split('_')
    second, third = tokens[1], tokens[2]
    return "{}/{}_{}".format(writer, second, third)

In [None]:
# Build a balanced verification set for the held-out writers:
#   label 0 -> both samples come from the same writer
#   label 1 -> samples come from two different writers
BASE_DIR = "binary"
test_writers = ['W013', 'W016', 'W027', 'W032', 'W048', 'W056', 'W064', 'W065', 'W078', 'W082', 'W083', 'W084', 'W094', 'W098', 'W101']
random.seed(42)  # fixed seed so the sampled different-writer pairs are reproducible

same_writer_pairs = []
diff_writer_pairs = []
writer_images = {}

# Collect the .png stems of every writer; writers with fewer than two images are skipped.
for writer in test_writers:
    path = os.path.join(BASE_DIR, writer)
    if not os.path.isdir(path):
        continue
    images = [f[:-4] for f in os.listdir(path) if f.endswith(".png")]
    if len(images) >= 2:
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        writer_images[writer] = sorted(images)

# Every unordered pair of images of one writer is a positive (same-writer) example.
for writer, imgs in writer_images.items():
    same_writer_pairs.extend((writer, a, writer, b) for a, b in itertools.combinations(imgs, 2))

# Sample only among writers that actually have images, so no draw is wasted.
eligible_writers = list(writer_images)

# Number of distinct unordered cross-writer pairs that exist at all:
# (N^2 - sum(n_i^2)) / 2 with n_i images per writer and N images in total.
image_counts = [len(imgs) for imgs in writer_images.values()]
max_diff_pairs = (sum(image_counts) ** 2 - sum(c * c for c in image_counts)) // 2

# Never ask for more negatives than exist, otherwise the rejection-sampling
# loop below could never terminate.
needed_diff_pairs = min(len(same_writer_pairs), max_diff_pairs)
if needed_diff_pairs < len(same_writer_pairs):
    print(f"Warning: only {max_diff_pairs} distinct different-writer pairs exist; dataset will be unbalanced")

existing_diff_pairs = set()

while len(diff_writer_pairs) < needed_diff_pairs:
    w1, w2 = random.sample(eligible_writers, 2)
    i1 = random.choice(writer_images[w1])
    i2 = random.choice(writer_images[w2])
    # Order-independent key: (A, B) and (B, A) are the same pair and must not both be kept.
    key = frozenset(((w1, i1), (w2, i2)))
    if key not in existing_diff_pairs:
        existing_diff_pairs.add(key)
        diff_writer_pairs.append((w1, i1, w2, i2))

# One record per pair; same-writer rows first, the frame is shuffled below.
all_rows = [
    {"sample_1": path_from_name(w1, i1), "sample_2": path_from_name(w2, i2), "label": label}
    for label, pairs in ((0, same_writer_pairs), (1, diff_writer_pairs))
    for w1, i1, w2, i2 in pairs
]

df = pd.DataFrame(all_rows)

# Shuffle deterministically so the labels are interleaved.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.to_csv("test_writer_combinations.csv", index=False)
df.to_parquet("test_writer_combinations.parquet", index=False)

print(f"Same-writer pairs: {len(same_writer_pairs)}")
print(f"Different-writer pairs: {len(diff_writer_pairs)}")


## CVL Dataset

In [None]:
# Fetch the CVL archive from Google Drive and unpack it into the working directory.
file_id = "file_id"  # NOTE(review): looks like a placeholder — set the real Drive file id
archive_path = "binary_cvl.zip"

# Skip the download when the archive is already on disk so that
# Restart & Run All stays cheap; delete the file to force a fresh download.
if not os.path.exists(archive_path):
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", archive_path, quiet=False)
    if downloaded is None:
        # Fail here with a clear message instead of a confusing error from zipfile below.
        raise RuntimeError(f"Download of {archive_path} failed; check file_id and sharing permissions")

with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
def path_from_name(writer, img_name):
    """Return ``"<writer>/<p1>_<p2>"`` built from an underscore-separated image name.

    ``img_name`` is split on ``'_'``; the second and third pieces form the
    sub-folder component. The first piece and any pieces after the third are
    ignored. The function has no side effects: it does not print, so calling
    it once per pair does not flood the cell output.

    Raises:
        IndexError: if ``img_name`` has fewer than three underscore-separated pieces.
    """
    parts = img_name.split('_')
    subfolder = f"{parts[1]}_{parts[2]}"
    return f"{writer}/{subfolder}"

In [None]:
# Build a balanced verification set for the CVL test writers:
#   label 0 -> both images come from the same writer
#   label 1 -> images come from two different writers
BASE_DIR = "binary_cvl"
test_writers = ['CVL0717', 'CVL0309', 'CVL1120', 'CVL0215', 'CVL0095', 'CVL0245', 'CVL0240', 'CVL0391', 'CVL0176', 'CVL0020', 'CVL0524', 'CVL0482', 'CVL0709', 'CVL0528', 'CVL0477', 'CVL0218', 'CVL0181', 'CVL0080', 'CVL0963', 'CVL0573', 'CVL0157', 'CVL0216', 'CVL0705', 'CVL0220', 'CVL0403', 'CVL0058', 'CVL0540', 'CVL0230', 'CVL0256', 'CVL0050', 'CVL0198', 'CVL1133', 'CVL0096', 'CVL0153', 'CVL0183', 'CVL0394', 'CVL0098', 'CVL0935', 'CVL0582', 'CVL0291', 'CVL0411', 'CVL0313', 'CVL0232', 'CVL0585', 'CVL0244', 'CVL0028', 'CVL0023']
random.seed(42)  # fixed seed so the sampled different-writer pairs are reproducible

same_writers = []
writer_images = {}

# Collect the .png stems of every writer and emit all same-writer pairs.
for writer in test_writers:
    path = os.path.join(BASE_DIR, writer)
    if not os.path.isdir(path):
        continue
    # os.listdir returns entries in arbitrary order; sort so the seed really
    # makes the run reproducible.
    images = sorted(f[:-4] for f in os.listdir(path) if f.endswith(".png"))
    if images:
        writer_images[writer] = images
    same_writers.extend((a, b, 0) for a, b in itertools.combinations(images, 2))

writer_list = list(writer_images.keys())

# Upper bound on the number of distinct cross-writer pairs:
# (N^2 - sum(n_i^2)) / 2 with n_i images per writer and N images in total.
# It is 0 when fewer than two writers have images, which also keeps
# random.sample below from being called on a too-short list.
# NOTE(review): the bound is exact only if image names are unique across writers — confirm.
image_counts = [len(imgs) for imgs in writer_images.values()]
max_diff_pairs = (sum(image_counts) ** 2 - sum(c * c for c in image_counts)) // 2

used_writers = set()
diff_writer_pairs = set()   # dedupe lookup only; never iterated (set order is not stable)
coverage_pairs = []         # pairs that brought a not-yet-seen writer into the set
extra_pairs = []            # all remaining accepted pairs, in draw order

# Keep sampling until there are as many negatives as positives AND every writer
# appears at least once, or until no distinct pair is left.
while len(diff_writer_pairs) < max_diff_pairs and (
    len(diff_writer_pairs) < len(same_writers) or len(used_writers) < len(writer_list)
):
    w1, w2 = random.sample(writer_list, 2)

    img1 = random.choice(writer_images[w1])
    img2 = random.choice(writer_images[w2])

    # Sorted tuple: (a, b) and (b, a) are the same pair.
    pair = tuple(sorted((img1, img2)))
    if pair in diff_writer_pairs:
        continue

    diff_writer_pairs.add(pair)
    if w1 in used_writers and w2 in used_writers:
        extra_pairs.append(pair)
    else:
        coverage_pairs.append(pair)
        used_writers.update([w1, w2])

# Truncate to the number of positives, keeping the coverage pairs first so that
# the truncation does not throw away the writer coverage the loop worked for.
selected_diff_pairs = (coverage_pairs + extra_pairs)[:len(same_writers)]
diff_writer_labeled = [(a, b, 1) for a, b in selected_diff_pairs]

all_pairs = same_writers + diff_writer_labeled

df_pairs = pd.DataFrame(all_pairs, columns=['sample_1', 'sample_2', 'label'])
# Shuffle deterministically so the labels are interleaved.
df_pairs = df_pairs.sample(frac=1, random_state=42).reset_index(drop=True)
df_pairs.to_csv('test_writer_pairs_cvl.csv', index=False)
df_pairs.to_parquet('test_writer_pairs_cvl.parquet', index=False)

print(f"Same-writer pairs: {len(same_writers)}")
# Report the number of negatives actually written, not the number sampled.
print(f"Different-writer pairs: {len(diff_writer_labeled)}")


In [None]:
# Gather the distinct second path component found in either sample column
# of the CVL test parquet.
# NOTE(review): presumably that component is the writer id — confirm against the file's path layout.
df = pd.read_parquet('cvl_test.parquet')
col1 = df['sample_1']
col2 = df['sample_2']
writers = set()
for sample_path in itertools.chain(col1, col2):
    # Paths are '/'-separated; index 1 is the component being collected.
    writers.add(sample_path.split('/')[1])

## IAM Dataset

In [None]:
# Fetch the IAM archive from Google Drive and unpack it into the working directory.
file_id = "file_id"  # NOTE(review): looks like a placeholder — set the real Drive file id
archive_path = "binary_iam.zip"

# Skip the download when the archive is already on disk so that
# Restart & Run All stays cheap; delete the file to force a fresh download.
if not os.path.exists(archive_path):
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", archive_path, quiet=False)
    if downloaded is None:
        # Fail here with a clear message instead of a confusing error from zipfile below.
        raise RuntimeError(f"Download of {archive_path} failed; check file_id and sharing permissions")

with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Build a balanced verification set from every writer folder of the IAM dump:
#   label 0 -> both images come from the same writer
#   label 1 -> images come from two different writers
BASE_DIR = "binary_iam"
test_writers = []
random.seed(42)  # fixed seed so sub-sampling and pair sampling are reproducible

# os.listdir returns entries in arbitrary order; sort so the seed really makes
# the run reproducible.
writers = sorted(os.listdir(BASE_DIR))
print(writers)

test_writers.extend(writers)

same_writers = []
writer_images = {}

# Collect the .png stems of every writer and emit all same-writer pairs.
for writer in test_writers:
    path = os.path.join(BASE_DIR, writer)
    if not os.path.isdir(path):
        continue
    images = sorted(f[:-4] for f in os.listdir(path) if f.endswith(".png"))
    if not images:
        continue

    if writer == 'IAM000':
        # IAM000 is capped at 6 randomly chosen images.
        # NOTE(review): presumably to stop its pairs from dominating the set — confirm.
        images = random.sample(images, min(6, len(images)))
    writer_images[writer] = images
    same_writers.extend((a, b, 0) for a, b in itertools.combinations(images, 2))

writer_list = list(writer_images.keys())

# Upper bound on the number of distinct cross-writer pairs:
# (N^2 - sum(n_i^2)) / 2 with n_i images per writer and N images in total.
# It is 0 when fewer than two writers have images, which also keeps
# random.sample below from being called on a too-short list.
# NOTE(review): the bound is exact only if image names are unique across writers — confirm.
image_counts = [len(imgs) for imgs in writer_images.values()]
max_diff_pairs = (sum(image_counts) ** 2 - sum(c * c for c in image_counts)) // 2

used_writers = set()
diff_writer_pairs = set()   # dedupe lookup only; never iterated (set order is not stable)
coverage_pairs = []         # pairs that brought a not-yet-seen writer into the set
extra_pairs = []            # all remaining accepted pairs, in draw order

# Keep sampling until there are as many negatives as positives AND every writer
# appears at least once, or until no distinct pair is left.
while len(diff_writer_pairs) < max_diff_pairs and (
    len(diff_writer_pairs) < len(same_writers) or len(used_writers) < len(writer_list)
):
    w1, w2 = random.sample(writer_list, 2)

    img1 = random.choice(writer_images[w1])
    img2 = random.choice(writer_images[w2])

    # Sorted tuple: (a, b) and (b, a) are the same pair.
    pair = tuple(sorted((img1, img2)))
    if pair in diff_writer_pairs:
        continue

    diff_writer_pairs.add(pair)
    if w1 in used_writers and w2 in used_writers:
        extra_pairs.append(pair)
    else:
        coverage_pairs.append(pair)
        used_writers.update([w1, w2])

# Truncate to the number of positives, keeping the coverage pairs first so that
# the truncation does not throw away the writer coverage the loop worked for.
selected_diff_pairs = (coverage_pairs + extra_pairs)[:len(same_writers)]
diff_writer_labeled = [(a, b, 1) for a, b in selected_diff_pairs]

all_pairs = same_writers + diff_writer_labeled

df_pairs = pd.DataFrame(all_pairs, columns=['sample_1', 'sample_2', 'label'])
# Shuffle deterministically so the labels are interleaved.
df_pairs = df_pairs.sample(frac=1, random_state=42).reset_index(drop=True)
df_pairs.to_csv('test_writer_pairs_iam.csv', index=False)
df_pairs.to_parquet('test_writer_pairs_iam.parquet', index=False)

print(f"Same-writer pairs: {len(same_writers)}")
print(f"Different-writer pairs: {len(diff_writer_labeled)}")