In [None]:
import csv
import os
import random
import shutil
import zipfile
from collections import defaultdict
from itertools import combinations

import gdown
import pandas as pd
import pyarrow
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): "file_id" looks like a placeholder; put the real Google Drive file id here.
file_id = "file_id"

# Depending on the gdown version, a failed download either raises or prints the
# error and returns None. Without this check a failure would fall through to
# ZipFile and could silently extract a stale texture_uni.zip from an earlier run.
downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", "texture_uni.zip", quiet=False)
if downloaded is None:
    raise RuntimeError(f"Download of Google Drive file {file_id!r} failed; texture_uni.zip was not refreshed")

# Unpack the archive into the folder that the university-dataset cell scans.
with zipfile.ZipFile("texture_uni.zip", 'r') as zip_ref:
    zip_ref.extractall("texture_uni")

## Creating Combinations - University Dataset
#### Output - uni_train, uni_val, uni_test (each written as .csv and .parquet)

In [None]:
BASE_PATH = "texture_uni"
random.seed(42)

# writer id -> sample folder name -> list of texture paths with ".png" stripped.
writer_data = {}

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Every
# listing is sorted so that the dict/list order, and therefore the seeded
# train/val/test split and the seeded random sampling, is the same on every
# machine.
for writer_id in sorted(os.listdir(BASE_PATH)):
    writer_path = os.path.join(BASE_PATH, writer_id)  # e.g. texture_uni/W001
    if not os.path.isdir(writer_path):
        continue
    writer_data[writer_id] = {}
    for sample in sorted(os.listdir(writer_path)):
        sample_path = os.path.join(writer_path, sample)  # e.g. texture_uni/W001/S01_F
        if not os.path.isdir(sample_path):
            continue
        writer_data[writer_id][sample] = []
        for img in sorted(os.listdir(sample_path)):
            if img.endswith(".png"):
                # splitext drops only the trailing extension; str.replace would
                # also remove a ".png" that happens to occur inside the name.
                stem = os.path.splitext(img)[0]
                rel_path = os.path.join(BASE_PATH, writer_id, sample, stem)  # e.g. texture_uni/W001/S01_F/W001_S01_F_T1
                writer_data[writer_id][sample].append(rel_path)

# Writer-disjoint split: 30 % of the writers are held out first, and that
# held-out part is then halved into validation and test writers.
all_writers = list(writer_data)

train_writers, val_test_writers = train_test_split(all_writers, test_size=0.3, random_state=42)
val_writers, test_writers = train_test_split(val_test_writers, test_size=0.5, random_state=42)

def generate_pairs(writers, mode="train", data=None):
    """Build and save a balanced list of genuine/impostor texture pairs.

    Genuine pairs (label 0) join two textures of the same writer, taken either
    from one sample or from two different samples; writers with fewer than two
    samples contribute no genuine pairs. Impostor pairs (label 1) join textures
    of two different writers and are drawn at random (global ``random`` state)
    until there are as many impostor pairs as genuine ones, or until every
    distinct impostor pair has been used, whichever comes first.

    The shuffled pairs are written to ``uni_{mode}.csv`` and
    ``uni_{mode}.parquet`` with columns ``sample_1``, ``sample_2``, ``label``.

    Args:
        writers: list of writer ids to build pairs from.
        mode: split name used in the output file names and the summary line.
        data: mapping writer id -> sample name -> list of texture paths.
            Defaults to the module-level ``writer_data``; passing it explicitly
            avoids depending on whichever cell last rebound that global.

    Returns:
        None.
    """
    if data is None:
        data = writer_data

    genuine_pairs = []
    for writer in writers:
        samples = list(data[writer].keys())
        if len(samples) < 2:
            continue

        # Two textures cut from the same sample.
        for sample in samples:
            for tex_a, tex_b in combinations(data[writer][sample], 2):
                genuine_pairs.append((tex_a, tex_b, 0))

        # Two textures cut from two different samples of the same writer.
        for sample_a, sample_b in combinations(samples, 2):
            for img1 in data[writer][sample_a]:
                for img2 in data[writer][sample_b]:
                    genuine_pairs.append((img1, img2, 0))

    # Only writers/samples that actually hold textures can be sampled from;
    # random.choice() raises IndexError on an empty list.
    usable = {}
    for writer in writers:
        non_empty = [s for s in data[writer].keys() if data[writer][s]]
        if non_empty:
            usable[writer] = non_empty
    candidates = list(usable)

    # Number of distinct cross-writer pairs: sum over writer pairs of n_i * n_j.
    # Assumes a texture path belongs to a single writer (paths produced by the
    # directory scan embed the writer id). Without this cap the rejection loop
    # below never ends when fewer impostor pairs exist than genuine pairs, and
    # random.sample() raises when fewer than two writers are available.
    per_writer = [len({t for s in usable[w] for t in data[w][s]}) for w in candidates]
    total = sum(per_writer)
    max_impostors = (total * total - sum(n * n for n in per_writer)) // 2
    target = min(len(genuine_pairs), max_impostors)
    if target < len(genuine_pairs):
        print(
            f"{mode.title()} Set — only {max_impostors} distinct impostor pairs exist; "
            f"cannot match {len(genuine_pairs)} genuine pairs"
        )

    impostor_pairs = []
    impostor_set = set()
    while len(impostor_set) < target:
        w1, w2 = random.sample(candidates, 2)
        sample1 = random.choice(usable[w1])
        sample2 = random.choice(usable[w2])
        img1 = random.choice(data[w1][sample1])
        img2 = random.choice(data[w2][sample2])

        # Order-insensitive key so (a, b) and (b, a) count as the same pair.
        pair = tuple(sorted([img1, img2]))
        if pair not in impostor_set:
            impostor_set.add(pair)
            impostor_pairs.append((img1, img2, 1))

    print(f"{mode.title()} Set — Genuine: {len(genuine_pairs)}, Impostor: {len(impostor_pairs)}")
    all_pairs = genuine_pairs + impostor_pairs
    random.shuffle(all_pairs)

    df = pd.DataFrame(all_pairs, columns=["sample_1", "sample_2", "label"])
    df.to_csv(f"uni_{mode}.csv", index=False)
    df.to_parquet(
        f"uni_{mode}.parquet",
        index=False,
        compression="snappy"
    )

# Write the university pair files for each split, in train -> val -> test order.
for split_name, split_writers in (
    ("train", train_writers),
    ("val", val_writers),
    ("test", test_writers),
):
    generate_pairs(split_writers, mode=split_name)

## Creating Combinations - CVL Dataset

In [None]:
random.seed(42)

# NOTE(review): nothing in the visible notebook downloads or extracts this
# folder; it has to exist before this cell runs.
CVL_PATH = "texture_cvl"
# Not referenced anywhere else in the visible notebook; kept as in the original.
same_writer_pairs = []
diff_writer_pairs = []

writer_data = defaultdict(lambda: defaultdict(list))  # writer_id -> sample_number -> [file paths]

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Every
# listing is sorted so that the seeded split and the seeded random sampling
# give the same result on every machine.
for writer_id in sorted(os.listdir(CVL_PATH)):
    writer_path = os.path.join(CVL_PATH, writer_id)  # e.g. texture_cvl/CVL0001
    if not os.path.isdir(writer_path):
        continue

    for sample_folder in sorted(os.listdir(writer_path)):
        sample_path = os.path.join(writer_path, sample_folder)  # e.g. texture_cvl/CVL0001/CVL0001_1
        if not os.path.isdir(sample_path):
            continue

        for file in sorted(os.listdir(sample_path)):  # e.g. CVL0001_1_T1.png
            if file.endswith(".png"):
                parts = file.split("_")  # e.g. ["CVL0001", "1", "T1.png"]
                sample_num = parts[1]  # e.g. "1"
                # splitext removes only the trailing extension; str.replace on
                # the full path would also strip ".png" from directory names.
                file_path_no_ext = os.path.splitext(os.path.join(sample_path, file))[0]  # e.g. texture_cvl/CVL0001/CVL0001_1/CVL0001_1_T1
                writer_data[writer_id][sample_num].append(file_path_no_ext)

# Writer-disjoint split: 30 % of the writers are held out first, and that
# held-out part is then halved into validation and test writers.
all_writers = list(writer_data)

train_writers, val_test_writers = train_test_split(all_writers, test_size=0.3, random_state=42)
val_writers, test_writers = train_test_split(val_test_writers, test_size=0.5, random_state=42)

def generate_pairs(writers, mode="train", data=None):
    """Build and save a balanced list of genuine/impostor texture pairs (CVL).

    Genuine pairs (label 0) join two textures of the same writer, taken either
    from one sample or from two different samples. Impostor pairs (label 1)
    join textures of two different writers and are drawn at random (global
    ``random`` state) until there are as many impostor pairs as genuine ones,
    or until every distinct impostor pair has been used, whichever comes first.

    The shuffled pairs are written to ``cvl_{mode}.csv`` and
    ``cvl_{mode}.parquet`` with columns ``sample_1``, ``sample_2``, ``label``.

    Args:
        writers: list of writer ids to build pairs from.
        mode: split name used in the output file names and the summary line.
        data: mapping writer id -> sample number -> list of texture paths.
            Defaults to the module-level ``writer_data``; passing it explicitly
            avoids depending on whichever cell last rebound that global.

    Returns:
        None.
    """
    if data is None:
        data = writer_data

    genuine_pairs = []
    for writer in writers:
        samples = list(data[writer].keys())

        # Two textures cut from the same sample.
        for sample_num in samples:
            for tex_a, tex_b in combinations(data[writer][sample_num], 2):
                genuine_pairs.append((tex_a, tex_b, 0))

        # Two textures cut from two different samples of the same writer.
        for s1, s2 in combinations(samples, 2):
            for f1 in data[writer][s1]:
                for f2 in data[writer][s2]:
                    genuine_pairs.append((f1, f2, 0))

    # Only writers/samples that actually hold textures can be sampled from;
    # random.choice() raises IndexError on an empty list.
    usable = {}
    for writer in writers:
        non_empty = [s for s in data[writer].keys() if data[writer][s]]
        if non_empty:
            usable[writer] = non_empty
    candidates = list(usable)

    # Number of distinct cross-writer pairs: sum over writer pairs of n_i * n_j.
    # Assumes a texture path belongs to a single writer (paths produced by the
    # directory scan embed the writer id). Without this cap the rejection loop
    # below never ends when fewer impostor pairs exist than genuine pairs, and
    # random.sample() raises when fewer than two writers are available.
    per_writer = [len({t for s in usable[w] for t in data[w][s]}) for w in candidates]
    total = sum(per_writer)
    max_impostors = (total * total - sum(n * n for n in per_writer)) // 2
    target = min(len(genuine_pairs), max_impostors)
    if target < len(genuine_pairs):
        print(
            f"{mode.title()} Set — only {max_impostors} distinct impostor pairs exist; "
            f"cannot match {len(genuine_pairs)} genuine pairs"
        )

    impostor_pairs = []
    impostor_set = set()
    while len(impostor_set) < target:
        w1, w2 = random.sample(candidates, 2)
        sample1 = random.choice(usable[w1])
        sample2 = random.choice(usable[w2])
        img1 = random.choice(data[w1][sample1])
        img2 = random.choice(data[w2][sample2])
        # Order-insensitive key so (a, b) and (b, a) count as the same pair.
        pair = tuple(sorted([img1, img2]))
        if pair not in impostor_set:
            impostor_set.add(pair)
            impostor_pairs.append((img1, img2, 1))  # Label 1 for different writers

    all_pairs = genuine_pairs + impostor_pairs
    random.shuffle(all_pairs)

    df = pd.DataFrame(all_pairs, columns=["sample_1", "sample_2", "label"])
    df.to_csv(f"cvl_{mode}.csv", index=False)
    df.to_parquet(
        f"cvl_{mode}.parquet",
        index=False,
        compression="snappy"
    )
    print(f"{mode.title()} Set — Genuine: {len(genuine_pairs)}, Impostor: {len(impostor_pairs)}")

# Write the CVL pair files for each split, in train -> val -> test order.
for split_name, split_writers in (
    ("train", train_writers),
    ("val", val_writers),
    ("test", test_writers),
):
    generate_pairs(split_writers, mode=split_name)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Merge the CVL and university pair lists of every split into one CSV
# (train.csv, val.csv, test.csv), CVL rows first.
for split in ("train", "val", "test"):
    df = pd.concat(map(pd.read_csv, [f"cvl_{split}.csv", f"uni_{split}.csv"]), ignore_index=True)
    df.to_csv(f"{split}.csv", index=False)

target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi/data/version_3/0.15 split/csv files"

# A shell `!cp` neither creates the destination folder nor stops the notebook
# when a copy fails. Creating the folder and copying with shutil makes a failed
# export raise instead of passing silently.
os.makedirs(target_directory, exist_ok=True)
for file_name in (
    "uni_train.csv",
    "cvl_train.csv",
    "uni_test.csv",
    "cvl_test.csv",
    "uni_val.csv",
    "cvl_val.csv",
    "test.csv",
    "train.csv",
    "val.csv",
):
    shutil.copy(file_name, target_directory)

In [None]:
# Merge the CVL and university pair lists of every split into one Parquet file
# (train.parquet, val.parquet, test.parquet), CVL rows first.
for split in ("train", "val", "test"):
    df = pd.concat(map(pd.read_parquet, [f"cvl_{split}.parquet", f"uni_{split}.parquet"]), ignore_index=True)
    df.to_parquet(f"{split}.parquet", index=False, compression="snappy")

target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi/data/version_3/0.15 split/parquet files"

# A shell `!cp` neither creates the destination folder nor stops the notebook
# when a copy fails. Creating the folder and copying with shutil makes a failed
# export raise instead of passing silently.
os.makedirs(target_directory, exist_ok=True)
for file_name in (
    "uni_train.parquet",
    "cvl_train.parquet",
    "uni_test.parquet",
    "cvl_test.parquet",
    "uni_val.parquet",
    "cvl_val.parquet",
    "test.parquet",
    "train.parquet",
    "val.parquet",
):
    shutil.copy(file_name, target_directory)
