In [None]:
import cv2
import numpy as np
import zipfile
import os
import gdown
from matplotlib import pyplot as plt
from tqdm import tqdm
import shutil

# Creating Textures - University Dataset

In [None]:
# Google Drive id of the archive to download.
# NOTE(review): "file_id" looks like a placeholder — set the real id before running.
file_id = "file_id"
gdown.download(
    f"https://drive.google.com/uc?id={file_id}",
    "binary.zip",
    quiet=False,
)

# Unpack the downloaded archive into ./dataset (the processing cell below reads dataset/binary).
with zipfile.ZipFile("binary.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="dataset")

## Extracting Lines

In [None]:
def segment_lines(image, threshold=350000, line_height=180, size=1600, min_line_thickness=20):
    """Cut a page image into fixed-size text-line strips and clean them.

    Rows whose pixel sum is below ``threshold`` are grouped into consecutive
    runs (presumably the rows containing dark ink on a light background).
    Each sufficiently thick run is cropped to a band of ``line_height`` rows
    centred on the run and resized to ``(size, line_height)``.

    Args:
        image: 2-D grayscale/binary image array as returned by ``cv2.imread``.
        threshold: a row belongs to a line while its pixel sum is below this.
        line_height: height in pixels of every returned strip.
        size: width in pixels of every returned strip.
        min_line_thickness: runs with fewer rows than this are ignored.

    Returns:
        Whatever ``clean_lines`` returns for the cropped strips, i.e. the
        tuple ``(cleaned_lines, dirty_lines)``.

    Raises:
        ValueError: if ``image`` is None (e.g. the file could not be read).
    """
    if image is None:
        raise ValueError("segment_lines received no image (None); the file could not be read")

    projection = np.sum(image, axis=1)  # horizontal projection: one sum per pixel row

    # Collect (start, end) row ranges of consecutive rows below the threshold.
    lines = []
    line_start = None
    for i, value in enumerate(projection):
        if value < threshold and line_start is None:
            line_start = i
        elif value >= threshold and line_start is not None:
            lines.append((line_start, i))
            line_start = None
    # A run that reaches the last row is never closed inside the loop;
    # close it here so a line touching the bottom edge is not lost.
    if line_start is not None:
        lines.append((line_start, len(projection)))

    half_height = line_height // 2
    cropped_lines = []
    for start, end in lines:
        if end - start < min_line_thickness:
            continue
        center = (start + end) // 2
        # Clamp the band to the image; near the borders the band is shorter
        # than line_height and the resize below stretches it back.
        first_row = max(0, center - half_height)
        last_row = min(image.shape[0], center + half_height)
        cropped_lines.append(cv2.resize(image[first_row:last_row, :], (size, line_height)))
    return clean_lines(cropped_lines)

def clean_lines(dirty_lines, min_area=40, middle_band=(40, 140)):
    """Remove small specks and off-centre fragments from line strips.

    For each strip, the external contours of the inverted strip are found.
    A contour is kept only if its area exceeds ``min_area`` and at least one
    of its points has a y coordinate strictly inside ``middle_band``.
    Everything outside the kept contours is set to 255 in the cleaned strip.

    Args:
        dirty_lines: list of 2-D uint8 line strips.
        min_area: contours with an area not larger than this are discarded.
        middle_band: ``(low, high)`` y range (exclusive) a contour must reach
            into; the defaults match strips that are 180 px high.

    Returns:
        ``(cleaned_lines, dirty_lines)`` — the cleaned strips and the
        untouched input list.
    """
    band_low, band_high = middle_band
    cleaned_lines = []
    for line in dirty_lines:
        inverted = cv2.bitwise_not(line)
        # [-2:] yields (contours, hierarchy) on OpenCV 4.x (two return values)
        # and on 3.x (three return values). A copy is passed because old
        # OpenCV releases modify the source image in place.
        contours, _ = cv2.findContours(
            inverted.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2:]
        mask = np.zeros_like(line)
        for contour in contours:
            # Contours are (N, 1, 2) arrays of (x, y); test all y values at once.
            ys = contour[:, 0, 1]
            reaches_band = bool(np.any((ys > band_low) & (ys < band_high)))
            if reaches_band and cv2.contourArea(contour) > min_area:
                cv2.drawContours(mask, [contour], -1, 255, -1)
        # Keep only the pixels under the filled mask, then flip back to the
        # original polarity.
        cleaned_lines.append(cv2.bitwise_not(cv2.bitwise_and(inverted, mask)))
    return cleaned_lines, dirty_lines


## Extracting Words

In [None]:
def segment_words(lines, word_threshold=45000, min_consecutive=15):
    """Split line strips into word crops using the vertical projection.

    A word starts at the first column whose pixel sum is below
    ``word_threshold`` and ends where ``min_consecutive`` columns in a row are
    at or above it. Shorter gaps stay inside the word.

    Args:
        lines: iterable of 2-D line images (rows x columns).
        word_threshold: a column counts as part of a word while its sum is
            below this value.
        min_consecutive: number of consecutive blank columns that ends a word.

    Returns:
        A flat list of 2-D crops ``line[:, start:end]`` in reading order
        across all lines (the crops are views into the line arrays).
    """
    cropped_words = []
    for line in lines:
        vertical_projection = np.sum(line, axis=0)
        spans = []
        word_start = None
        blank_run = 0  # blank columns seen since the last word column
        for i, value in enumerate(vertical_projection):
            if value < word_threshold:
                blank_run = 0
                if word_start is None:
                    word_start = i
            elif word_start is not None:
                blank_run += 1
                if blank_run >= min_consecutive:
                    # End the word at the first column of the blank run.
                    spans.append((word_start, i - min_consecutive + 1))
                    word_start = None
                    blank_run = 0
        # A word that reaches the right edge is never followed by a full
        # blank run; close it here, trimming the trailing blank columns.
        if word_start is not None:
            spans.append((word_start, len(vertical_projection) - blank_run))
        cropped_words.extend(line[:, start:end] for start, end in spans)
    return cropped_words

## Generating Texture - Line Filled

In [None]:
def generate_texture(cropped_words, file_name="unknown"):
    """Tile word crops into a dense texture image.

    Words are packed left to right into rows exactly 1400 px wide; a word
    that does not fit is split across rows. The last, partially filled row is
    topped up with words from the first row. Rows are then painted repeatedly
    (cycling through them) onto a white 1400x1400 canvas with a vertical
    overlap of 100 px, combining pixels with a bitwise AND so overlapping
    rows do not erase each other. Finally the central 1350x1350 region is
    returned.

    Args:
        cropped_words: list of 2-D uint8 word images.
        file_name: only used in the warning messages.

    Returns:
        A 1350x1350 uint8 texture. If there are no words, or the words do not
        fill a single complete row, a blank (all 255) 1400x1400 canvas is
        returned instead and a warning is printed.
    """
    canvas_height = 1400
    canvas_width = 1400
    row_spacing = -100  # negative: consecutive rows overlap by 100 px
    column_spacing = 0
    crop_size = 1350

    def _blank_canvas():
        return np.full((canvas_height, canvas_width), 255, dtype=np.uint8)

    if not cropped_words:
        print(f"[WARNING] No cropped words for file: {file_name}")
        return _blank_canvas()

    # --- 1. pack the words into rows of canvas_width pixels ---------------
    rows = []
    current_row = []
    current_width = 0
    for word in cropped_words:
        _, word_width = word.shape
        while word_width > 0:
            remaining_space = canvas_width - current_width
            if remaining_space >= word_width:
                current_row.append(word)
                current_width += word_width + column_spacing
                break
            if remaining_space > 0:
                # Fill the rest of the row with the head of the word and
                # carry the tail over to the next row.
                current_row.append(word[:, :remaining_space])
                word = word[:, remaining_space:]
                word_width = word.shape[1]
            rows.append(current_row)
            current_row = []
            current_width = 0

    # Checked before the partial row is added: at least one complete row is
    # required, otherwise the texture is left blank.
    if not rows:
        print(f"[WARNING] No rows could be constructed from the cropped words in file: {file_name}")
        return _blank_canvas()

    if current_row:
        rows.append(current_row)

    # --- 2. top up the last row with material from the first row ----------
    last_row = rows[-1]
    used_width = sum(w.shape[1] for w in last_row) + column_spacing * (len(last_row) - 1)
    remaining_space = canvas_width - used_width
    for filler in rows[0]:
        if remaining_space <= 0:
            break
        _, filler_width = filler.shape
        if filler_width + column_spacing <= remaining_space:
            last_row.append(filler)
            remaining_space -= filler_width + column_spacing
        else:
            last_row.append(filler[:, :remaining_space])
            remaining_space = 0

    # --- 3. paint the rows onto the canvas, cycling until it is full ------
    texture = _blank_canvas()
    y_offset = 0
    row_index = 0
    while y_offset < canvas_height:
        row = rows[row_index % len(rows)]
        max_row_height = max(word.shape[0] for word in row)
        if y_offset + max_row_height > canvas_height:
            break

        x_offset = 0
        for word in row:
            word_height, word_width = word.shape
            if y_offset + word_height <= canvas_height and x_offset + word_width <= canvas_width:
                # AND only the touched region in place. This equals AND-ing
                # with a white full-size canvas holding the word, without
                # allocating one per word.
                region = texture[y_offset:y_offset + word_height, x_offset:x_offset + word_width]
                np.bitwise_and(region, word, out=region)
            x_offset += word_width + column_spacing

        step = max_row_height + row_spacing
        if step <= 0:
            # Rows no taller than the overlap would never advance; fall back
            # to stacking them without overlap so the loop terminates.
            step = max(1, max_row_height)
        y_offset += step
        row_index += 1

    # --- 4. keep the central crop_size x crop_size region -----------------
    height, width = texture.shape
    y_start = (height - crop_size) // 2
    x_start = (width - crop_size) // 2
    return texture[y_start:y_start + crop_size, x_start:x_start + crop_size]

In [None]:
def split_texture(texture, patch_size=450, grid=3):
    """Cut the top-left ``grid`` x ``grid`` block of square patches from a texture.

    Args:
        texture: 2-D image array.
        patch_size: side length of each patch in pixels.
        grid: number of patches per row and per column.

    Returns:
        A list of ``grid * grid`` patches in row-major order. The patches are
        slices (views) of ``texture``; if the texture is smaller than
        ``grid * patch_size`` the affected patches are correspondingly smaller.
    """
    return [
        texture[
            row * patch_size:(row + 1) * patch_size,
            col * patch_size:(col + 1) * patch_size,
        ]
        for row in range(grid)
        for col in range(grid)
    ]

## Create Textures for all samples

In [None]:
# Build texture patches for every PNG under input_root/<writer>/ and store
# them as output_root/<writer>/<sample>_<speed>/<writer>_<sample>_<speed>_T<k>.png.
input_root = 'dataset/binary'
output_root = 'texture'

for writer_id in tqdm(sorted(os.listdir(input_root))):
    writer_path = os.path.join(input_root, writer_id)  # e.g. dataset/binary/W001
    if not os.path.isdir(writer_path):
        continue

    for filename in sorted(os.listdir(writer_path)):
        if not filename.endswith(".png"):
            continue

        image_path = os.path.join(writer_path, filename)  # e.g. dataset/binary/W001/W001_S01_F.png
        try:
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise ValueError("image could not be read (cv2.imread returned None)")

            cleaned_lines, _ = segment_lines(image)
            cropped_words = segment_words(cleaned_lines)
            texture = generate_texture(cropped_words, file_name=image_path)
            patches = split_texture(texture)

            # File names are expected to look like W001_S01_F.png.
            parts = filename.split("_")
            base_id = parts[0]                # W001
            sample_id = parts[1]              # S01
            speed = parts[2].split(".")[0]    # F
            subfolder = f"{sample_id}_{speed}"  # S01_F

            save_dir = os.path.join(output_root, base_id, subfolder)  # texture/W001/S01_F
            os.makedirs(save_dir, exist_ok=True)

            for i, patch in enumerate(patches):
                patch_filename = f"{base_id}_{sample_id}_{speed}_T{i+1}.png"  # W001_S01_F_T1.png
                patch_path = os.path.join(save_dir, patch_filename)
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(patch_path, patch):
                    raise OSError(f"could not write patch: {patch_path}")

        except Exception as e:
            # Best effort: report the failing file and continue with the next one.
            print(f"[ERROR] Failed processing: {image_path}")
            print(f"        {type(e).__name__}: {e}")

## Upload Textures to Drive

In [None]:
# Pack the whole "texture" output folder into texture.zip.
shutil.make_archive("texture", 'zip', "texture")

# Colab only: offer the archive as a browser download.
from google.colab import files
files.download("texture.zip")

# Colab only: mount Google Drive so the archive can be copied onto it.
from google.colab import drive
drive.mount('/content/drive')

# Destination folder on the mounted Drive; the shell escape below copies the archive there.
target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi/data"
!cp texture.zip "{target_directory}/"

# Creating Textures - CVL Dataset

## Binarize Images

In [None]:
# Input folder of the extracted CVL archive and the two output roots
# (binarized images and blurred grayscale images).
DATASET_PATH = "cvl"
BINARY_OUTPUT_PATH = "data/binary_cvl"
RAW_OUTPUT_PATH = "data/raw_cvl"

# NOTE(review): "file_id" looks like a placeholder — set the real Drive id before running.
file_id = "file_id"
gdown.download(f"https://drive.google.com/uc?id={file_id}", "cvl.zip", quiet=False)
with zipfile.ZipFile("cvl.zip", 'r') as zip_ref:
    zip_ref.extractall()  # extracts into the current working directory

writer_folders = sorted(os.listdir(DATASET_PATH))
# Progress-bar total. Count only the entries the preprocessing loop actually
# processes (directories whose name starts with "CVL"); counting every
# directory would leave the bar short of 100%.
total_images = sum(
    len(os.listdir(os.path.join(DATASET_PATH, wf)))
    for wf in writer_folders
    if wf.startswith("CVL") and os.path.isdir(os.path.join(DATASET_PATH, wf))
)

def read_image(image_path, resize=True, size=1600):
    """Load an image with ``cv2.imread`` and optionally resize it to a square.

    Args:
        image_path: path of the image file.
        resize: when True the image is resized to ``(size, size)``; the
            aspect ratio is not preserved.
        size: side length used when ``resize`` is True.

    Returns:
        The loaded (and possibly resized) image array.

    Raises:
        ValueError: if the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file is missing or
    # not a decodable image; fail here with the offending path rather than
    # with an opaque assertion inside cv2.resize.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    if resize:
        image = cv2.resize(image, (size, size))
    return image

def binarize_only(input):
    """Blur a BGR image and derive its grayscale and Otsu-binarized versions.

    Args:
        input: BGR image array (it is copied, not modified).

    Returns:
        ``(raw_image, binary)`` — the Gaussian-blurred grayscale image and the
        result of thresholding that grayscale image with Otsu's method.
    """
    blurred = cv2.GaussianBlur(input.copy(), (5, 5), 0)
    raw_image = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    # The threshold argument 0 is ignored: THRESH_OTSU picks the level itself.
    threshold_result = cv2.threshold(raw_image, 0, 255, cv2.THRESH_OTSU)
    binary = threshold_result[1]
    return raw_image, binary

def preprocess(writer_path, img_file):
    """Binarize one image and save the grayscale and binary results.

    The image ``writer_path/img_file`` is read and binarized; the binary
    image is written to ``BINARY_OUTPUT_PATH/<writer>/<img_file>`` and the
    grayscale image to ``RAW_OUTPUT_PATH/<writer>/<img_file>``, where
    ``<writer>`` is the last component of ``writer_path``.
    """
    source_path = os.path.join(writer_path, img_file)
    raw_image, binary_image = binarize_only(read_image(source_path))

    writer_folder = os.path.basename(writer_path)
    targets = [
        (os.path.join(BINARY_OUTPUT_PATH, writer_folder), binary_image),
        (os.path.join(RAW_OUTPUT_PATH, writer_folder), raw_image),
    ]

    # Create both output folders first, then write both files.
    for folder, _ in targets:
        os.makedirs(folder, exist_ok=True)
    for folder, pixels in targets:
        cv2.imwrite(os.path.join(folder, img_file), pixels)

# Run the preprocessing for every file of every "CVL*" writer directory,
# advancing the progress bar once per file.
with tqdm(total=total_images, desc="Preprocessing", position=0, leave=True) as progress_bar:
    for writer_folder in writer_folders:
        writer_path = os.path.join(DATASET_PATH, writer_folder)

        # Anything that is not a directory named CVL... is reported and skipped.
        if not (os.path.isdir(writer_path) and writer_folder.startswith("CVL")):
            print("Found folder:", writer_folder)
            continue

        images = sorted(os.listdir(writer_path))
        for img_file in images:
            print(f"Processinggg {img_file} in {writer_folder}")
            preprocess(writer_path, img_file)
            progress_bar.update(1)

In [None]:
# Zip the whole data/ folder and offer it as a browser download (Colab only).
!zip -r data.zip data/
from google.colab import files
files.download('data.zip')

# Separate archives for the binarized and the grayscale outputs.
!zip -r binary_cvl.zip data/binary_cvl
!zip -r raw_cvl.zip data/raw_cvl

# Colab only: mount Google Drive so the archives can be copied onto it.
from google.colab import drive
drive.mount('/content/drive')

# Copy both archives into the same Drive folder (target_directory is assigned the same value twice).
target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi"
!cp raw_cvl.zip "{target_directory}/"

target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi"
!cp binary_cvl.zip "{target_directory}/"

## Extracting Lines

In [None]:
# Google Drive id of the archive to download.
# NOTE(review): "file_id" looks like a placeholder — set the real id before running.
file_id = "file_id"
gdown.download(
    f"https://drive.google.com/uc?id={file_id}",
    "binary_cvl.zip",
    quiet=False,
)

# Unpack into ./cvl_dataset (the processing cell below reads cvl_dataset/data/binary_cvl).
with zipfile.ZipFile("binary_cvl.zip", mode="r") as zip_ref:
    zip_ref.extractall(path="cvl_dataset")

In [None]:
def segment_lines(image, threshold=380000, line_height=180, size=1600, min_line_thickness=20):
    """Cut a page image into fixed-size text-line strips and clean them.

    Rows whose pixel sum is below ``threshold`` are grouped into consecutive
    runs (presumably the rows containing dark ink on a light background).
    Each sufficiently thick run is cropped to a band of ``line_height`` rows
    centred on the run and resized to ``(size, line_height)``.

    Args:
        image: 2-D grayscale/binary image array as returned by ``cv2.imread``.
        threshold: a row belongs to a line while its pixel sum is below this.
        line_height: height in pixels of every returned strip.
        size: width in pixels of every returned strip.
        min_line_thickness: runs with fewer rows than this are ignored.

    Returns:
        Whatever ``clean_lines`` returns for the cropped strips, i.e. the
        tuple ``(cleaned_lines, dirty_lines)``.

    Raises:
        ValueError: if ``image`` is None (e.g. the file could not be read).
    """
    if image is None:
        raise ValueError("segment_lines received no image (None); the file could not be read")

    projection = np.sum(image, axis=1)  # horizontal projection: one sum per pixel row

    # Collect (start, end) row ranges of consecutive rows below the threshold.
    lines = []
    line_start = None
    for i, value in enumerate(projection):
        if value < threshold and line_start is None:
            line_start = i
        elif value >= threshold and line_start is not None:
            lines.append((line_start, i))
            line_start = None
    # A run that reaches the last row is never closed inside the loop;
    # close it here so a line touching the bottom edge is not lost.
    if line_start is not None:
        lines.append((line_start, len(projection)))

    half_height = line_height // 2
    cropped_lines = []
    for start, end in lines:
        if end - start < min_line_thickness:
            continue
        center = (start + end) // 2
        # Clamp the band to the image; near the borders the band is shorter
        # than line_height and the resize below stretches it back.
        first_row = max(0, center - half_height)
        last_row = min(image.shape[0], center + half_height)
        cropped_lines.append(cv2.resize(image[first_row:last_row, :], (size, line_height)))
    return clean_lines(cropped_lines)

def clean_lines(dirty_lines, min_area=40, middle_band=(40, 140)):
    """Remove small specks and off-centre fragments from line strips.

    For each strip, the external contours of the inverted strip are found.
    A contour is kept only if its area exceeds ``min_area`` and at least one
    of its points has a y coordinate strictly inside ``middle_band``.
    Everything outside the kept contours is set to 255 in the cleaned strip.

    Args:
        dirty_lines: list of 2-D uint8 line strips.
        min_area: contours with an area not larger than this are discarded.
        middle_band: ``(low, high)`` y range (exclusive) a contour must reach
            into; the defaults match strips that are 180 px high.

    Returns:
        ``(cleaned_lines, dirty_lines)`` — the cleaned strips and the
        untouched input list.
    """
    band_low, band_high = middle_band
    cleaned_lines = []
    for line in dirty_lines:
        inverted = cv2.bitwise_not(line)
        # [-2:] yields (contours, hierarchy) on OpenCV 4.x (two return values)
        # and on 3.x (three return values). A copy is passed because old
        # OpenCV releases modify the source image in place.
        contours, _ = cv2.findContours(
            inverted.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2:]
        mask = np.zeros_like(line)
        for contour in contours:
            # Contours are (N, 1, 2) arrays of (x, y); test all y values at once.
            ys = contour[:, 0, 1]
            reaches_band = bool(np.any((ys > band_low) & (ys < band_high)))
            if reaches_band and cv2.contourArea(contour) > min_area:
                cv2.drawContours(mask, [contour], -1, 255, -1)
        # Keep only the pixels under the filled mask, then flip back to the
        # original polarity.
        cleaned_lines.append(cv2.bitwise_not(cv2.bitwise_and(inverted, mask)))
    return cleaned_lines, dirty_lines


## Extracting Words

In [None]:
def segment_words(lines, word_threshold=44000, min_consecutive=20):
    """Split line strips into word crops using the vertical projection.

    A word starts at the first column whose pixel sum is below
    ``word_threshold`` and ends where ``min_consecutive`` columns in a row are
    at or above it. Shorter gaps stay inside the word.

    Args:
        lines: iterable of 2-D line images (rows x columns).
        word_threshold: a column counts as part of a word while its sum is
            below this value.
        min_consecutive: number of consecutive blank columns that ends a word.

    Returns:
        A flat list of 2-D crops ``line[:, start:end]`` in reading order
        across all lines (the crops are views into the line arrays).
    """
    cropped_words = []
    for line in lines:
        vertical_projection = np.sum(line, axis=0)
        spans = []
        word_start = None
        blank_run = 0  # blank columns seen since the last word column
        for i, value in enumerate(vertical_projection):
            if value < word_threshold:
                blank_run = 0
                if word_start is None:
                    word_start = i
            elif word_start is not None:
                blank_run += 1
                if blank_run >= min_consecutive:
                    # End the word at the first column of the blank run.
                    spans.append((word_start, i - min_consecutive + 1))
                    word_start = None
                    blank_run = 0
        # A word that reaches the right edge is never followed by a full
        # blank run; close it here, trimming the trailing blank columns.
        if word_start is not None:
            spans.append((word_start, len(vertical_projection) - blank_run))
        cropped_words.extend(line[:, start:end] for start, end in spans)

    return cropped_words

## Generating Texture - Line Filled

In [None]:
def generate_texture(cropped_words, file_name="unknown"):
    """Tile word crops into a dense texture image.

    Words are packed left to right into rows exactly 1400 px wide; a word
    that does not fit is split across rows. The last, partially filled row is
    topped up with words from the first row. Rows are then painted repeatedly
    (cycling through them) onto a white 1400x1400 canvas with a vertical
    overlap of 100 px, combining pixels with a bitwise AND so overlapping
    rows do not erase each other. Finally the central 1350x1350 region is
    returned.

    Args:
        cropped_words: list of 2-D uint8 word images.
        file_name: only used in the warning messages.

    Returns:
        A 1350x1350 uint8 texture. If there are no words, or the words do not
        fill a single complete row, a blank (all 255) 1400x1400 canvas is
        returned instead and a warning is printed.
    """
    canvas_height = 1400
    canvas_width = 1400
    row_spacing = -100  # negative: consecutive rows overlap by 100 px
    column_spacing = 0
    crop_size = 1350

    def _blank_canvas():
        return np.full((canvas_height, canvas_width), 255, dtype=np.uint8)

    if not cropped_words:
        print(f"[WARNING] No cropped words for file: {file_name}")
        return _blank_canvas()

    # --- 1. pack the words into rows of canvas_width pixels ---------------
    rows = []
    current_row = []
    current_width = 0
    for word in cropped_words:
        _, word_width = word.shape
        while word_width > 0:
            remaining_space = canvas_width - current_width
            if remaining_space >= word_width:
                current_row.append(word)
                current_width += word_width + column_spacing
                break
            if remaining_space > 0:
                # Fill the rest of the row with the head of the word and
                # carry the tail over to the next row.
                current_row.append(word[:, :remaining_space])
                word = word[:, remaining_space:]
                word_width = word.shape[1]
            rows.append(current_row)
            current_row = []
            current_width = 0

    # Checked before the partial row is added: at least one complete row is
    # required, otherwise the texture is left blank.
    if not rows:
        print(f"[WARNING] No rows could be constructed from the cropped words in file: {file_name}")
        return _blank_canvas()

    if current_row:
        rows.append(current_row)

    # --- 2. top up the last row with material from the first row ----------
    last_row = rows[-1]
    used_width = sum(w.shape[1] for w in last_row) + column_spacing * (len(last_row) - 1)
    remaining_space = canvas_width - used_width
    for filler in rows[0]:
        if remaining_space <= 0:
            break
        _, filler_width = filler.shape
        if filler_width + column_spacing <= remaining_space:
            last_row.append(filler)
            remaining_space -= filler_width + column_spacing
        else:
            last_row.append(filler[:, :remaining_space])
            remaining_space = 0

    # --- 3. paint the rows onto the canvas, cycling until it is full ------
    texture = _blank_canvas()
    y_offset = 0
    row_index = 0
    while y_offset < canvas_height:
        row = rows[row_index % len(rows)]
        max_row_height = max(word.shape[0] for word in row)
        if y_offset + max_row_height > canvas_height:
            break

        x_offset = 0
        for word in row:
            word_height, word_width = word.shape
            if y_offset + word_height <= canvas_height and x_offset + word_width <= canvas_width:
                # AND only the touched region in place. This equals AND-ing
                # with a white full-size canvas holding the word, without
                # allocating one per word.
                region = texture[y_offset:y_offset + word_height, x_offset:x_offset + word_width]
                np.bitwise_and(region, word, out=region)
            x_offset += word_width + column_spacing

        step = max_row_height + row_spacing
        if step <= 0:
            # Rows no taller than the overlap would never advance; fall back
            # to stacking them without overlap so the loop terminates.
            step = max(1, max_row_height)
        y_offset += step
        row_index += 1

    # --- 4. keep the central crop_size x crop_size region -----------------
    height, width = texture.shape
    y_start = (height - crop_size) // 2
    x_start = (width - crop_size) // 2
    return texture[y_start:y_start + crop_size, x_start:x_start + crop_size]

In [None]:
def split_texture(texture, patch_size=450, grid=2):
    """Cut the top-left ``grid`` x ``grid`` block of square patches from a texture.

    With the defaults only the top-left 900x900 pixels of the texture are
    used (four 450x450 patches).
    NOTE(review): for a 1350x1350 texture this leaves the right and bottom
    450 px unused — confirm that four patches per sample is intended.

    Args:
        texture: 2-D image array.
        patch_size: side length of each patch in pixels.
        grid: number of patches per row and per column.

    Returns:
        A list of ``grid * grid`` patches in row-major order. The patches are
        slices (views) of ``texture``; if the texture is smaller than
        ``grid * patch_size`` the affected patches are correspondingly smaller.
    """
    return [
        texture[
            row * patch_size:(row + 1) * patch_size,
            col * patch_size:(col + 1) * patch_size,
        ]
        for row in range(grid)
        for col in range(grid)
    ]

## Creating Textures for all samples

In [None]:
from tqdm import tqdm

# Build texture patches for every PNG under input_root/<writer>/ and store
# them as output_root/<writer id from file name>/<file stem>/<file stem>_T<k>.png.
input_root = 'cvl_dataset/data/binary_cvl'
output_root = 'texture_cvl'

for writer_id in tqdm(sorted(os.listdir(input_root))):
    writer_path = os.path.join(input_root, writer_id)
    if not os.path.isdir(writer_path):
        continue

    for filename in sorted(os.listdir(writer_path)):
        if not filename.endswith(".png"):
            continue

        image_path = os.path.join(writer_path, filename)

        try:
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise ValueError("image could not be read (cv2.imread returned None)")

            cleaned_lines, _ = segment_lines(image)
            cropped_words = segment_words(cleaned_lines)
            texture = generate_texture(cropped_words, file_name=image_path)
            patches = split_texture(texture)

            # base_id: file name up to the first dot.
            # file_writer_id: part of base_id before the first underscore.
            # A separate name is used so the outer loop variable writer_id is
            # not overwritten.
            base_id = filename.split(".")[0]
            file_writer_id = base_id.split("_")[0]

            save_dir = os.path.join(output_root, file_writer_id, base_id)
            os.makedirs(save_dir, exist_ok=True)

            for i, patch in enumerate(patches):
                patch_filename = f"{base_id}_T{i+1}.png"
                patch_path = os.path.join(save_dir, patch_filename)
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(patch_path, patch):
                    raise OSError(f"could not write patch: {patch_path}")

        except Exception as e:
            # Best effort: report the failing file and continue with the next one.
            print(f"[ERROR] Failed processing: {image_path}")
            print(f"        {type(e).__name__}: {e}")

## Upload Textures to Drive

In [None]:
import shutil
# Pack the whole "texture_cvl" output folder into texture_cvl.zip.
shutil.make_archive("texture_cvl", 'zip', "texture_cvl")

# Colab only: offer the archive as a browser download.
from google.colab import files
files.download("texture_cvl.zip")

# Colab only: mount Google Drive so the archive can be copied onto it.
from google.colab import drive
drive.mount('/content/drive')

# Destination folder on the mounted Drive; the shell escape below copies the archive there.
target_directory = "/content/drive/MyDrive/Research Level 4/Implementations/Writer Verification Rashmi/data"
!cp texture_cvl.zip "{target_directory}/"