# Load Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import math
from math import ceil
import cv2
from typing import Optional, Literal, Tuple, List
import pyarrow as pa, pyarrow.parquet as pq
from tqdm import tqdm

In [3]:
# ----- Load Full Dataset -----
data = pd.read_parquet(r"data\raw_data\train_raw.parquet")

# ----- Split By Source -----
printed = data[data["source"] == "printed"]
written = data[data["source"] == "handwritten"]

# ----- Pre-sampling size -----
print("Printed Size :", len(printed))
print("Handwritten Size :", len(written))

Printed Size : 1733904
Handwritten Size : 6482


## Data Preprocessing


Based on exploratory data analysis performed on this dataset, we will perform the fillowing preprocessing techniques:
- Downsampling (printed data)
- Conversion to Grayscale
- Resizing to Fixed Dimensions
- Noise Removal (conditional)
- Binarization (conditional)
- Normalization
- Padding and Alignment
- Augmentation (handwritten data)

### Down Sampling

We will down-sample printed and data to 100_000 samples, and augment handwritten data to increase size to 100_000. This will fix the issue of class imbalance while prserving enough data for training.

In [8]:
# ----- Downsample Printed Data -----
printed = printed.sample(n = 100000, random_state=42)

# ----- Post-sampling size -----
print("Handwritten Size post-sampling:", len(printed))

Handwritten Size post-sampling: 100000


### Preprocessing Functions

#### Image Byte Handling  

This step ensures that images stored in raw byte format can be safely processed.  
- **`get_bytes`**: Standardizes the input by extracting raw bytes, whether the image is stored directly as `bytes`, `bytearray`, `memoryview`, or inside a dictionary with a `"bytes"` key. Raises an error if the input is invalid.  
- **`decode_image_cv2`**: Takes raw bytes and decodes them into an OpenCV-compatible image (`NumPy array`) using `cv2.imdecode`. If decoding fails, an error is raised.  

Together, these functions make the pipeline robust for handling different raw image storage formats and converting them into usable image arrays.


In [10]:
# ----- Obtain Raw Bytes -----
def get_bytes(x) -> bytes:
    if isinstance(x, dict) and "bytes" in x:
        v = x["bytes"]
        if isinstance(v, (bytes, bytearray, memoryview)):
            return bytes(v)
    if isinstance(x, (bytes, bytearray, memoryview)):
        return bytes(x)
    raise TypeError("Image field must be bytes or dict containing key 'bytes' with bytes")

# ----- Decode Compressed Image Bytes -----
def decode_image_cv2(image_bytes: bytes) -> np.ndarray:
    buf = np.frombuffer(image_bytes, dtype=np.uint8)
    img = cv2.imdecode(buf, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise ValueError("Failed to decode image bytes")
    return img

#### Grayscale Conversion  

This step converts input images into grayscale format to simplify processing and reduce computational cost.  
- If the image is already 2D (grayscale), it is returned as-is.  
- If the image has 3 channels (BGR) or 4 channels (BGRA), OpenCV is used to convert it to grayscale.  
- A fallback conversion ensures robustness in case the input has unexpected channel arrangements.  

Grayscale images are essential for tasks like blur detection, thresholding, and OCR preprocessing since color information is often unnecessary.


In [11]:
# ----- Convert to Grayscale -----
def to_grayscale(img: np.ndarray) -> np.ndarray:
    if img.ndim == 2:
        return img
    if img.ndim == 3:
        if img.shape[2] == 3:
            return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if img.shape[2] == 4:
            return cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
        
    # ----- Fallback -----
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

#### Resizing to a Fixed Height  

This step ensures all images have a consistent height (`target_h`, default = 128) while preserving their aspect ratio.  
- If the image already matches the target height, it is returned unchanged.  
- Otherwise, the width is scaled proportionally to maintain aspect ratio.  
- **Downscaling** uses `INTER_AREA` (better for shrinking), while **upscaling** uses `INTER_LINEAR` (smoother enlargement).  

This standardization is important for batching images and feeding them into models that require uniform input dimensions.


In [12]:
# ----- Resize to Consistent Height -----
def resize_to_fixed_height(gray: np.ndarray, target_h: int = 128) -> np.ndarray:
    h, w = gray.shape[:2]
    if h == target_h:
        return gray
        
    scale = target_h / float(h)
    new_w = max(1, int(round(w * scale)))
    method = cv2.INTER_AREA if target_h < h else cv2.INTER_LINEAR
    
    return cv2.resize(gray, (new_w, target_h), interpolation=method)

#### Noise Removal (Optional)  

This step reduces unwanted noise in grayscale images to improve clarity for OCR and preprocessing tasks.  
- **No Denoising** → returns the image unchanged.  
- **Median Filter** → effective for removing salt-and-pepper noise while preserving edges.  
- **Bilateral Filter** → smooths the image while keeping edges sharp, useful for text-heavy images.  

The method can be chosen via the `method` parameter (`"median"` or `"bilateral"`). If none is specified, the original image is retained.


In [13]:
# ----- Noise Removal -----
def denoise_optional(gray: np.ndarray, method: Optional[Literal["median", "bilateral"]] = None) -> np.ndarray:
    if method is None:
        return gray
    if method == "median":
        return cv2.medianBlur(gray, ksize=3)
    if method == "bilateral":
        return cv2.bilateralFilter(gray, d=5, sigmaColor=20, sigmaSpace=10)
        
    raise ValueError("Unsupported denoise method")

#### Binarization (Optional)  

This step converts grayscale images into black-and-white (binary) format, making text more distinct and improving OCR performance.  
- **No Binarization** → image remains in grayscale.  
- **Adaptive Mean Thresholding** → threshold value is calculated as the mean of neighboring pixels within a block.  
- **Adaptive Gaussian Thresholding** → threshold value is computed using a Gaussian-weighted sum of neighboring pixels.  

Both adaptive methods help handle uneven lighting conditions and preserve readability of handwritten/printed text.


In [14]:
# ----- Binarization -----
def binarize_optional(gray: np.ndarray, method: Optional[Literal["adaptive_mean", "adaptive_gaussian"]] = None) -> np.ndarray:
    if method is None:
        return gray
    block_size = 25
    C = 10
    if method == "adaptive_mean":
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, block_size, C)
    if method == "adaptive_gaussian":
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, C)
    
    raise ValueError("Unsupported binarization method")

#### Normalization  

This step scales pixel intensity values from **[0, 255]** to a floating-point range of **[0.0, 1.0]**.  
Normalization ensures consistency across images, improves numerical stability, and helps models train more efficiently.  



In [15]:
# ----- Normalization -----
def normalize_01(img_u8: np.ndarray) -> np.ndarray:
    return (img_u8.astype(np.float32) / 255.0)

#### Padding  

This step standardizes the image width to a fixed target while preserving height.  
- If the image is wider than the target, it is **cropped**.  
- If it is narrower, it is **padded** with a constant value (default: white background = `1.0`).  
- Padding can be applied **left-aligned** or **center-aligned**, ensuring consistent dimensions for model training.  


In [16]:
# ----- Padding -----
def pad_to_width(img: np.ndarray, target_w: int, pad_value: float = 1.0, align: Literal["left", "center"] = "left") -> np.ndarray:
    h, w = img.shape
    
    if w == target_w:
        return img
    if w > target_w:
        return img[:, :target_w]

    out = np.full((h, target_w), fill_value=pad_value, dtype=img.dtype)
    
    if align == "left":
        out[:, :w] = img
    elif align == "center":
        offset = (target_w - w) // 2
        out[:, offset:offset + w] = img
    else:
        raise ValueError("Unsupported align")
        
    return out

### Preprocessing Pipeline

In [18]:
# ----- Preprocessing Pipeline For Single Image -----
def preprocess_image_bytes(image_bytes: bytes,
                           target_h: int = 128,
                           denoise: Optional[Literal["median", "bilateral"]] = None,
                           binarize: Optional[Literal["adaptive_mean", "adaptive_gaussian"]] = None,
                           pad_width: Optional[int] = None,
                           pad_align: Literal["left", "center"] = "left",
                           pad_value: float = 1.0) -> np.ndarray:

    img = decode_image_cv2(image_bytes)
    gray = to_grayscale(img)
    gray = resize_to_fixed_height(gray, target_h=target_h)
    gray = denoise_optional(gray, method=denoise)
    gray = binarize_optional(gray, method=binarize)
    arr = normalize_01(gray)  # float32 [0,1]

    if pad_width is not None:
        arr = pad_to_width(arr, target_w=pad_width, pad_value=pad_value, align=pad_align)
        
    return arr

In [19]:
# ----- Preprocessing Pipeline For Complete Dataframe -----
def preprocess_dataframe(df: pd.DataFrame,
                         image_col: str = "image",
                         label_col: str = "text",
                         source_col: str = "source",
                         target_h: int = 128,
                         denoise: Optional[str] = None,
                         binarize: Optional[str] = None,
                         pad_width: Optional[int] = None,
                         pad_align: str = "left",
                         pad_value: float = 1.0) -> Tuple[List[np.ndarray], List[str], List[str]]:

    images, labels, sources = [], [], []
    
    for _, row in df.iterrows():
        b = get_bytes(row[image_col])
        arr = preprocess_image_bytes(
            b,
            target_h=target_h,
            denoise=denoise,
            binarize=binarize,
            pad_width=pad_width,
            pad_align=pad_align,
            pad_value=pad_value,
        )
        
        images.append(arr)
        labels.append(row[label_col])
        sources.append(row[source_col])
        
    return images, labels, sources

## Preprocessing Implementation

In [21]:
# ----- Define Defaults for Current Dataset -----
TARGET_H = 128
DENOISE = None       
BINARIZE = None         
PAD_WIDTH = None        
PAD_ALIGN = "left"
PAD_VALUE = 1.0

# ----- Preprocess Printed Dataset -----
printed_imgs, printed_labels, printed_sources = preprocess_dataframe(
    printed,
    image_col="image",
    label_col="text",
    source_col="source",
    target_h=TARGET_H,
    denoise=DENOISE,
    binarize=BINARIZE,
    pad_width=PAD_WIDTH,
    pad_align=PAD_ALIGN,
    pad_value=PAD_VALUE,
)

# ----- Preprocess Handwritten Dataset -----
hand_imgs, hand_labels, hand_sources = preprocess_dataframe(
    written,
    image_col="image",
    label_col="text",
    source_col="source",
    target_h=TARGET_H,
    denoise=DENOISE,
    binarize=BINARIZE,
    pad_width=PAD_WIDTH,
    pad_align=PAD_ALIGN,
    pad_value=PAD_VALUE,
)

## Data Augmentation

### Augmentation and Serialization

#### Configuration Setup

- **Random Generator**: `rng = np.random.default_rng(123)` → ensures reproducible results.  
- **Augmentation Size**: `TARGET_TOTAL = 100_000` → total samples to generate.  
- **Serialization Path**: `OUT_PATH = "data\handwritten_aug.parquet"` → output file location.  
- **Memory Efficiency**:  
  - `BATCH_SIZE = 256` → batch size for processing.  
  - `ELASTIC_P = 0.6` → probability of elastic distortion.  
  - `MAX_WIDTH_FOR_ELASTIC = 1200` → max image width for elastic transform.  


In [24]:
# ----- Set Default Range Object -----
rng = np.random.default_rng(123)

# ----- Augmentation Size -----
TARGET_TOTAL = 100_000

# ----- Define Path for Serialization -----
OUT_PATH = r"data\handwritten_aug.parquet"

# ----- Defaults for Memmory Efficiency -----
BATCH_SIZE = 256           
ELASTIC_P = 0.6            
MAX_WIDTH_FOR_ELASTIC = 1200

### Primitive Augmentation Functions

#### Augmentation Functions

- **Affine Transforms (`rand_affine`)**  
  Applies random rotation, scaling, shear, and translation to simulate writing variability.  

- **Elastic Distortion (`elastic_distort`)**  
  Introduces non-linear warping using displacement fields, mimicking natural handwriting irregularities.  

- **Stroke Variability (`stroke_variation`)**  
  Randomly dilates or erodes strokes to vary line thickness.  

- **Blur or Noise (`blur_or_noise`)**  
  Adds Gaussian blur or random noise to simulate scan imperfections.  

- **Brightness & Contrast (`brightness_contrast`)**  
  Adjusts pixel intensity and contrast to mimic lighting/scanning variations.  

- **Per-Image Augment (`augment_once`)**  
  Combines the above transforms with probabilistic application, ensuring diversity while maintaining legibility.  


In [26]:
# ----- Affine Transforms -----
def rand_affine(img, rot_deg=12, scale_range=(0.92, 1.08), shear_deg=8, shift_frac=0.06):
    h, w = img.shape
    angle = rng.uniform(-rot_deg, rot_deg)
    scale = rng.uniform(*scale_range)
    shear = np.deg2rad(rng.uniform(-shear_deg, shear_deg))
    tx = rng.uniform(-shift_frac, shift_frac) * w
    ty = rng.uniform(-shift_frac, shift_frac) * h
    M_rot = cv2.getRotationMatrix2D((w/2, h/2), angle, scale)
    M_shear = np.array([[1, np.tan(shear), 0], [0, 1, 0]], dtype=np.float32)
    M = M_shear @ np.vstack([M_rot, [0,0,1]])
    M = M[:2, :]
    out = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
    T = np.array([[1, 0, tx], [0, 1, ty]], dtype=np.float32)
    out = cv2.warpAffine(out, T, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
    return out.astype(np.float32, copy=False)

# ----- Elastic Distortion -----
def elastic_distort(img, alpha=12.0, sigma=6.0, max_width_for_elastic=1200):
    h, w = img.shape
    if w > max_width_for_elastic:
        return img
    dx = rng.normal(0, sigma, size=(h, w)).astype(np.float32)
    dy = rng.normal(0, sigma, size=(h, w)).astype(np.float32)
    dx = cv2.GaussianBlur(dx, (0,0), sigmaX=sigma).astype(np.float32) * (alpha / max(h, w))
    dy = cv2.GaussianBlur(dy, (0,0), sigmaX=sigma).astype(np.float32) * (alpha / max(h, w))
    x = np.arange(w, dtype=np.float32)
    y = np.arange(h, dtype=np.float32)
    map_x, map_y = np.meshgrid(x, y)
    map_x = (map_x + dx).astype(np.float32, copy=False)
    map_y = (map_y + dy).astype(np.float32, copy=False)
    out = cv2.remap(img, map_x, map_y, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
    return out.astype(np.float32, copy=False)

# ----- Stroke Variability -----
def stroke_variation(img, p=0.6):
    if rng.random() >= p:
        return img
    k = rng.integers(1, 2+1)
    kernel = np.ones((k, k), np.uint8)
    u8 = np.clip(img*255, 0, 255).astype(np.uint8)
    if rng.random() < 0.5:
        u8 = cv2.dilate(u8, kernel, iterations=1)
    else:
        u8 = cv2.erode(u8, kernel, iterations=1)
    return (u8.astype(np.float32) / 255.0)

# ----- Conditional Introduction of Blur or Noise -----
def blur_or_noise(img, p_blur=0.4, p_noise=0.5):
    out = img
    if rng.random() < p_blur:
        k = int(rng.choice([3,5]))
        sigma = float(rng.uniform(0.5, 1.2))
        out = cv2.GaussianBlur(out, (k,k), sigmaX=sigma)
    if rng.random() < p_noise:
        sigma = float(rng.uniform(0.005, 0.02))
        noise = rng.normal(0, sigma, size=out.shape).astype(np.float32)
        out = np.clip(out + noise, 0.0, 1.0)
    return out.astype(np.float32, copy=False)

# ----- Brightness and Contrast Variablity -----
def brightness_contrast(img, br_range=(-0.06, 0.06), ct_range=(0.92, 1.10)):
    beta = float(rng.uniform(*br_range))
    alpha = float(rng.uniform(*ct_range))
    out = np.clip(img * alpha + beta, 0.0, 1.0)
    return out.astype(np.float32, copy=False)

# ----- Per Image Augment -----
def augment_once(img, elastic_p=0.6, max_width_for_elastic=1200):
    out = img
    if rng.random() < 0.9:
        out = rand_affine(out)
    if rng.random() < elastic_p:
        out = elastic_distort(out, max_width_for_elastic=max_width_for_elastic)
    out = stroke_variation(out, p=0.6)
    out = blur_or_noise(out, p_blur=0.4, p_noise=0.5)
    out = brightness_contrast(out)
    out = np.clip(out, 0.0, 1.0).astype(np.float32, copy=False)
    # Simple legibility guard
    m = float(out.mean())
    if (m < 0.08 or m > 0.92) and rng.random() < 0.5:
        out = brightness_contrast(img, br_range=(-0.03,0.03), ct_range=(0.95,1.05))
        out = np.clip(out, 0.0, 1.0).astype(np.float32, copy=False)
    return out

### Array Encoding

#### PNG Encoding (`encode_png_bytes`)
Converts a float image array (`[0,1]` range) into PNG-encoded bytes:
- Ensures input is a NumPy float32 array.  
- Scales values to `[0,255]` and casts to `uint8`.  
- Uses OpenCV’s `cv2.imencode` to encode as PNG.  
- Returns the raw PNG bytes (suitable for serialization/storage).  


In [37]:
def encode_png_bytes(arr_float01) -> bytes:
    print("DEBUG:", type(arr_float01), getattr(arr_float01, "shape", None))  # <--- add this
    arr = np.array(arr_float01, dtype=np.float32)  # ensure numpy array
    u8 = np.clip(arr * 255.0, 0, 255).astype(np.uint8)
    ok, buf = cv2.imencode(".png", u8)
    if not ok:
        raise ValueError("Encoding failed")
    return buf.tobytes()

### Augmentation Streaming

Generates image–text–source triplets with augmentation until a target dataset size is reached:

- **Inputs**  
  - `base_images`, `base_labels`, `base_sources` → original dataset.  
  - `target_total` → total samples to generate.  
  - `max_aug_per_image` → limit on augmentations per base image.  
  - `batch_size` → controls memory efficiency during processing.  
  - `elastic_p`, `max_width_for_elastic` → parameters for elastic distortion.  

- **Process**  
  1. **Seed Batches**: Add original images (PNG-encoded) until `target_total` or dataset size is reached.  
  2. **Augmentation Loop**: Apply `augment_once` on base images to fill up to `target_total`.  
  3. Tracks progress with `tqdm`.  

- **Outputs**  
  Returns three lists:  
  - `col_image` → PNG-encoded augmented images.  
  - `col_text` → corresponding text labels.  
  - `col_source` → source metadata.  


In [30]:
# ----- Function to Define Augmented Columns -----
def build_augmented_columns(base_images, base_labels, 
                            base_sources,
                            target_total: int,
                            max_aug_per_image: int = 25,
                            batch_size: int = 256,
                            elastic_p: float = 0.6,
                            max_width_for_elastic: int = 1200):
    
    col_image, col_text, col_source = [], [], []

    p_total = tqdm(total=target_total, desc="Augmenting images", unit="img")

    # ----- Seed Batches -----
    n = len(base_images)
    produced = 0
    i = 0
    while produced < min(target_total, n):
        j = min(i + batch_size, n, target_total)
        for a, l, s in zip(base_images[i:j], base_labels[i:j], base_sources[i:j]):
            col_image.append(encode_png_bytes(a))
            col_text.append(l)
            col_source.append(s)
        produced += (j - i)
        p_total.update(j - i)
        i = j
        if produced >= target_total:
            break

    # ----- Augment to TARGET_TOTAL -----
    remaining = target_total - produced
    if remaining > 0:
        from math import ceil
        per_image = min(max_aug_per_image, max(1, ceil(remaining / max(1, len(base_images)))))
        idx = 0
        batch_count = 0
        while produced < target_total:
            img = base_images[idx]
            lab = base_labels[idx]
            src = base_sources[idx]
            aug = augment_once(img, elastic_p=elastic_p, max_width_for_elastic=max_width_for_elastic)
            col_image.append(encode_png_bytes(aug))
            col_text.append(lab)
            col_source.append(src)

            produced += 1
            batch_count += 1
            p_total.update(1)

            if batch_count >= batch_size:
                batch_count = 0

            idx += 1
            if idx >= len(base_images):
                idx = 0
                per_image -= 1
                if per_image <= 0 and produced < target_total:
                    per_image = 1

    p_total.close()
    return col_image, col_text, col_source

In [31]:
# ----- Save Dataframe to Parquet -----
def to_parquet(out_path: str, col_image, col_text, col_source):
    total_rows = len(col_image)
    p_save = tqdm(total=total_rows, desc="Saving to Parquet", unit="row")
    df = pd.DataFrame({
        "image": col_image,
        "text": col_text,
        "source": col_source,
    })
    p_save.update(total_rows)
    df.to_parquet(out_path, index=False) 
    p_save.close()
    return out_path, total_rows

In [32]:
col_image, col_text, col_source = build_augmented_columns(
    base_images=hand_imgs,
    base_labels=hand_labels,
    base_sources=hand_sources,
    target_total=TARGET_TOTAL,          
    max_aug_per_image=25,               
    batch_size=BATCH_SIZE,              
    elastic_p=ELASTIC_P,                
    max_width_for_elastic=MAX_WIDTH_FOR_ELASTIC  
)

Augmenting images: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [47:38<00:00, 34.98img/s]


#### Streaming Parquet Writer (`to_parquet_streaming`)

Efficiently saves images, texts, and sources into a Parquet file in small batches to reduce memory usage.

- **Helper (`safe_encode`)**  
  Ensures images are stored as PNG-encoded bytes.  
  - If already `bytes`, keeps as is.  
  - Otherwise, encodes using `encode_png_bytes`.  

- **Process**  
  1. Defines a Parquet schema: `image (binary)`, `text (string)`, `source (string)`.  
  2. Iterates through the dataset in chunks (`rows_per_group`).  
  3. Encodes images, builds an Arrow Table, and writes it to the Parquet file.  
  4. Appends each chunk to disk, printing progress logs.  

- **Parameters**  
  - `path` → output Parquet file path.  
  - `images`, `texts`, `sources` → dataset arrays/lists.  
  - `compression` → Parquet compression codec (default `"snappy"`).  
  - `rows_per_group` → number of rows written per batch (default `1000`).  


In [41]:
import pyarrow as pa
import pyarrow.parquet as pq

# ----- Function to Encode if Needed -----
def safe_encode(img):
    """Return bytes: keep if already encoded, otherwise encode."""
    if isinstance(img, (bytes, bytearray)):
        return img
    return encode_png_bytes(img)  # your existing function

# ----- Streaming Save -----
def to_parquet_streaming(path, images, texts, sources, compression="snappy", rows_per_group=1000):
    """
    Stream images/text/source to Parquet in small batches to save memory.
    """
    n = len(images)

    # Define schema
    schema = pa.schema([
        ("image", pa.binary()),
        ("text", pa.string()),
        ("source", pa.string()),
    ])

    # Create Parquet writer
    with pq.ParquetWriter(path, schema, compression=compression) as writer:
        for i in range(0, n, rows_per_group):
            j = min(i + rows_per_group, n)

            # Encode images only if needed
            img_bytes = [safe_encode(img) for img in images[i:j]]

            # Create Arrow Table for this chunk
            table = pa.table({
                "image": img_bytes,
                "text": texts[i:j],
                "source": sources[i:j],
            }, schema=schema)

            # Append to parquet file
            writer.write_table(table)

            print(f"Written rows {i}:{j} to {path}")

In [None]:
DATA_DIR = r"C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data"
os.makedirs(DATA_DIR, exist_ok=True)

# ----- Save Augmented Handwritten -----
hand_path = os.path.join(DATA_DIR, "handwritten_aug_streaming.parquet")
to_parquet_streaming(hand_path, col_image, col_text, col_source, rows_per_group=1000)

# ----- Save Printed -----
print_path = os.path.join(DATA_DIR, "printed_streaming.parquet")
to_parquet_streaming(print_path, printed_imgs, printed_labels, printed_sources, rows_per_group=1000)

Written rows 0:1000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 1000:2000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 2000:3000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 3000:4000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 4000:5000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 5000:6000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.parquet
Written rows 6000:7000 to C:\Users\Shashwat Kumar\Desktop\CodeHub\Projects\Optical Character Recognition\data\handwritten_aug_streaming.par