In [40]:
import os
import cv2  # or use PIL if you prefer
from tqdm import tqdm
import numpy as np


### Configuration

In [None]:
# Root folder of the dataset on the local disk. Every other location is
# derived from it, so moving the data only requires changing this one line.
data_root = '/media/arindam-shukla/Linux Storage/mathink/lm2LaTeX-100K'

# Source PNGs (read-only input) and destination for the resized/padded PNGs.
input_dir = f'{data_root}/formula_images_processed'
output_dir = f'{data_root}/processed_img'

# Create the destination up front; exist_ok makes re-running the cell safe.
os.makedirs(output_dir, exist_ok=True)


In [42]:
# Sanity check: list the input directory once and show how many entries it
# holds together with a short preview of their names.
files = os.listdir(input_dir)
print(f"Total files found: {len(files)}")
print(f"First 5 files: {files[:5]}")

Total files found: 103536
First 5 files: ['7ae1ba33b9.png', '2a5984cda9.png', '586da719db.png', '3111dc88fe.png', '3b8a16803b.png']


In [43]:
# Side length, in pixels, of the square output images; passed as the `size`
# argument of resize_and_pad in the processing loop.
target_size = 192


In [44]:
def resize_and_pad(img, size=192):
    """Fit ``img`` inside a ``size`` x ``size`` square and centre it on white.

    The aspect ratio is preserved: the image is scaled so that its longer side
    becomes ``size`` pixels, then pasted in the middle of a white (255) canvas.

    Args:
        img: Image array whose first two axes are (height, width), e.g. the
            result of ``cv2.imread``. Any further axes (colour channels) are
            carried through as returned by ``cv2.resize``.
        size: Side length, in pixels, of the square output.

    Returns:
        A ``uint8`` array whose first two axes are ``(size, size)``.

    Raises:
        ValueError: If ``img`` has zero height or zero width.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        # Without this guard the scale computation fails with an opaque
        # ZeroDivisionError.
        raise ValueError(f"cannot resize an empty image of shape {img.shape}")

    scale = min(size / w, size / h)

    # round() instead of int(): float error can make w * (size / w) land just
    # below `size`, which truncation would turn into size - 1.
    # The clamp to [1, size] keeps extremely thin images from collapsing to a
    # zero-pixel side, which cv2.resize rejects.
    new_w = min(size, max(1, round(w * scale)))
    new_h = min(size, max(1, round(h * scale)))

    # INTER_AREA is the recommended filter for shrinking; for enlarging,
    # OpenCV recommends a linear/cubic filter instead.
    interpolation = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
    resized = cv2.resize(img, (new_w, new_h), interpolation=interpolation)

    # White background; trailing axes follow the resized image so that
    # multi-channel input can be pasted as well.
    canvas = np.full((size, size) + resized.shape[2:], 255, dtype=np.uint8)

    # Offsets that centre the resized image on the canvas.
    x_offset = (size - new_w) // 2
    y_offset = (size - new_h) // 2

    canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized
    return canvas


In [45]:
# Resize and pad every PNG in input_dir and write the result, under the same
# file name, to output_dir. Problems with a single file are reported and the
# loop moves on, so one bad image does not abort the whole run.
print("Processing images...")
for filename in tqdm(os.listdir(input_dir)):
    if not filename.lower().endswith('.png'):
        continue

    in_path = os.path.join(input_dir, filename)
    out_path = os.path.join(output_dir, filename)

    # cv2.imread signals failure by returning None rather than raising.
    img = cv2.imread(in_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"[!] Skipped {filename}: couldn't read.")
        continue

    try:
        final_img = resize_and_pad(img, target_size)
        # cv2.imwrite reports failure through its boolean return value, so it
        # has to be checked explicitly or failed writes pass unnoticed.
        if not cv2.imwrite(out_path, final_img):
            print(f"[!] Error with {filename}: couldn't write {out_path}.")
    except Exception as e:
        print(f"[!] Error with {filename}: {e}")

Processing images...


100%|██████████| 103536/103536 [00:51<00:00, 1997.90it/s]


In [47]:
# Directory for storing the per-image .npy matrices. It sits next to
# output_dir, so it is derived from it rather than repeating the dataset root.
matrix_dir = os.path.join(os.path.dirname(output_dir), 'processed_matrices')
os.makedirs(matrix_dir, exist_ok=True)

In [48]:
# Re-read every processed PNG from output_dir and store its pixel matrix as an
# individual .npy file (same base name) in matrix_dir.
print("Converting processed images to matrices and saving as .npy files...")
for filename in tqdm(os.listdir(output_dir)):
    if not filename.lower().endswith('.png'):
        continue

    img_path = os.path.join(output_dir, filename)

    # cv2.imread signals failure by returning None rather than raising.
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"[!] Skipped {filename}: couldn't read.")
        continue

    try:
        # Swap only the final extension. str.replace('.png', '.npy') would
        # miss upper-case extensions that the filter above accepts, and would
        # also rewrite any '.png' occurring earlier in the name.
        stem, _ = os.path.splitext(filename)
        matrix_path = os.path.join(matrix_dir, stem + '.npy')
        np.save(matrix_path, img)
    except Exception as e:
        print(f"[!] Error with {filename}: {e}")

Converting processed images to matrices and saving as .npy files...


100%|██████████| 103536/103536 [00:36<00:00, 2818.60it/s]


In [49]:
# Pack all processed images into one array and save it as a single .npy file,
# together with a second .npy file listing the file name of every row.

print("Creating a single .npy file for all images...")

# Sorted so that row i always refers to the same image between runs;
# os.listdir returns names in arbitrary order.
png_filenames = sorted(
    name for name in os.listdir(output_dir) if name.lower().endswith('.png')
)

image_matrices_array = None  # allocated once the first image's shape is known
image_filenames = []         # file name of each stored row, in row order

for filename in tqdm(png_filenames):
    img_path = os.path.join(output_dir, filename)

    # cv2.imread signals failure by returning None rather than raising.
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f"[!] Skipped {filename}: couldn't read.")
        continue

    if image_matrices_array is None:
        # Allocate the output once and fill it in place. Collecting images in
        # a list and calling np.array() on it holds two full copies of the
        # data in memory at the same time.
        image_matrices_array = np.empty(
            (len(png_filenames),) + img.shape, dtype=img.dtype
        )
    elif img.shape != image_matrices_array.shape[1:]:
        # All rows must share one shape to form a regular array.
        print(
            f"[!] Skipped {filename}: shape {img.shape} does not match "
            f"{image_matrices_array.shape[1:]}."
        )
        continue

    image_matrices_array[len(image_filenames)] = img
    image_filenames.append(filename)

if image_matrices_array is None:
    # Nothing could be loaded: same empty result np.array([]) gave before.
    image_matrices_array = np.array([])
else:
    # Drop the unused tail left by skipped files.
    image_matrices_array = image_matrices_array[:len(image_filenames)]

# Earlier versions of this cell exposed the images under this name as a list;
# keep the name bound (to the same data) for any code that still reads it.
image_matrices = image_matrices_array

# Save the array to a single .npy file in the current directory
single_npy_path = './all_images.npy'  # Output the .npy file in the current directory
np.save(single_npy_path, image_matrices_array)

# Row i of all_images.npy is the image named at index i of this file.
filenames_npy_path = './all_images_filenames.npy'
np.save(filenames_npy_path, np.array(image_filenames))

print(f"All images saved as a single .npy file: {single_npy_path}")
print(f"Row-to-filename index saved as: {filenames_npy_path}")

Creating a single .npy file for all images...


100%|██████████| 103536/103536 [00:20<00:00, 4978.13it/s]


All images saved as a single .npy file: ./all_images.npy
