In [41]:
import os
import cv2
import pandas as pd
import numpy as np
from tqdm import tqdm

In [42]:
# Every dataset file lives under one directory, so each path is built from that root.
_dataset_root = '/media/arindam-shukla/Linux Storage/mathink/lm2LaTeX-100K'

train_csv_path = f'{_dataset_root}/im2latex_train.csv'
val_csv_path = f'{_dataset_root}/im2latex_validate.csv'
test_csv_path = f'{_dataset_root}/im2latex_test.csv'
# formulas_csv_path = f'{_dataset_root}/im2latex_formulas.norm.csv'  # currently unused
images_dir = f'{_dataset_root}/processed_img'  # Folder with cleaned 192x192 grayscale images

# Target handed to np.save further down. It has no ".npy" suffix and np.save
# appends one when it is missing, so the file on disk is "<this path>.npy".
output_npy_path = "/media/arindam-shukla/Linux Storage/mathink"


In [43]:
# === LOAD CSVs ===
# One DataFrame per split file, read in train / validation / test order.
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (train_csv_path, val_csv_path, test_csv_path)
]

In [44]:
# === Tag every row with the name of the split its CSV came from ===
# The column is written onto each frame in place, so the tag survives the later concat.
for _frame, _split_name in ((train_df, 'train'), (val_df, 'val'), (test_df, 'test')):
    _frame['split'] = _split_name

In [45]:
# Stack the three split frames (train, then val, then test) into one table.
# ignore_index=True discards the per-file row indices and renumbers from 0.
all_data = pd.concat(
    (train_df, val_df, test_df),
    ignore_index=True,
)

In [46]:
# Rename the 'formula' column to 'label'.
# rename() returns a new frame; rebinding the name avoids mutating the
# concatenated frame in place.
all_data = all_data.rename(columns={'formula': 'label'})

# rename() silently ignores a mapping key that is not a column, so a CSV schema
# mismatch would otherwise only surface as a KeyError part-way through the
# image-loading loop. Verify the columns that loop reads ('image', 'label',
# 'split') right here instead.
_missing_columns = {'image', 'label', 'split'} - set(all_data.columns)
if _missing_columns:
    raise KeyError(
        f"all_data is missing required column(s): {sorted(_missing_columns)}"
    )

In [47]:
# === Prepare data containers ===
# Three independent, initially empty lists that are filled in lockstep:
# entry i of each one describes the same sample (pixels, label, split name).
image_matrices, labels, splits = [], [], []

In [48]:
# === Read every image listed in `all_data`, keeping pixels, label and split aligned ===
print("Reading and storing images with labels and splits...")

# Start from empty accumulators every time this cell runs. Without the reset,
# re-running the cell would append every sample a second time, or fail outright
# once the names have been rebound to numpy arrays (which have no .append).
image_matrices = []
labels = []
splits = []

# Walk the three columns in lockstep; this avoids building a Series per row
# the way iterrows() does.
for filename, label, split in tqdm(
    zip(all_data['image'], all_data['label'], all_data['split']),
    total=len(all_data),
):
    img_path = os.path.join(images_dir, filename)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

    # An unreadable file shows up here as None; skip it so the three lists
    # stay the same length.
    if img is None:
        print(f"[!] Could not read {filename}. Skipping.")
        continue

    image_matrices.append(img)
    labels.append(label)
    splits.append(split)

Reading and storing images with labels and splits...


100%|██████████| 94000/94000 [00:27<00:00, 3378.81it/s]


In [49]:

# === Convert the accumulated Python lists to numpy arrays ===
# Each name is rebound from a list to an array holding the same entries in the
# same order. The tuple of lists is evaluated before any name is rebound.
# NOTE(review): presumably every image has the same shape, so the images stack
# into a single (N, H, W) array — confirm.
image_matrices, labels, splits = (
    np.array(collected) for collected in (image_matrices, labels, splits)
)

In [50]:
# === Save images, labels and splits together as a single .npy file ===
# np.save appends ".npy" when the target string lacks that suffix. Resolve the
# real on-disk path first so the message below names the file that was
# actually written.
saved_path = (
    output_npy_path
    if output_npy_path.endswith('.npy')
    else output_npy_path + '.npy'
)

# The dict is not an ndarray, so np.save wraps it in a 0-d object array and
# pickles it. Read it back with:
#     np.load(saved_path, allow_pickle=True).item()
np.save(saved_path, {
    'images': image_matrices,
    'labels': labels,
    'splits': splits
})
print(f"\n✅ Combined dataset saved to: {saved_path}")
print(f"Total images saved: {len(image_matrices)}")


✅ Combined dataset saved to: /media/arindam-shukla/Linux Storage/mathink
Total images saved: 94000
