### Preprocessing images and labels

In [1]:
import os
import pandas as pd
from PIL import Image
from torchvision.transforms import ToTensor, Pad
from src.tokenizer import Tokenizer
import gc  # Garbage Collector interface
import numpy as np


class PreProcessing:
    """Pad formula images to a fixed canvas, tokenize their labels and pickle the result in batches.

    Input DataFrames are read from ``<data_dir>/training_56`` and images from
    ``<data_dir>/formula_images``. Every processed batch is written to its own
    pickle file in the current working directory.
    """

    # Height and width (in pixels) of the canvas every image tensor is padded to.
    TARGET_HEIGHT = 128
    TARGET_WIDTH = 1088
    # Constant written into the padded border (presumably a white background,
    # since ToTensor scales pixel values to [0, 1] -- TODO confirm).
    PAD_FILL = 1

    def __init__(self, data_dir="./data/dataset5"):
        """Set up the directory layout, the batch size and the two tokenizers.

        Args:
            data_dir: Root directory of the dataset.
        """
        self.data_dir = data_dir
        self.image_dir = os.path.join(data_dir, "formula_images")
        self.training_dir = os.path.join(data_dir, "training_56")
        self.input_data_dir = os.path.join(data_dir, "training_56")
        self.batch_size = 14000
        # The boolean flag selects the tokenizer variant; its exact meaning is
        # defined in src.tokenizer.
        self.tokenizer_transformer = Tokenizer(True)
        self.tokenizer_gpt = Tokenizer(False)

    def load_data(self, filename):
        """Load a DataFrame from a pickle file located in ``input_data_dir``."""
        file_path = os.path.join(self.input_data_dir, filename)
        return pd.read_pickle(file_path)

    def load_image(self, image_filename):
        """Open an image file located in ``image_dir`` and return the PIL image."""
        image_path = os.path.join(self.image_dir, image_filename)
        return Image.open(image_path)

    def _to_padded_tensor(self, image_filename, to_tensor):
        """Load one image as an RGB tensor centred on the TARGET_HEIGHT x TARGET_WIDTH canvas."""
        image = to_tensor(self.load_image(image_filename).convert("RGB"))
        # ToTensor produces a (channels, height, width) tensor.
        pad_height = self.TARGET_HEIGHT - image.size()[1]
        pad_width = self.TARGET_WIDTH - image.size()[2]
        # Split the padding as evenly as possible; the odd pixel goes to bottom/right.
        pad_top = pad_height // 2
        pad_bottom = pad_height - pad_top
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        # Pad expects the order (left, top, right, bottom).
        padding = (pad_left, pad_top, pad_right, pad_bottom)
        return Pad(padding, fill=self.PAD_FILL, padding_mode="constant")(image)

    def load_data_batches(self, dataframe, batch_size):
        """Process ``dataframe`` in consecutive batches and pickle each batch.

        For every row the image named in the ``image`` column is loaded and
        padded, and the ``word2id`` label is encoded with both tokenizers.
        Each batch becomes a DataFrame with the columns ``name``, ``image``,
        ``label_gpt`` and ``label_transformer`` and is written to
        ``batch_{start}_{batch_size}.pkl`` in the current working directory.

        This is not a generator: batches are written to disk instead of being
        yielded, so only one batch is held in memory at a time. A padded
        float32 tensor takes roughly 1.6 MiB, so choose ``batch_size`` so that
        a whole batch fits in RAM.

        Args:
            dataframe: Frame with ``image`` and ``word2id`` columns.
            batch_size: Number of rows per batch; must be positive.

        Returns:
            The list of pickle file names written, in order.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # The transform is stateless, so one instance serves every row.
        to_tensor = ToTensor()
        written_files = []

        for start in range(0, len(dataframe), batch_size):
            # Positional slice: the index has gaps after drop_duplicates.
            batch = dataframe.iloc[start:start + batch_size]

            new_rows = []
            for name, label in zip(batch["image"], batch["word2id"]):
                new_rows.append({
                    'name': name,
                    'image': self._to_padded_tensor(name, to_tensor),
                    "label_gpt": self.tokenizer_gpt.encode(label),
                    "label_transformer": self.tokenizer_transformer.encode(label),
                })

            new_batch_df = pd.DataFrame(new_rows)
            pickle_filename = f'batch_{start}_{batch_size}.pkl'
            new_batch_df.to_pickle(pickle_filename)
            print(pickle_filename)
            written_files.append(pickle_filename)

            # Release the batch before building the next one.
            del new_rows, new_batch_df
            gc.collect()

        return written_files

    def preprocessing_data(self):
        """Preprocess the training split and write it to per-batch pickle files.

        Loads ``df_train.pkl``, keeps the first row for every distinct
        ``image`` value and hands the result to ``load_data_batches``.

        No aggregated ``df_train_processes.pkl`` is written: the processed
        rows only exist in the per-batch files, and writing an empty frame
        under that name would overwrite a real aggregate.

        Only the training split is handled. The batch file names carry no
        split prefix, so processing another split in the same directory would
        overwrite these files.

        Returns:
            The list of batch pickle file names written.
        """
        df_train = self.load_data("df_train.pkl")
        df_train = df_train.drop_duplicates(subset="image", keep="first")
        return self.load_data_batches(df_train, self.batch_size)


# Instantiate with the default data_dir and run the preprocessing defined above.
prepro = PreProcessing()
prepro.preprocessing_data()


df_test_processes.h5


MemoryError: Unable to allocate 29.1 GiB for an array with shape (1, 14000) and data type |S2229800

In [3]:
# Path to the HDF5 file
import pandas as pd
import gc  # Garbage collector interface

# Destination HDF5 file that the per-batch pickles are merged into.
hdf5_file = 'df_train_processes.h5'

# List of pickle file paths, relative to the current working directory.
# The names follow the batch_{start}_{batch_size}.pkl pattern.
pickle_files = [
    "batch_0_14000.pkl",
    "batch_14000_14000.pkl",
    "batch_28000_14000.pkl",
    "batch_42000_14000.pkl",
    "batch_56000_14000.pkl",
 
]

# Remaining batch files, currently disabled: the string literal below is an
# unused expression statement, not part of pickle_files.
'''
   "batch_70000_14000.pkl",
    "batch_84000_14000.pkl",
    "batch_98000_14000.pkl",
    "batch_112000_14000.pkl"
'''
# Use HDFStore to manage HDF5 file
# Mode 'w' starts from a fresh file on every run of this cell.
# NOTE(review): the recorded output of this cell is
# "TypeError: Cannot serialize the column [image] ... [mixed] object dtype",
# presumably because the 'image' column holds tensor objects that the table
# format cannot store -- confirm before relying on this cell.
with pd.HDFStore(hdf5_file, 'w', complevel=4, complib='blosc') as store:
    for file in pickle_files:
        # Load the DataFrame from pickle file
        temp_df = pd.read_pickle(file)
        
        # Append it to the HDF5 file
        store.append('data', temp_df, data_columns=True, index= False)

        # Optional: clear the temporary DataFrame from memory (this is automatic, but can be forced if needed)
        del temp_df

TypeError: Cannot serialize the column [image]
because its data contents are not [string] but [mixed] object dtype

In [2]:
# NOTE(review): despite its name, df_test holds the contents of the
# *training* pickle (df_train_processes.pkl).
df_test = pd.read_pickle("df_train_processes.pkl")


In [3]:
# Sanity check: number of rows in the frame loaded from df_train_processes.pkl.
print(len(df_test))

14280


In [4]:
# NOTE(review): despite its name, df_test2 holds the contents of the
# *validation* pickle (df_valid_processes.pkl).
df_test2 = pd.read_pickle("df_valid_processes.pkl")


In [5]:
# Sanity check: number of rows in the frame loaded from df_valid_processes.pkl.
# NOTE(review): the recorded output (14280) is identical to the one recorded
# for df_train_processes.pkl -- confirm the two files do not hold the same data.
print(len(df_test2))


14280


### Process images

In [1]:
import os
import pandas as pd
from PIL import Image
from torchvision.transforms import ToTensor, Pad
from src.tokenizer import Tokenizer
import gc  # Garbage Collector interface
import numpy as np
import os
from torchvision.transforms import ToPILImage


class PreProcessing:
    """Pad formula images to a fixed canvas and save them as PNG files.

    Input DataFrames are read from ``<data_dir>/training_56`` and images from
    ``<data_dir>/formula_images``; padded images are written to ``OUTPUT_DIR``.

    NOTE(review): an earlier cell of this notebook defines a class with the
    same name; whichever definition is executed last is the one in effect.
    """

    # Height and width (in pixels) of the canvas every image is padded to.
    TARGET_HEIGHT = 128
    TARGET_WIDTH = 1088
    # Constant written into the padded border (presumably a white background,
    # since ToTensor scales pixel values to [0, 1] -- TODO confirm).
    PAD_FILL = 1
    # Directory the padded PNG files are written to.
    OUTPUT_DIR = "./images-post"

    def __init__(self, data_dir="./data/dataset5"):
        """Set up the directory layout, the batch size and the two tokenizers.

        Args:
            data_dir: Root directory of the dataset.
        """
        self.data_dir = data_dir
        self.image_dir = os.path.join(data_dir, "formula_images")
        self.training_dir = os.path.join(data_dir, "training_56")
        self.input_data_dir = os.path.join(data_dir, "training_56")
        self.batch_size = 2
        # The tokenizers are created here but not used by any method of this class.
        self.tokenizer_transformer = Tokenizer(True)
        self.tokenizer_gpt = Tokenizer(False)

    def load_data(self, filename):
        """Load a DataFrame from a pickle file located in ``input_data_dir``."""
        file_path = os.path.join(self.input_data_dir, filename)
        return pd.read_pickle(file_path)

    def load_image(self, image_filename):
        """Open an image file located in ``image_dir`` and return the PIL image."""
        image_path = os.path.join(self.image_dir, image_filename)
        return Image.open(image_path)

    def save_tensor_as_png(self, tensor, output_dir, image_name):
        """Convert an image tensor to a PIL image and save it as a PNG file.

        Args:
            tensor: Image tensor accepted by ``ToPILImage``.
            output_dir: Destination directory; created if it does not exist.
            image_name: File name used inside ``output_dir``. The file is
                written in PNG format whatever extension the name carries.
        """
        if not os.path.exists(output_dir):
            print("Output directory does not exist, creating now...")
            # exist_ok covers the directory appearing between the check and this call.
            os.makedirs(output_dir, exist_ok=True)
            print("Directory created: ", output_dir)

        # Convert tensor to PIL image
        pil_image = ToPILImage()(tensor)

        # Save the image
        file_path = os.path.join(output_dir, f"{image_name}")
        pil_image.save(file_path, format='PNG')
        print("Image successfully saved!")

    def _to_padded_tensor(self, image_filename, to_tensor):
        """Load one image as an RGB tensor centred on the TARGET_HEIGHT x TARGET_WIDTH canvas."""
        image = to_tensor(self.load_image(image_filename).convert("RGB"))
        # ToTensor produces a (channels, height, width) tensor.
        pad_height = self.TARGET_HEIGHT - image.size()[1]
        pad_width = self.TARGET_WIDTH - image.size()[2]
        # Split the padding as evenly as possible; the odd pixel goes to bottom/right.
        pad_top = pad_height // 2
        pad_bottom = pad_height - pad_top
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        # Pad expects the order (left, top, right, bottom).
        padding = (pad_left, pad_top, pad_right, pad_bottom)
        return Pad(padding, fill=self.PAD_FILL, padding_mode="constant")(image)

    def load_data_batches(self, dataframe, batch_size):
        """Pad every image listed in ``dataframe`` and save it under ``OUTPUT_DIR``.

        Rows are visited in consecutive batches of ``batch_size``. This is not
        a generator: nothing is yielded or returned, the only result is the
        PNG files written by ``save_tensor_as_png``.

        Args:
            dataframe: Frame whose ``image`` column holds the image file names.
            batch_size: Number of rows per batch; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # The transform is stateless, so one instance serves every row.
        to_tensor = ToTensor()

        for start in range(0, len(dataframe), batch_size):
            # Positional slice: the index has gaps after drop_duplicates.
            batch = dataframe.iloc[start:start + batch_size]
            for name in batch["image"]:
                image = self._to_padded_tensor(name, to_tensor)
                self.save_tensor_as_png(image, self.OUTPUT_DIR, name)

    def preprocessing_data(self):
        """Pad and save the images of the training and test splits.

        Loads ``df_train.pkl`` and ``df_test.pkl``, keeps the first row for
        every distinct ``image`` value in each, and writes the padded images
        through ``load_data_batches``. The validation split is not processed.
        """
        df_train = self.load_data("df_train.pkl")
        df_test = self.load_data("df_test.pkl")

        df_train = df_train.drop_duplicates(subset="image", keep="first")
        df_test = df_test.drop_duplicates(subset="image", keep="first")

        self.load_data_batches(df_train, self.batch_size)
        self.load_data_batches(df_test, self.batch_size)


# Instantiate with the default data_dir and write the padded PNG images
# (the class above saves them under ./images-post).
prepro = PreProcessing()
prepro.preprocessing_data()


Output directory does not exist, creating now...
Directory created:  ./images-post
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image successfully saved!
Image s