## Image Pre-Processing

In [1]:
#import libraries

import os
import math
import cv2
import struct
import glob
from tqdm import tqdm
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import xml.etree.ElementTree as ET

## Data Preprocessing

First converting Images to GrayScale --> Then increasing channels to 3

In [2]:
pos_data = glob.glob('./marmot/marmot_dataset_v1.0/data/English/Positive/Raw' + '/*.bmp')
neg_data = glob.glob('./marmot/marmot_dataset_v1.0/data/English/Negative/Raw' + '/*.bmp')

In [3]:
pos_data

['./marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2006_3.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2010_5.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2013_63.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2013_64.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2014_4.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2014_6.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2018_4.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2018_8.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2019_2.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2019_3.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2020_8.bmp',
 './marmot/marmot_dataset_v1.0/data/English/Positive/Raw\\10.1.1.1.2023_31.bmp',
 './marmot/marmot_dataset_v1.0/data/E

In [4]:
PROCESSED_DATA = 'marmot_processed'
IMAGE_PATH = os.path.join(PROCESSED_DATA, 'image_v2')

In [5]:
new_h, new_w = 1024, 1024

In [7]:
for i, data in enumerate([neg_data, pos_data]):
    
    for j, img_path in enumerate(data):
        
        image_name = os.path.basename(img_path)
        image = Image.open(img_path).convert('LA')
        w, h = image.size
        
        #convert to RGB image
        image = image.resize((new_h, new_w)).convert("RGB")

        #save images and masks
        save_image_path = os.path.join(IMAGE_PATH, image_name.replace('bmp', 'jpg'))
        
        image.save(save_image_path)

In [8]:
processed_data = pd.read_csv('processed_data.csv')

In [9]:
processed_data['img_path'] = processed_data['img_path'].apply(lambda x: x.replace('image', 'image_v2'))

In [10]:
processed_data.to_csv('processed_data_v2.csv', index = False)

In [11]:
processed_data

Unnamed: 0,img_path,table_mask,col_mask,original_height,original_width,hasTable,table_count,col_count,table_bboxes,col_bboxes
0,marmot_processed\image_v2\10.1.1.1.2000_4.jpg,marmot_processed\table_mask\10.1.1.1.2000_4_ta...,marmot_processed\col_mask\10.1.1.1.2000_4_col_...,1008,768,0,0,0,[],[]
1,marmot_processed\image_v2\10.1.1.1.2004_4.jpg,marmot_processed\table_mask\10.1.1.1.2004_4_ta...,marmot_processed\col_mask\10.1.1.1.2004_4_col_...,1123,793,0,0,0,[],[]
2,marmot_processed\image_v2\10.1.1.1.2004_5.jpg,marmot_processed\table_mask\10.1.1.1.2004_5_ta...,marmot_processed\col_mask\10.1.1.1.2004_5_col_...,1123,793,0,0,0,[],[]
3,marmot_processed\image_v2\10.1.1.1.2005_12.jpg,marmot_processed\table_mask\10.1.1.1.2005_12_t...,marmot_processed\col_mask\10.1.1.1.2005_12_col...,1056,816,0,0,0,[],[]
4,marmot_processed\image_v2\10.1.1.1.2005_13.jpg,marmot_processed\table_mask\10.1.1.1.2005_13_t...,marmot_processed\col_mask\10.1.1.1.2005_13_col...,1056,816,0,0,0,[],[]
...,...,...,...,...,...,...,...,...,...,...
988,marmot_processed\image_v2\10.1.1.8.2182_6.jpg,marmot_processed\table_mask\10.1.1.8.2182_6_ta...,marmot_processed\col_mask\10.1.1.8.2182_6_col_...,1123,793,0,0,0,[],[]
989,marmot_processed\image_v2\10.1.1.8.2185_13.jpg,marmot_processed\table_mask\10.1.1.8.2185_13_t...,marmot_processed\col_mask\10.1.1.8.2185_13_col...,1056,816,1,1,0,"[[241, 541, 787, 682]]",[]
990,marmot_processed\image_v2\10.1.1.8.2185_14.jpg,marmot_processed\table_mask\10.1.1.8.2185_14_t...,marmot_processed\col_mask\10.1.1.8.2185_14_col...,1056,816,1,1,0,"[[277, 282, 736, 360]]",[]
991,marmot_processed\image_v2\10.1.1.8.2198_11.jpg,marmot_processed\table_mask\10.1.1.8.2198_11_t...,marmot_processed\col_mask\10.1.1.8.2198_11_col...,1056,816,1,1,0,"[[126, 609, 897, 794]]",[]
