# Dataset Access
This notebook contains code used for accessing handwriting images dataset.<br>
Author: Shreyansh Tripathi<br>
email: [shreyanshtripathi03@gmail.com](Hidden_landing_URL)

The first version of this notebook is for creating a dummy dataset which can be extended easily to access bigger and advanced datasets.

# Create Object Definitions for lmdb objects
To create lmdb image objects, a class is made which involves function for creating objects and a function for getting images back from the database. This class is necessary to be shipped with the data in order to reconstuct original images from the lmdb objects.<br>
The object has following fields:<br>
- channels - channels in the image
- shape - dimensions of the image(in pixels)
- image - byteform of the actual image
- ALLlabel - label representing class id of each image and refers to list index in the ALLtext field
- ALLtext - contains actual words
- SET - contains 0 if image is in the validation set and 1 if image is in the training set.

In [9]:
#class for creating lmdb objects and returning images from objects
class ImageObject:
    def __init__(self, image, label, text, trnORval):
        # Dimensions of image for reconstruction - not really necessary 
        # for this dataset, but some datasets may include images of 
        # varying sizes
        self.channels = 1
        self.shape = image.shape[:2]
        self.image = image.tobytes()
        self.ALLlabel = label
        self.ALLtext = text
        self.SET = trnORval
        
    def get_image(self):
        """ Returns the image as a numpy array. """
        image = np.frombuffer(self.image, dtype=np.uint8)
        return image.reshape(*self.shape)

# Reading Functions
The functions created will return lists of images, labels, text, names, validation set, and training set. The functions use libraries for various storage methods to access the data and store them in lists


In [10]:
import pickle
import lmdb
import h5py
import numpy as np
import csv
from pathlib import Path
from PIL import ImageDraw, ImageFont, Image
from PIL import ImageFilter

def read_many_disk():
    
    images, labels, text, trn, val, names = [], [], [], [], [], []
    # Loop over all IDs and read each image in one by one
    for y in range(1000):
        disk_dir = Path("datagen_disk/"+str(y+1))
        directory = Path("datagen_disk")
        for x in range(10):    
            images.append(np.array(Image.open(disk_dir / f"{x}.png")))
    
    with open(directory / "ALLlabels.csv", "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=" ", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            labels.append(int(row[0]))
          
    with open(directory / "ALLnames.csv", "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=" ", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            names.append(Path(row[0]))
          
    with open(directory / "ALLtext.csv", "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=" ", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            text.append(str(row[0]))
           
    with open(directory / "TRNInd.csv", "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=" ", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            trn.append(int(row[0]))
    
    with open(directory / "VALInd.csv", "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=" ", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            val.append(int(row[0]))
    
    return images, labels, text, names, trn, val

def read_many_lmdb():
    
    images, labels, text, trnORval = [], [], [], []
    lmdb_dir = Path("datagen_lmdb")
    env = lmdb.open(str(lmdb_dir / "10000_lmdb"), readonly=True)
    
    # Start a new read transaction
    with env.begin() as txn:
        # Read all images in one single transaction, with one lock, improves time taken to read images
        for image_id in range(10000):
            data = txn.get(f"{image_id:08}".encode("ascii"))
            # Accessing the Image object 
            # that is stored as the value
            image_object = pickle.loads(data)
            # Retrieve the fields for the object
            images.append(image_object.get_image())
            labels.append(image_object.ALLlabel)
            text.append(image_object.ALLtext)
            trnORval.append(image_object.SET)
    env.close()
    return images, labels, text, trnORval

def read_many_hdf5():
    
    images, labels, text, trn, val = [], [], [], [], []
    hdf5_dir = Path("datagen_hdf5")
    # Open the HDF5 file
    file = h5py.File(hdf5_dir / f"{10000}_many.h5", "r+")

    images = np.array(file["/images"]).astype("uint8")
    labels = np.array(file["/ALLlabels"]).astype("uint16")
    text = np.array(file["/ALLtext"]).astype("str")
    val = np.array(file["/VALInd"]).astype("uint16")
    trn = np.array(file["/TRNInd"]).astype("uint16")
    
    return images, labels, text, val, trn

_read_many_funcs = dict(
    disk=read_many_disk, lmdb=read_many_lmdb, hdf5=read_many_hdf5
)

# Time taken for reading
The time taken for reading is given below for different functions

In [11]:
from timeit import timeit

read_many_timings = {"disk": [], "hdf5": [], "lmdb": []}

for method in ("disk", "hdf5", "lmdb"):
    t = timeit(
    "_read_many_funcs[method]()",
    setup="",
    number=1,
    globals=globals(),
    )
    read_many_timings[method].append(t)

        # Print out the method, cutoff, and elapsed time
    print(f"Method: {method}, Time usage: {t}")


Method: disk, Time usage: 2.1330112789983104
Method: hdf5, Time usage: 0.09784609599955729
Method: lmdb, Time usage: 2.1818189559999155


# Display hdf5 data
The hdf5 data is displayed with the help of function below

In [12]:
#Sample code to show contents of hdf5 database
images, labels, text, trn, val = read_many_hdf5()
b = []
for i in range(1000):
    a = text[i][0]
    b.append(a)
print(b)    
print( max(b, key=len) )

['suburbed', 'levulin', 'preprint', 'interregal', 'serophthisis', 'puncture', 'Ottawa', 'ostensible', 'geognosist', 'conversationable', 'querulosity', 'proxyship', 'nephrocyte', 'variability', 'undevelopable', 'unusurping', 'sphenoethmoidal', 'depressingly', 'zygostyle', 'treating', 'bathybian', 'dawkin', 'Sarcoptes', 'ceramic', 'midautumn', 'siderealize', 'laic', 'eyewear', 'elaeagnaceous', 'Halosphaera', 'ponto', 'ethnocentric', 'shaped', 'theosophistical', 'Dyotheletical', 'redistrainer', 'Trachoma', 'genialness', 'amentia', 'circumcise', 'idylist', 'eglantine', 'neurochemistry', 'columned', 'overrace', 'slumberousness', 'Anti', 'hypocone', 'Archibuteo', 'impressment', 'masterwork', 'bavary', 'pycnodont', 'translocalization', 'striga', 'remint', 'twitlark', 'bemusement', 'bolograph', 'hydromedusa', 'Gregarinida', 'enforceability', 'Gerbillinae', 'familiarism', 'premierjus', 'sapan', 'decemcostate', 'do', 'ramshackled', 'Moslemic', 'hiodont', 'beeishness', 'outdo', 'umbellulate', 'si

In [13]:
#Code to show 36th image of the database 
print(images[35].shape)
a = Image.fromarray(images[1346])
a.show()

(48, 128)


# Display lmdb data
The hdf5 data is displayed with the help of function below

In [14]:
#Code to access and show 36th image of the database 
images, labels, text, trnorval = read_many_lmdb()
print(text)    
print(max(text, key=len))
print(images[35].shape)
a = Image.fromarray(images[13].astype('uint8'))
a.show()

['suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'suburbed', 'levulin', 'levulin', 'levulin', 'levulin', 'levulin', 'levulin', 'levulin', 'levulin', 'levulin', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'preprint', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'interregal', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'serophthisis', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'puncture', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'Ottawa', 'ostensible', 'ostensible', 'ostensible', 'ostensible', 'ostensible', 'ostensible', 'ostensible', 'ostensible', 

# Calculating the Mean and Variance of Foreground and Background pixels in IAM dataset
The code below is to calculate Mean and Variance of Foreground and Background pixels in IAM dataset which is used for image generation in our dataset.

The code uses opencv to adaptively threshold images from IAM dataset to distinguish and create background and frontground masks. The mean of pixel values is calculated from the original image using the mask.

In [15]:
import cv2
import math
import numpy as np
from matplotlib import pyplot as plt
import glob
import statistics as s

cv_img = []
#change directory path accordingly
for img in glob.glob("/home/shreyansh/Downloads/filesimages/*.png"):
    n = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
    cv_img.append(n)
mean_lista=[]
for image in cv_img:    
    a = cv2.mean(image)
    mean_lista.append(a[0])
print(s.mean(mean_lista))
mean_list = []
mean = 0
var = []
for image in cv_img:    
    img = cv2.medianBlur(image,5)
    th3 = cv2.adaptiveThreshold(img,1,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
                cv2.THRESH_BINARY,11,2)
    a = cv2.mean(image, mask=th3)
    mean_list.append(a[0])
    mean = mean+a[0]
    
mean = mean/len(mean_list)
print(s.mean(mean_list))

for item in mean_list:
    item = (item-mean)**2
    var.append(item)
print(len(var))
print(math.sqrt(s.median(var)))


StatisticsError: mean requires at least one data point