Author: Jitender Singh Virk ||
Date created: 25 Dec 2017

In [None]:
import os
import tensorflow as tf
import numpy as np
import scipy.io as sio
import pandas as pd
import cv2

In [None]:
# Get metaData of images
#data_dir = "D:\\Data_Warehouse\\L1636\\DataSets\\CHAR-74K\\English\\Img\\"
#names_dir = "D:\\Data_Warehouse\\L1636\\DataSets\\CHAR-74K\\English\\names\\lists.mat"
#write_location = "C:\\Users\\Virksaab\\Desktop\AI\\datasets_for_code_in_repos\\char74k_dataset\\"

data_dir = "/home/virk/Desktop/CHAR-74K/English/Img/"
names_dir = "/home/virk/Desktop/CHAR-74K/English/names/lists.mat"
write_location = "/home/virk/Desktop/AI/datasets_for_code_in_repos/char74k_dataset/"

# Load the .mat index and expose the fields of its "list" entry as DataFrame
# columns, so each field can be read as names.<field>[0].
file = sio.loadmat(names_dir)
names = pd.DataFrame(file["list"][0])
# Pre-Feats
allNames = names.ALLnames[0]
# only taking good images
#allNames = allNames[:7704]
is_good = names.is_good[0]
classNames = names.classnames[0]
total_classes = names.NUMclasses[0][0][0]
allLabels = names.ALLlabels[0]
classLabels = names.classlabels[0]

# Shuffle data before splitting.
# A single random permutation is drawn and applied to every per-image array,
# so that entry i of allNames, allLabels and is_good keeps describing the same
# image. Shuffling allNames alone (and in place) would leave the other arrays
# in the original order. One permutation is already uniformly random, so
# repeating the shuffle adds nothing.
# NOTE(review): assumes ALLlabels and is_good hold one row per image, like
# ALLnames -- confirm against lists.mat.
shuffle_order = np.random.permutation(allNames.shape[0])
allNames = allNames[shuffle_order]
allLabels = allLabels[shuffle_order]
is_good = is_good[shuffle_order]

# Train, Validate and Test split ratio
# These are cumulative end indices into allNames: [0, train_eg) is the train
# split, [train_eg, validate_eg) the validate split and [validate_eg, test_eg)
# the test split.
# NOTE(review): the reason for the "- 2" on the train boundary is not visible
# here; it is kept unchanged.
train_eg = int(allNames.shape[0] * .70) - 2
validate_eg = int(allNames.shape[0] * .85)
test_eg = allNames.shape[0]
total_classes
#print(allNames[:20])

In [None]:
# TRAINING SET
# Load every image of the train split, resize it to 64x64, convert it from
# OpenCV's BGR channel order to RGB, and store it next to its class label.
image_data_train = {"images":[], "labels":[]}
for imgName in allNames[:train_eg]:
    # The label is parsed from the image name itself: characters 3..5 of the
    # fourth "/"-separated component, shifted by one to make it 0-based.
    labels_from_names = int(imgName.split("/")[3][3:6]) - 1
    img_path = data_dir+imgName+".png"
    img = cv2.imread(img_path)
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising, so fail here with the offending path.
    if img is None:
        raise IOError("could not read image: " + img_path)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (64, 64), interpolation = cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_data_train["images"].append(img)
    image_data_train["labels"].append(labels_from_names)
print("Examples in train set =", len(image_data_train['images']))
print('single image dimensions =', image_data_train['images'][0].shape)

In [None]:
# VALIDATE SET
# Load every image of the validate split, resize it to 64x64, convert it from
# OpenCV's BGR channel order to RGB, and store it next to its class label.
image_data_validate = {"images":[], "labels":[]}
for imgName in allNames[train_eg:validate_eg]:
    # The label is parsed from the image name itself: characters 3..5 of the
    # fourth "/"-separated component, shifted by one to make it 0-based.
    labels_from_names = int(imgName.split("/")[3][3:6]) - 1
    img_path = data_dir+imgName+".png"
    img = cv2.imread(img_path)
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising, so fail here with the offending path.
    if img is None:
        raise IOError("could not read image: " + img_path)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (64, 64), interpolation = cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_data_validate["images"].append(img)
    image_data_validate["labels"].append(labels_from_names)
print("Examples in validate set =", len(image_data_validate['images']))
print('single image dimensions =', image_data_validate['images'][0].shape)

In [None]:
# TEST SET
# Load every image of the test split, resize it to 64x64, convert it from
# OpenCV's BGR channel order to RGB, and store it next to its class label.
image_data_test = {"images":[], "labels":[]}
for imgName in allNames[validate_eg:]:
    # The label is parsed from the image name itself: characters 3..5 of the
    # fourth "/"-separated component, shifted by one to make it 0-based.
    labels_from_names = int(imgName.split("/")[3][3:6]) - 1
    img_path = data_dir+imgName+".png"
    img = cv2.imread(img_path)
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising, so fail here with the offending path.
    if img is None:
        raise IOError("could not read image: " + img_path)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (64, 64), interpolation = cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_data_test["images"].append(img)
    image_data_test["labels"].append(labels_from_names)
print("Examples in test set =", len(image_data_test['images']))
print('single image dimensions =', image_data_test['images'][0].shape)

In [None]:
# Check dimensions: the three splits together must account for every entry of
# allNames, both in their image lists and in their label lists.
(
    sum(len(split["images"]) for split in (image_data_train, image_data_validate, image_data_test))
    == len(allNames)
    == sum(len(split["labels"]) for split in (image_data_train, image_data_validate, image_data_test))
)

In [None]:
# Convert data to TFRecords

def _int64_feature(value):
    """Wrap one integer in a tf.train.Feature holding a one-element Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap one bytes value in a tf.train.Feature holding a one-element BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def convert_to(dataset, name):
    """Write a dataset to a TFRecords file inside ``write_location``.

    Every image/label pair becomes one serialized ``tf.train.Example`` with
    the int64 features ``height``, ``width`` and ``label`` and the bytes
    feature ``image_raw`` (the raw bytes of the image array). The channel
    count is not stored.

    Args:
        dataset: mapping with an ``"images"`` sequence of arrays (at least
            two-dimensional) and a ``"labels"`` sequence of values that
            ``int()`` accepts; both sequences must have the same length.
        name: base name of the output file; ``.tfrecords`` is appended.

    Raises:
        ValueError: if the number of images differs from the number of labels.
    """
    images = dataset["images"]
    labels = dataset["labels"]

    if len(images) != len(labels):
        raise ValueError('number of images is not equal to the labels provided')

    # Pass the directory and the file name as separate components so the path
    # is correct whether or not write_location ends with a separator.
    filename = os.path.join(write_location, name+'.tfrecords')
    print("writing",filename)
    writer = tf.python_io.TFRecordWriter(filename)

    # try/finally guarantees the file is closed even if serializing one of
    # the examples raises.
    try:
        for image, label in zip(images, labels):
            # Record each image's own dimensions rather than those of the
            # first image; this also lets an empty dataset produce an empty
            # file instead of failing on images[0].
            rows, cols = image.shape[:2]
            # tobytes() replaces the deprecated ndarray.tostring() alias and
            # returns the same bytes.
            image_raw = image.tobytes()
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': _int64_feature(rows),
                'width': _int64_feature(cols),
                'label': _int64_feature(int(label)),
                'image_raw': _bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
    finally:
        writer.close()
    

# Serialize the three splits in turn, each to its own TFRecords file.
for split_name, split_data in (
    ('train', image_data_train),
    ('validate', image_data_validate),
    ('test', image_data_test),
):
    convert_to(split_data, split_name)
print("DONE!")

Copyright © 2017, Jitender Singh Virk

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.