# Reading image files from disk and storing them in a NumPy array

## Comparison between using _OpenCV_, _Matplotlib_, _HDF5_ and _LMDB_

In [1]:
import csv
import time
import numpy as np
import timeit

In [2]:
# Configuration: the CSV index of the images (a header row followed by one row
# per image) and the number of repetitions timeit uses for each benchmark.
csv_file_path = 'MNIST_images/info.csv'
timeit_number = 100

# newline='' is the mode the csv module documents for files passed to
# csv.reader, so quoted fields containing line breaks are parsed correctly.
with open(csv_file_path, newline='') as csvFile:

    reader = csv.reader(csvFile)

    # Consume the header row explicitly. The None default means a completely
    # empty file is handled here instead of raising StopIteration.
    header = next(reader, None)
    if header is not None:
        print("Column names are %s and %s "%(str(header[0]), str(header[1])))

    # Every remaining row is stored as a list; element 0 is the file path
    # that the reader benchmarks index into.
    file_label_list = list(reader)

# Count the data rows directly rather than deriving the number from a running
# line counter: an empty file now reports 0 instead of -1.
total_image_files = len(file_label_list)
line_count = total_image_files + (0 if header is None else 1)  # rows read, header included
print("A total of %d lines were inserted in the list"%(total_image_files))
    


Column names are File_path and Label 
A total of 60000 lines were inserted in the list


### Using _OpenCV_

In [3]:
import cv2

In [4]:
def read_cv2(filepath):
    '''Reads an image using cv2 in grayscale mode.
       Returns the image in np array.
       Parameter
       ---------
       filepath : an image file's path
       Raises
       ------
       IOError : if cv2 could not read or decode the file
    '''
    # cv2.IMREAD_GRAYSCALE is the named constant for the flag value 0.
    image = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)

    if image is None:
        # cv2.imread reports failure by returning None instead of raising, so
        # an unreadable path would otherwise reach the caller silently.
        raise IOError("cv2 could not read image file: %s" % filepath)

    return image

In [5]:
def run_cv2_code():
    '''Benchmark body: load every listed image with OpenCV into one array.

       Uses the module-level total_image_files and file_label_list.
       Nothing is returned; the filled array is discarded when the function
       exits, because the function only exists to be timed.
    '''
    # One 28x28 slot per image; np.empty leaves the slots uninitialised
    images = np.empty((total_image_files, 28, 28))

    for idx in range(total_image_files):
        path = file_label_list[idx][0]  # column 0 of each row is the file path
        images[idx] = read_cv2(path)

In [6]:
# Total seconds spent on `timeit_number` consecutive runs of the OpenCV reader
time_cv2 = timeit.timeit(stmt=run_cv2_code, number=timeit_number)

In [7]:
# Mean seconds per full pass over the images = total time / number of runs
print("The average time it takes to read the images using OpenCV is : %g" % (time_cv2 / timeit_number))

The average time it takes to read the images using OpenCV is : 1.97192


### Using _Matplotlib_

In [8]:
import matplotlib.pyplot as plt

In [9]:
def read_plt(filepath):
    '''Load a single image file through matplotlib.

       Parameter
       ---------
       filepath : path of the image file to load

       Returns
       -------
       The np array that plt.imread produces for that file.
    '''
    image = plt.imread(filepath)
    return image

In [10]:
def run_plt_code():
    '''Benchmark body: load every listed image with Matplotlib into one array.

       Uses the module-level total_image_files and file_label_list.
       Nothing is returned; the filled array is discarded when the function
       exits, because the function only exists to be timed.
    '''
    # One 28x28 slot per image; np.empty leaves the slots uninitialised
    images = np.empty((total_image_files, 28, 28))

    for idx in range(total_image_files):
        path = file_label_list[idx][0]  # column 0 of each row is the file path
        images[idx] = read_plt(path)

In [11]:
# Total seconds spent on `timeit_number` consecutive runs of the Matplotlib reader
time_plt = timeit.timeit(stmt=run_plt_code, number=timeit_number)

In [12]:
# Mean seconds per full pass over the images = total time / number of runs
print("The average time it takes to read the images using Matplotlib is : %g" % (time_plt / timeit_number))

The average time it takes to read the images using Matplotlib is : 7.3618


### Using _HDF5_

In [None]:
import h