### Summary of Notebook
- We build a utility class to read the input images 
- We convert them to a structured format for further processing and model building
- We pickle the created input data files


In [None]:
# Import Genereal libs  libraries

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import sys
import numpy as np
import seaborn as sns
import statistics as stats
sns.set(color_codes=True)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#Last import allows multiple outputs from one cell
import warnings
# Initialize the random number generator
import random
random.seed(101)

In [None]:
# Import project specific libs
import tarfile
import os

In [None]:
# Useful Configuration/Setting

# suppress display of warnings
warnings.filterwarnings('ignore')

# display all dataframe columns
pd.options.display.max_columns = None

# to set the limit to 3 decimals
pd.options.display.float_format = '{:.7f}'.format

# display all dataframe rows
pd.options.display.max_rows = None

#Setting to shows all entries in array displayed
np.set_printoptions(threshold=sys.maxsize)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls

drive  sample_data


In [None]:
import xml.etree.ElementTree as et

In [None]:
class Read_Struct_Input:
  #This Utility class reads input data, structures it  and makes it available for downstream programs
  #Define our functions

  #Constructor func
  def __init__(self):
    self.image_path="/content/drive/MyDrive/Object_Localisation/Data_Set"
    self.annot_dataframe=pd.DataFrame(columns=['image_name','width','height','depth','objname','xmin','ymin','xmax','ymax'])
    self.label_dataframe = None


  def readAnnotFile(self,filepath):
    #Annotations file is in the form .tar.gz
    #Uncompress file & read file
    annotFile= tarfile.open(filepath)
    destPathIndex=filepath.rindex('/', 0 ,)
    destPathBase=filepath[0:destPathIndex]
    #print(destPathBase)
    if(not os.path.exists(destPathBase+'/annotations/')):
     annotFile.extractall(destPathBase)
     annotFile.close()

    # Loop over files & write into a dataframe
    #print(destPathBase +'/annotations'+ '/'+'xmls/')
    destPath=destPathBase +'/annotations'+ '/'+'xmls/'
    rowId=0
    for eachfile in os.listdir(destPath):

      #print(eachfile)
      #each file is an xml file which we need to parse
      tree = et.parse(destPath+'/'+eachfile)
      root = tree.getroot()
      sizeTag=root.find('size')
      image_name=eachfile[0:eachfile.rfind('.')]
      width=sizeTag.find('width').text
      height=sizeTag.find('height').text
      depth=sizeTag.find('depth').text
      objTag=root.find('object')
      bndboxTag=objTag.find('bndbox')
      objName=objTag.find('name').text
      xmin=bndboxTag.find('xmin').text
      ymin=bndboxTag.find('ymin').text
      xmax=bndboxTag.find('xmax').text
      ymax=bndboxTag.find('ymax').text
      record=[image_name,width,height,depth,objName,xmin,ymin,xmax,ymax]
      # Append record in dataframe
      self.annot_dataframe.loc[rowId]=record
      rowId=rowId+1

      # Quick check if any xml file has more than one bounding box
      bndboxTagCheck=objTag.findall('bndbox')
      if (len(bndboxTagCheck) > 1):
        print("more than one found for record",image_name )
      
  
      
    return

  def readLabelFile(self,filepath):
    self.label_dataframe=pd.read_csv(filepath,sep=' ',header=None,names=['image_name','id','species','breed'],index_col=False )
    #note to self : index parameter needed as otherwise it took image name as index
    return

  def createInpDataFrame(self):
    return
  
  def readVideo(self):
    return
  
  def readImages(self):
    return


In [None]:
if __name__ == "__main__":
  objReadInput= Read_Struct_Input()
  objReadInput.readAnnotFile('/content/drive/MyDrive/Object_Localisation/Data_Set/annotations.tar.gz')
  objReadInput.readLabelFile('/content/drive/MyDrive/Object_Localisation/Data_Set/annotations/list.txt')

In [None]:
#objReadInput.annot_dataframe.dtypes
objReadInput.annot_dataframe.shape
#objReadInput.annot_dataframe.sample(5)
#objReadInput.annot_dataframe.isna().sum().sum()
#objReadInput.annot_dataframe.duplicated().sum()

(3686, 9)

In [None]:
#objReadInput.label_dataframe.dtypes
objReadInput.label_dataframe.shape
#objReadInput.label_dataframe.sample(5)
#objReadInput.label_dataframe.isna().sum()

(7355, 4)

In [None]:
# Our i/p data frames are ready and structured
# Lets combine into one df & pickle for further use

ip_dataframe= objReadInput.annot_dataframe.merge(objReadInput.label_dataframe,on='image_name',how='inner')
#Note to self : join did not work here, gave me an error saying trying to join object with int
#Both are objects so not sure why ths error
#Switched to merge instead which worked

In [None]:
ip_dataframe.sample(5)
ip_dataframe.shape

Unnamed: 0,image_name,width,height,depth,objname,xmin,ymin,xmax,ymax,id,species,breed
1270,english_cocker_spaniel_150,402,500,3,dog,64,38,304,245,13,2,7
2624,american_bulldog_148,500,375,3,dog,136,123,347,275,2,2,1
1484,japanese_chin_168,334,500,3,dog,71,102,325,287,18,2,12
3657,Persian_20,500,375,3,cat,206,114,342,251,24,1,8
584,scottish_terrier_141,375,500,3,dog,60,68,294,320,31,2,21


(3671, 12)

In [None]:
# Checking why no of records in merged is < no of records in annotations
x=(objReadInput.annot_dataframe['image_name']).tolist()
y=ip_dataframe['image_name'].tolist()
list(set(x)-set(y))
# Some records present in annotations is missing in label list and hence the diffeence

['Bombay_189',
 'newfoundland_155',
 'newfoundland_152',
 'english_cocker_spaniel_163',
 'english_cocker_spaniel_179',
 'Siamese_203',
 'Egyptian_Mau_183',
 'Egyptian_Mau_129',
 'Bombay_190',
 'english_cocker_spaniel_162',
 'english_cocker_spaniel_164',
 'Bombay_11',
 'newfoundland_154',
 'Bombay_192',
 'newfoundland_153']

In [None]:
# Lets pickle are i/p dataframe 
# Pickle Column index for use in future predictions file
import pickle
with open('/content/drive/MyDrive/Object_Localisation/Pickled_Data/IpDataFrame.pickle', 'wb') as f:
    pickle.dump(ip_dataframe, f)
