In [0]:
%matplotlib inline
import os 
import numpy as np
import xml.etree.ElementTree as ET
from collections import OrderedDict
import matplotlib.pyplot as plt
import pandas as pd 

In [2]:
# Mount Google Drive at /content/drive (Colab only). The recorded output of this cell shows
# that mounting prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
######## Loading tar file of PASCAL VOC 2007 #########

# Unpack the trainval and test archives stored on the mounted Drive.
# `tar -xf` extracts into the current working directory; the rest of the notebook reads from
# /content/VOCdevkit, so the working directory is presumably /content — TODO confirm.
!tar -xf "/content/drive/My Drive/VOCtrainval_06-Nov-2007.tar"
!tar -xf "/content/drive/My Drive/VOCtest_06-Nov-2007.tar"

In [0]:
####### Converting Data into Pandas DataFrame #########

# Directory containing the annotation XML files; every non-hidden entry in it is parsed
# into one row of the annotation DataFrame.
dir_anno = "/content/VOCdevkit/VOC2007/Annotations"

def extract_single_xml_file(tree):
    """Flatten one parsed annotation XML into a single ordered row.

    Parameters
    ----------
    tree : xml.etree.ElementTree.ElementTree or Element
        Parsed annotation. Every element reachable through ``tree.iter()`` is inspected.

    Returns
    -------
    OrderedDict
        * one ``int`` entry per child of each ``<size>`` element, keyed by the child's tag;
        * for each ``<object>`` element: ``bbx_<i>_name`` (``str``) from its ``<name>`` child
          and one ``float`` entry ``bbx_<i>_<tag>`` per child of its ``<bndbox>``, where ``<i>``
          is the number of boxed objects seen before it;
        * ``Nobj``: the number of ``<object>`` elements that contained a ``<bndbox>``.

    Raises
    ------
    ValueError
        If a ``<size>`` child or a ``<bndbox>`` child holds non-numeric text.
    """
    Nobj = 0
    row = OrderedDict()
    for elems in tree.iter():
        if elems.tag == "size":
            for elem in elems:
                row[elem.tag] = int(elem.text)
        elif elems.tag == "object":
            has_box = False
            for elem in elems:
                if elem.tag == "name":
                    row["bbx_{}_{}".format(Nobj, elem.tag)] = str(elem.text)
                elif elem.tag == "bndbox":
                    for k in elem:
                        row["bbx_{}_{}".format(Nobj, k.tag)] = float(k.text)
                    has_box = True
            # Advance the object index only after all children of this <object> were read,
            # so a <name> that appears after its <bndbox> is still stored under the same index.
            if has_box:
                Nobj += 1
    row["Nobj"] = Nobj
    return row

# Parse every annotation file into one record, then build the DataFrame in a single step.
records = []
for xml_name in os.listdir(dir_anno):
    if xml_name.startswith('.'):
        continue  # hidden folders/files are not annotations
    record = extract_single_xml_file(ET.parse(os.path.join(dir_anno, xml_name)))
    record["fileID"] = xml_name.split(".")[0]  # file name up to the first dot
    records.append(record)
df_anno = pd.DataFrame(records)

# Largest number of objects found in a single annotation.
maxNobj = np.max(df_anno["Nobj"])

df_anno.to_csv("/content/VOCdevkit/VOC2007/df_anno.csv")

In [0]:
####### Select the ones you want and add extra column 'partition' ########

# Keep only the image ID and the first object's class name, ordered by image ID,
# then add an empty 'partition' column.
wanted_columns = ['fileID', 'bbx_0_name']
df_anno = df_anno.loc[:, wanted_columns].sort_values(by='fileID')
df_anno['partition'] = ''

In [0]:
####### Preparing training csv file ########

# Partition codes stored in the 'partition' column:
#   train = 0
#   valid = 1
#   test  = 2

classes = ('person', 'chair', 'car', 'dog', 'bottle', 'cat', 'bird', 'pottedplant', 'sheep', 'boat', 'aeroplane', 'tvmonitor', 'sofa', 'bicycle', 'horse'
,'motorbike', 'diningtable', 'cow', 'train', 'bus')

# Gather every image ID listed in any per-class train image-set file.
# Only the first whitespace-separated column (the image ID) is used; it is read as str so
# zero-padded IDs such as '000001' keep their leading zeros.
train_ids = set()
for class_name in classes:
    image_set = pd.read_csv(
        '/content/VOCdevkit/VOC2007/ImageSets/Main/{}_train.txt'.format(class_name),
        sep=r"\s+", dtype=str, header=None)
    train_ids.update(image_set[0])
    print(class_name + " --> Done!")

# A single vectorized membership test tags the rows; it does not depend on the number of
# rows in df_anno or on its index labels.
df_anno.loc[df_anno['fileID'].isin(train_ids), 'partition'] = 0

# The index is written as the first CSV column (default to_csv behaviour).
df_anno.loc[df_anno['partition'] == 0].to_csv('/content/VOCdevkit/VOC2007/pascalvoc-2007-train.csv')

person --> Done!
chair --> Done!
car --> Done!
dog --> Done!
bottle --> Done!
cat --> Done!
bird --> Done!
pottedplant --> Done!
sheep --> Done!
boat --> Done!
aeroplane --> Done!
tvmonitor --> Done!
sofa --> Done!
bicycle --> Done!
horse --> Done!
motorbike --> Done!
diningtable --> Done!
cow --> Done!
train --> Done!
bus --> Done!


In [0]:
####### Preparing validation csv file ########

# Partition codes stored in the 'partition' column:
#   train = 0
#   valid = 1
#   test  = 2

classes = ('person', 'chair', 'car', 'dog', 'bottle', 'cat', 'bird', 'pottedplant', 'sheep', 'boat', 'aeroplane', 'tvmonitor', 'sofa', 'bicycle', 'horse'
,'motorbike', 'diningtable', 'cow', 'train', 'bus')

# Gather every image ID listed in any per-class val image-set file.
# Only the first whitespace-separated column (the image ID) is used; it is read as str so
# zero-padded IDs such as '000001' keep their leading zeros.
val_ids = set()
for class_name in classes:
    image_set = pd.read_csv(
        '/content/VOCdevkit/VOC2007/ImageSets/Main/{}_val.txt'.format(class_name),
        sep=r"\s+", dtype=str, header=None)
    val_ids.update(image_set[0])
    print(class_name + " --> Done!")

# A single vectorized membership test tags the rows; it does not depend on the number of
# rows in df_anno or on its index labels.
df_anno.loc[df_anno['fileID'].isin(val_ids), 'partition'] = 1

# The index is written as the first CSV column (default to_csv behaviour).
df_anno.loc[df_anno['partition'] == 1].to_csv('/content/VOCdevkit/VOC2007/pascalvoc-2007-val.csv')

person --> Done!
chair --> Done!
car --> Done!
dog --> Done!
bottle --> Done!
cat --> Done!
bird --> Done!
pottedplant --> Done!
sheep --> Done!
boat --> Done!
aeroplane --> Done!
tvmonitor --> Done!
sofa --> Done!
bicycle --> Done!
horse --> Done!
motorbike --> Done!
diningtable --> Done!
cow --> Done!
train --> Done!
bus --> Done!


In [0]:
####### Preparing test csv file ########

# Partition codes stored in the 'partition' column:
#   train = 0
#   valid = 1
#   test  = 2

classes = ('person', 'chair', 'car', 'dog', 'bottle', 'cat', 'bird', 'pottedplant', 'sheep', 'boat', 'aeroplane', 'tvmonitor', 'sofa', 'bicycle', 'horse'
,'motorbike', 'diningtable', 'cow', 'train', 'bus')

# Gather every image ID listed in any per-class test image-set file.
# Only the first whitespace-separated column (the image ID) is used; it is read as str so
# zero-padded IDs such as '000001' keep their leading zeros.
test_ids = set()
for class_name in classes:
    image_set = pd.read_csv(
        '/content/VOCdevkit/VOC2007/ImageSets/Main/{}_test.txt'.format(class_name),
        sep=r"\s+", dtype=str, header=None)
    test_ids.update(image_set[0])
    print(class_name + " --> Done!")

# A single vectorized membership test tags the rows; it does not depend on the number of
# rows in df_anno or on its index labels.
df_anno.loc[df_anno['fileID'].isin(test_ids), 'partition'] = 2

# The index is written as the first CSV column (default to_csv behaviour); the cell that
# reloads this file drops columns by position and relies on that layout.
df_anno.loc[df_anno['partition'] == 2].to_csv('/content/VOCdevkit/VOC2007/pascalvoc-2007-test.csv')

person --> Done!
chair --> Done!
car --> Done!
dog --> Done!
bottle --> Done!
cat --> Done!
bird --> Done!
pottedplant --> Done!
sheep --> Done!
boat --> Done!
aeroplane --> Done!
tvmonitor --> Done!
sofa --> Done!
bicycle --> Done!
horse --> Done!
motorbike --> Done!
diningtable --> Done!
cow --> Done!
train --> Done!
bus --> Done!


In [44]:
######## final csv file for train, val and test ##########

# Positions of the columns to discard from the saved CSV (first and fourth column).
col = [0, 3]

# Read everything as text so the image IDs keep their leading zeros.
saved_split = pd.read_csv('/content/VOCdevkit/VOC2007/pascalvoc-2007-test.csv', dtype=str)
df = saved_split.drop(columns=saved_split.columns[col])
print(df)


      fileID   bbx_0_name
0     000001          dog
1     000002        train
2     000003         sofa
3     000004          car
4     000006  pottedplant
...      ...          ...
4947  009956          cat
4948  009957       person
4949  009960    motorbike
4950  009962        chair
4951  009963          car

[4952 rows x 2 columns]


In [45]:
classes = ('person', 'chair', 'car', 'dog', 'bottle', 'cat', 'bird', 'pottedplant', 'sheep', 'boat', 'aeroplane', 'tvmonitor', 'sofa', 'bicycle', 'horse'
,'motorbike', 'diningtable', 'cow', 'train', 'bus')

# Replace each class name by its position in `classes`; a value that is not one of the
# listed names is left unchanged.
class_to_index = {class_name: index for index, class_name in enumerate(classes)}
df['bbx_0_name'] = df['bbx_0_name'].map(lambda name: class_to_index.get(name, name))

# Append the image extension to every row in one column-wise operation. This covers the
# whole frame regardless of its length and avoids chained `df.iloc[i][...] = ...`
# assignment, which can write to a temporary copy instead of `df`.
df['fileID'] = df['fileID'] + '.jpg'

print("After conversion \n", df.head(5))

df.to_csv("/content/VOCdevkit/VOC2007/pascalvoc-2007-test-final.csv")


After conversion 
        fileID bbx_0_name
0  000001.jpg          3
1  000002.jpg         18
2  000003.jpg         12
3  000004.jpg          2
4  000006.jpg          7
