In [1]:
# Imports
import os
import sys
import pandas as pd
import shutil
from pathlib import Path

# OPTIONS:
# Name of the dataset folder. The cells below join it with "labels/",
# "images/" and "classes.txt" to locate the source data.
folder_name="project"

In [2]:
# Decide whether a "None" class is needed: no_none stays 1 when every file in
# the labels folder is at least 5 bytes long and becomes 0 as soon as one file
# is smaller than that.
label_dir = os.path.join(folder_name,"labels/")
directory = os.listdir(label_dir)
label_sizes = [Path(os.path.join(label_dir,entry)).stat().st_size for entry in directory]
no_none = 0 if any(size < 5 for size in label_sizes) else 1

In [3]:
# File extensions (lower case, with the leading dot) that checkdata() treats
# as images and as label files.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tif", ".tiff")
_LABEL_EXTENSIONS = (".txt", ".csv")


def _list_by_extension(folder, extensions):
    """Return the names in `folder` whose lower-cased extension is in `extensions`."""
    return [
        name for name in os.listdir(folder)
        if os.path.splitext(name)[1].lower() in extensions
    ]


def _stem(filename):
    """Return `filename` without its final extension."""
    return os.path.splitext(filename)[0]


def checkdata(columns,classes_dir,*args):
    """Clean an image/label dataset and count the class instances in it.

    Steps:
      1. Delete every image that has no label file with the same stem.
      2. Delete every ``.txt`` label that has no image with the same stem
         (``.csv`` labels are never deleted, as before).
      3. Read the remaining labels and, for every label row, append the
         label's stem to the list of the row's class. A file with several
         rows of one class is therefore listed several times for that class.
      4. Print ``name:count`` for each class.

    Args:
        columns: Column names for the label files; must contain ``"Class"``,
            whose values are the zero-based line numbers of ``classes_dir``.
        classes_dir: Path of the text file with one class name per line.
        *args: Either one data directory (containing ``images/`` and
            ``labels/``) or an image directory followed by a label directory.

    Returns:
        dict mapping each class name to the list of file stems described
        above. A ``"None"`` entry collects label files without rows; it is
        present when the notebook-level flag ``no_none`` is 0 or when such a
        file is encountered.

    Raises:
        SystemExit: if ``args`` does not hold one or two directories.
    """
    # Managing Args:
    if len(args)==1:
        # Data directory is given to function:
        img_dir = os.path.join(args[0],"images/")
        label_dir = os.path.join(args[0],"labels/")
    elif len(args)==2:
        # Image dir and Label dir is given to function:
        img_dir, label_dir = args
    else:
        sys.exit("Unsupported number of args passed to checkdata(),")

    # Setting up classes:
    classkey={}
    classes={}
    with open(classes_dir) as class_file:
        for class_id, line in enumerate(class_file):
            # Strip only the line ending, so that a last line without a
            # trailing newline keeps its final character.
            name = line.rstrip("\r\n")
            classkey[name]=[]
            classes[class_id]=name
    # `no_none` is set by an earlier notebook cell; treat it as 1 when it is
    # not defined so the function also works on its own.
    if globals().get("no_none", 1) == 0:
        classkey["None"]=[]
        classes[len(classes)]="None"

    # Setting up list of Images and Labels:
    image_names = _list_by_extension(img_dir, _IMAGE_EXTENSIONS)
    label_names = _list_by_extension(label_dir, _LABEL_EXTENSIONS)

    # Removing images without labels (the real file name is removed, whatever
    # its extension):
    labelled_stems = {_stem(name) for name in label_names}
    count=0
    for name in image_names:
        if _stem(name) not in labelled_stems:
            os.remove(os.path.join(img_dir, name))
            count+=1
    print("{} images deleted".format(count))

    # Removing extra labels (a label is extra only when no image of any
    # supported extension shares its stem):
    image_stems = {_stem(name) for name in image_names}
    count=0
    for name in label_names:
        is_txt = os.path.splitext(name)[1].lower() == ".txt"
        if is_txt and _stem(name) not in image_stems:
            os.remove(os.path.join(label_dir, name))
            count+=1
    print("{} labels deleted".format(count))

    # Setting up list of Labels again, now that extra ones are gone:
    label_names = _list_by_extension(label_dir, _LABEL_EXTENSIONS)

    # Read the labels and count the classes:
    for name in label_names:
        label_loc=os.path.join(label_dir,name)
        if name.lower().endswith(".csv"):
            df = pd.read_csv(label_loc, header = 0, delimiter=",", names=columns)
        else:
            df = pd.read_csv(label_loc, header = None, delimiter=" ", names=columns)
        df['Class'] = df['Class'].map(classes)

        stem = _stem(name)
        if len(df)==0:
            # setdefault: the "None" bucket may not have been created above.
            classkey.setdefault("None", []).append(stem)
        for class_name in df['Class']:
            if pd.isna(class_name):
                # The class id of this row is not a line of the classes file.
                print("Warning: {} has a class id that is not in {}".format(name, classes_dir))
                continue
            # Exact match: every mapped name is a key of classkey.
            classkey[class_name].append(stem)

    for key,value in classkey.items():
        print("{}:{}".format(key,len(value)))
    return classkey

In [4]:
def splitdata(classname,data,train_percent=60,print_result=False):
    """Cut `data` into consecutive train, test and validation slices.

    Args:
        classname: Name used in the printed messages.
        data: Sequence to split; it is sliced in order, never shuffled.
        train_percent: Percentage of `data` for the train slice. Half of the
            remaining percentage (floor division) goes to the test slice and
            whatever is left over becomes the validation slice.
        print_result: When equal to True, the resulting slices are printed.

    Returns:
        ``[train, test, val]`` as a list of three slices of `data`, or None
        (after printing a message) when `data` has fewer than 8 items.

    Raises:
        SystemExit: if the minimum sizes (2 train, 1 test, 2 val) make the
            three slices add up to more than ``len(data)``.
    """
    total = len(data)

    # Check if data exists:
    if total < 8:
        print("{} has less than 8 samples".format(classname))
        return None

    # Calculating the split size, each with its minimum:
    train_split = max(int((train_percent/100)*total), 2)
    test_percent = (100-train_percent)//2
    test_split = max(int((test_percent/100)*total), 1)
    val_split = max(total-test_split-train_split, 2)

    # The sizes are reported in test, train, val order.
    print (("{} : Size of each split: ".format(classname)) + str([test_split, train_split, val_split]))

    # Executing the list split:
    if test_split+train_split+val_split > total:
        sys.exit("Warning, Please use smaller test percent value!")

    train_end = train_split
    test_end = train_end+test_split
    val_end = test_end+val_split
    split = [data[:train_end], data[train_end:test_end], data[test_end:val_end]]

    # Print result
    if print_result==True:
        print ("The splitted lists are : " +  str(split))
    return split

## Run up to this cell to just check class counts and clean the data:

This cell cleans up extra data in the dataset. All you have to do is give it a directory for the images (`img_dir`) and another directory for the labels (`label_dir`). It will delete images that have no label file and label files that have no image.

At the same time it'll count the total instances of each class in the data, as long as `classes_dir` points to `classes.txt`.

In [5]:
# Setup Source Directories:
source_dir = os.path.join(os.getcwd(),folder_name)
img_dir = os.path.join(source_dir,"images/")
label_dir = os.path.join(source_dir,"labels/")
classes_dir= os.path.join(source_dir,"classes.txt")

# Setting Destination directories:
data_dir=os.path.join(os.getcwd(),"data")
train_dir=os.path.join(data_dir,"train/")
test_dir=os.path.join(data_dir,"test/")
val_dir=os.path.join(data_dir,"val/")

# Resetting the data folder from its backup ("data_copy").
# The backup is checked BEFORE anything is deleted, so a missing backup can
# no longer destroy the data folder.
data_backup="{}_copy".format(data_dir)
if os.path.isdir(data_backup):
    # Drop whatever is currently in the data folder and restore the backup;
    # the backup itself stays in place for the next run.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    shutil.copytree(data_backup,data_dir)
elif os.path.isdir(data_dir):
    # No backup yet: keep the data folder and snapshot it as the backup.
    shutil.copytree(data_dir,data_backup)
else:
    raise FileNotFoundError(
        "Neither {} nor {} exists; create the data folder first.".format(data_dir,data_backup))

if not os.path.exists("{}_copy".format(source_dir)):
    # Copying source folder, so it can be restored after files are moved out:
    shutil.copytree(source_dir,"{}_copy".format(source_dir))

split_dir=[train_dir,test_dir,val_dir]
columns=["Class","x","y","w","h"]

# Clean the source data and collect, per class, the files that contain it:
print(classes_dir)
classkey=checkdata(columns,classes_dir,img_dir,label_dir)

/home/joel/JupyterProjects/Multilabel-Stratified-Split/project/classes.txt
0 images deleted
0 labels deleted
LOB-Acord:127
Contact Type-Acord:50
Contact Name-Acord:49
Contact Phone No-Acord:76
Group Code-Acord:16
Year Founded-Acord:29
Year-PCI:131
Premium-PCI:462
Application Name-Email:26
Effective Date-Email:16
Agency Code-Acord:31
Quote Date-Acord:32
Effective Date-Acord:31
Expiration Date-Acord:31
Applicant Name-Acord:31
Applicant Address-Acord:31
SIC-Acord:31
NAICS-Acord:31
FEIN-Acord:31
Premium-Acord:30
UW/Performer-Email:0
LOB-Email:0
NAICS-Email:0
SIC-Email:0
Group Code-Email:0
Agency Code-Email:0
Premium-LOB:40
Billing Type-Acord:17
VIN:56
Premium-Email:14
Ownership Structure-Acord:19
Quote Date-Cover:0
Market Segment-Cover:0
Effective Date-Cover:15
Applicant Name-Cover:16
Market Segment-Email:0
Group Code-Cover:0
Agency Code-Cover:0
UW/Performer-Cover:0
Contact Type-LOB:0
Contact Name-LOB:0
Contact Phone No-LOB:0
Expiry Date-Email:0
LOB-Cover:0
Applicant Address-Cover:13
Expir

## Run this cell as well, if you want to split the data:

In [6]:
# Class names ordered from the fewest collected files to the most.
ordered_classes = sorted(classkey.keys(), key=lambda name: len(classkey[name]))
    
def movefiles(classname,filename,split_dir):
    """Move one image/label pair into a split folder and forget the file.

    The pair is read from the notebook-level `img_dir` and `label_dir` and
    moved to ``split_dir/images/`` and ``split_dir/labels/``. Afterwards
    every occurrence of `filename` is removed from every list in `classkey`,
    so a file listed several times is not scheduled again for a later class.

    Args:
        classname: Class being processed (not used by the function).
        filename: File stem shared by the image and its ``.txt`` label.
        split_dir: Destination split folder.

    Raises:
        FileNotFoundError: if the image or the label is missing. Both are
            checked before anything is moved, so a pair is never half-moved.
    """
    # Use the image's real extension; ".jpg" is tried first and is also the
    # fallback when no image exists.
    image=filename+".jpg"
    for extension in (".jpg",".jpeg",".png",".tif",".tiff"):
        if os.path.exists(os.path.join(img_dir,filename+extension)):
            image=filename+extension
            break
    label=filename+".txt"

    img_src_loc=os.path.join(img_dir,image)
    label_src_loc=os.path.join(label_dir,label)
    for src_loc in (img_src_loc,label_src_loc):
        if not os.path.exists(src_loc):
            raise FileNotFoundError("{} does not exist".format(src_loc))

    shutil.move(img_src_loc, os.path.join(split_dir,"images/",image))
    shutil.move(label_src_loc, os.path.join(split_dir,"labels/",label))

    # Remove ALL occurrences, in place so existing references stay valid.
    for key in ordered_classes:
        remaining=classkey[key]
        remaining[:]=[name for name in remaining if name!=filename]

# Split every class (fewest files first) and move its files into the
# train/test/val folders; 65 is the train percentage.
for classname in ordered_classes:
    splitted_class=splitdata(classname,classkey[classname],65)
    if splitted_class is None:
        # splitdata() already printed why this class is skipped. Checking
        # for None explicitly keeps real TypeErrors visible.
        continue
    # splitdata() returns the slices in the same order as split_dir.
    for destination,filenames in zip(split_dir,splitted_class):
        for filename in filenames:
            try:
                movefiles(classname,filename,destination)
            except FileNotFoundError:
                # The file is listed more than once in classkey and has
                # already been moved; skip the repeat.
                continue

# Re-check each split folder and print its class counts:
for directory in split_dir:
    # basename(normpath(...)) gives the folder name with or without a
    # trailing separator.
    print("\n{}:".format(os.path.basename(os.path.normpath(directory)).upper()))
    checkdata(columns,classes_dir,directory)

UW/Performer-Email has less than 8 samples
LOB-Email has less than 8 samples
NAICS-Email has less than 8 samples
SIC-Email has less than 8 samples
Group Code-Email has less than 8 samples
Agency Code-Email has less than 8 samples
Quote Date-Cover has less than 8 samples
Market Segment-Cover has less than 8 samples
Market Segment-Email has less than 8 samples
Group Code-Cover has less than 8 samples
Agency Code-Cover has less than 8 samples
UW/Performer-Cover has less than 8 samples
Contact Type-LOB has less than 8 samples
Contact Name-LOB has less than 8 samples
Contact Phone No-LOB has less than 8 samples
Expiry Date-Email has less than 8 samples
LOB-Cover has less than 8 samples
Expiration Date-Cover has less than 8 samples
SIC-Cover has less than 8 samples
NAICS-Cover has less than 8 samples
Applicant Address-Cover : Size of each split: [2, 8, 3]
Premium-Email : Size of each split: [2, 9, 3]
Effective Date-Cover has less than 8 samples
Group Code-Acord : Size of each split: [2, 10, 

In [7]:
# Restoring the source folder from its backup ("<source>_copy").
# The backup is verified first: deleting the source folder and then failing
# on the rename would lose the dataset.
source_backup="{}_copy".format(source_dir)
if not os.path.isdir(source_backup):
    raise FileNotFoundError(
        "{} does not exist; the source folder was left untouched.".format(source_backup))
if os.path.exists(source_dir):
    shutil.rmtree(source_dir)
# The backup is consumed by the rename, so it no longer exists afterwards.
os.rename(source_backup,source_dir)