In [39]:
import random
import os
import requests
from IPython.display import clear_output
import glob
import shutil

def getLinks(filepath,datasetname=None):
    """Return the links listed in ``filepath`` that still need downloading.

    The file is read as one link per line; blank lines are ignored.  A link
    counts as already downloaded when the value of its ``dl`` query parameter
    equals the file name of any ``.jpg`` found (recursively) under
    ``datasetname``.

    Args:
        filepath: Path of a text file containing one download link per line.
        datasetname: Root folder to search for already-downloaded images.
            When ``None`` (the default) no filtering is done and every
            non-blank line of the file is returned.

    Returns:
        list[str]: The remaining links, in file order.

    Raises:
        KeyError: If a link has no ``dl`` query parameter.
    """
    with open(filepath) as file:
        # Blank lines (e.g. a stray empty line in the file) cannot be parsed
        # as a link, so drop them here instead of crashing further down.
        links = [line for line in file.read().splitlines() if line.strip()]
    if datasetname is None:
        return links
    downloadedpaths = glob.glob(datasetname+'/**/*.jpg', recursive=True)
    # os.path.basename honours the platform's path separator, and a set makes
    # every membership test constant-time.
    downloadedFilenames = {os.path.basename(path) for path in downloadedpaths}
    return [link for link in links
            if getlinkparams(link)['dl'] not in downloadedFilenames]

def splitLinks(links,split,shuffle = False):
    """Split ``links`` into a training part and a validation part.

    The first ``int(len(links) * split)`` items form the training list and
    everything after that point forms the validation list, so the two parts
    never overlap and no item is lost.  (Computing the validation size
    separately is fragile: ``10 * (1.0 - 0.8)`` truncates to 1, and a size of
    0 turns ``links[-0:]`` into the whole list.)

    Args:
        links: Sequence of items to split.  It is not modified.
        split: Fraction of the items that goes to the training list; values
            outside ``0..1`` are clamped.
        shuffle: When true, the items are randomly reordered before the cut.

    Returns:
        tuple[list, list]: ``(train, validation)``.
    """
    # Work on a copy so shuffling does not reorder the caller's list.
    links = list(links)
    if shuffle:
        random.shuffle(links)
    cut = int(len(links)*split)
    cut = max(0, min(len(links), cut))
    return links[:cut], links[cut:]

def getlinkparams(link):
    """Parse the query string of ``link`` into a dict of raw parameter values.

    Everything after the first ``?`` is split on ``&``; each piece is split
    on its first ``=`` into key and value.  Values are returned exactly as
    written in the link (no percent-decoding is applied).

    Args:
        link: URL string, e.g. ``"https://host/img.jpg?dl=name.jpg&fm=jpg"``.

    Returns:
        dict[str, str]: Parameter name -> value.  Empty when the link has no
        query string.  A parameter written without ``=`` maps to ``""``; if a
        name occurs more than once the last occurrence wins.
    """
    # partition never raises: a link without '?' simply yields an empty query.
    _, _, query = link.partition('?')
    params = {}
    for item in query.split('&'):
        if not item:
            # Produced by an empty query or by a leading/trailing/double '&'.
            continue
        # Split on the first '=' only so values containing '=' stay intact.
        key, _, value = item.partition('=')
        params[key] = value
    return params

def removeDuplicates(filepath):
    """Rewrite ``filepath`` so that every line appears only once.

    The first occurrence of each line is kept and the original line order is
    preserved, so repeated runs produce the same file.  The number of lines
    before and after is printed.

    Args:
        filepath: Path of the text file to clean; it is overwritten in place.
    """
    with open(filepath) as file:
        links = file.read().splitlines()
    print("Original:",len(links))
    # dict.fromkeys de-duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings varies between runs.
    newlist = list(dict.fromkeys(links))
    with open(filepath, 'w') as file:
        file.writelines([str(line) + "\n" for line in newlist])
    print("New:",len(newlist))
    
def downloadLinks(links,folderpath):
    """Download every link into ``folderpath`` and report the failures.

    The target file name is the ``dl`` query parameter of the link.  Files
    that already exist in ``folderpath`` are skipped.  A download counts as
    failed when the request raises a ``requests`` exception or the response
    status is not 200; processing then continues with the next link.

    Args:
        links: List of URLs to fetch.
        folderpath: Existing folder that receives the downloaded files.

    Returns:
        list[str]: The URLs that could not be downloaded (empty on success).
    """
    ln = len(links)
    failedDownloads = []
    for i,url in enumerate(links):
        # Keep the cell output to a single, continuously refreshed status line.
        clear_output(wait=True)
        filename = getlinkparams(url)['dl']
        path = folderpath+'/'+filename
        progress = '[ '+str(int( ((i+1)/ln)*100 ) )+'% ] '
        if os.path.exists(path):
            print(progress+'skipping '+ filename)
            continue
        try:
            # The timeout keeps one unresponsive server from hanging the loop.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            response = None
        if response is not None and response.status_code == 200:
            print(progress+ filename + ' Downloaded')
            # Save the file to a specific folder
            with open(path, 'wb') as file:
                file.write(response.content)
        else:
            print(filename + ' Download failed')
            failedDownloads.append(url)
    if failedDownloads:
        # Per-link messages are wiped by clear_output, so list failures once
        # more at the end where they stay visible.
        print(len(failedDownloads), "downloads failed:")
        for url in failedDownloads:
            print(url)
    return failedDownloads
            
def buildDataset(dataset_name,train_links,validation_links,classes):
    """Create the dataset folder layout, write the class list and fetch images.

    Under ``dataset_name`` the folders ``train`` and ``validation`` are
    created, each with an ``images`` and a ``labels`` subfolder (existing
    folders are left untouched).  ``classes`` is written one entry per line
    to ``classes.txt`` in the dataset root.  Finally the training and the
    validation links are downloaded into the respective ``images`` folders.

    Args:
        dataset_name: Root folder of the dataset.
        train_links: Links to download into ``train/images``.
        validation_links: Links to download into ``validation/images``.
        classes: Iterable of class names for ``classes.txt``.
    """
    split_roots = {
        "train": dataset_name + "/train",
        "validation": dataset_name + "/validation",
    }
    for subfolder in ("/images", "/labels"):
        for root in split_roots.values():
            os.makedirs(root + subfolder, exist_ok=True)

    with open(dataset_name + "/classes.txt", 'w') as classes_file:
        classes_file.write("".join(f"{name}\n" for name in classes))

    downloadLinks(train_links, split_roots["train"] + "/images")
    downloadLinks(validation_links, split_roots["validation"] + "/images")

def fixDataset(datasetname):
    """Repair a dataset folder: drop duplicate images and re-home labels.

    Steps:
      1. Every ``.jpg`` below ``datasetname`` whose file name was already
         seen (in glob order) is deleted; the first occurrence is kept.
      2. Each remaining image is paired with a ``.txt`` file of the same base
         name.  If several exist, one under the same top-level folder as the
         image is preferred.  Images without such a file are printed.
      3. A label whose top-level folder below ``datasetname`` differs from
         its image's is moved under the image's top-level folder, keeping the
         rest of its relative path.

    ``.txt`` files located directly in ``datasetname`` (such as
    ``classes.txt``) are not treated as labels.

    Args:
        datasetname: Root folder of the dataset; may be relative, absolute
            or nested.

    Returns:
        None.  Progress and findings are printed.
    """
    labelpaths = glob.glob(datasetname+'/**/*.txt', recursive=True)
    imagepaths = glob.glob(datasetname+'/**/*.jpg', recursive=True)

    def relparts(path):
        # Path components relative to the dataset root; [0] is the top-level
        # folder (e.g. "train") regardless of how datasetname itself is spelled.
        return os.path.relpath(path, datasetname).split(os.sep)

    def stem(path):
        # File name without directory and extension.
        return os.path.splitext(os.path.basename(path))[0]

    # Index labels by base name once instead of rescanning the list per image.
    labels_by_stem = {}
    for labelpath in labelpaths:
        if len(relparts(labelpath)) < 2:
            continue  # file in the dataset root, not a per-image label
        labels_by_stem.setdefault(stem(labelpath), []).append(labelpath)

    seen_names = set()
    unique_images = []
    duplicates = []
    for imagepath in imagepaths:
        name = os.path.basename(imagepath)
        if name in seen_names:
            duplicates.append(imagepath)
            print("found duplicate: ",imagepath)
        else:
            seen_names.add(name)
            unique_images.append(imagepath)

    if len(duplicates) > 0:
        print("found ",len(duplicates)," duplicates")
        print("fixing")

        for duplicate in duplicates:
            os.remove(duplicate)

    # Only surviving images are paired below; pairing a deleted duplicate
    # could drag its label into the folder of a file that no longer exists.
    pairs = []
    withoutlabel = []
    for imagepath in unique_images:
        candidates = labels_by_stem.get(stem(imagepath))
        if not candidates:
            withoutlabel.append(imagepath)
            continue
        image_top = relparts(imagepath)[0]
        labelpath = next(
            (candidate for candidate in candidates
             if relparts(candidate)[0] == image_top),
            candidates[0],
        )
        pairs.append((imagepath,labelpath))

    print(len(pairs), " pairs found for ",len(unique_images)," images")
    print(len(withoutlabel), " images without label")
    for imagepath in withoutlabel:
        print(imagepath)

    # Images lying directly in the dataset root have no top-level folder to
    # move a label into, so they are never reported as mismatched.
    wrong_directory = [
        pair for pair in pairs
        if len(relparts(pair[0])) > 1
        and relparts(pair[0])[0] != relparts(pair[1])[0]
    ]
    if len(wrong_directory) > 0:
        print(len(wrong_directory), "labels in the wrong directory")
        print("fixing")
        for imagepath, labelpath in wrong_directory:
            target = os.path.join(
                datasetname, relparts(imagepath)[0], *relparts(labelpath)[1:]
            )
            # The destination folder may not exist yet.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            os.rename(labelpath, target)
            
def shuffleFile(filepath):
    """Randomly reorder the lines of ``filepath`` and write them back.

    The file is read directly rather than through ``getLinks``: that function
    needs a dataset folder and drops links that were already downloaded,
    which would remove them from the file.  Blank lines are discarded.

    Args:
        filepath: Path of a text file with one link per line; it is
            overwritten in place.
    """
    with open(filepath) as file:
        links = [line for line in file.read().splitlines() if line.strip()]
    random.shuffle(links)
    with open(filepath, 'w') as file:
        file.writelines([str(line) + "\n" for line in links])


In [187]:
# Randomly reorder the lines of the license-plate link list; the file is rewritten in place.
shuffleFile("licenseplates_dataset_links.txt")

In [188]:
# Randomly reorder the lines of the faces link list; the file is rewritten in place.
shuffleFile("faces_dataset_links.txt")

In [26]:
# Drop repeated lines from the license-plate link list (file is rewritten); prints the line count before and after.
removeDuplicates("licenseplates_dataset_links.txt")

Original: 205
New: 202


In [5]:
# Drop repeated lines from the faces link list (file is rewritten); prints the line count before and after.
removeDuplicates("faces_dataset_links.txt")

Original: 152
New: 152


In [53]:
# Faces dataset: take the links whose image is not yet in faces_dataset/,
# split them 80/20 at random into train/validation, then create the folder
# layout, write classes.txt and download the images.
Faces_Dataset_links = getLinks("faces_dataset_links.txt","faces_dataset")
Faces_Train,Faces_Validation = splitLinks(Faces_Dataset_links,0.8,shuffle=True)
buildDataset("faces_dataset",Faces_Train,Faces_Validation,['face'])

[ 100% ] pexels-roneferreira-2735037.jpgDownloaded


In [29]:
# License-plate dataset: take the links whose image is not yet in
# licenseplates_dataset/, split them 80/20 at random into train/validation,
# then create the folder layout, write classes.txt and download the images.
Licenseplates_links = getLinks("licenseplates_dataset_links.txt","licenseplates_dataset")
licenseplates_Train,licenseplates_Validation = splitLinks(Licenseplates_links,0.8,shuffle=True)
buildDataset("licenseplates_dataset",licenseplates_Train,licenseplates_Validation,['license plate'])

[ 100% ] downloading pexels-jibarofoto-2038781.jpg


In [37]:
# Clean up the license-plate dataset: delete duplicate images, report images
# without a label file, and move labels that sit under a different split
# folder than their image.
fixDataset("licenseplates_dataset")

201  pairs found for  201  images
0  images without label


In [42]:
# Clean up the faces dataset: delete duplicate images, report images without
# a label file, and move labels that sit under a different split folder than
# their image.
fixDataset("faces_dataset")

207  pairs found for  207  images
0  images without label
