#Imports

In [1]:
from google.colab import drive
import shutil
import os
!pip install patool
import patoolib
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
!pip install split-folders
import splitfolders
import random

Collecting patool
  Downloading patool-1.12-py2.py3-none-any.whl (77 kB)
[?25l[K     |████▎                           | 10 kB 16.8 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 21.2 MB/s eta 0:00:01[K     |████████████▊                   | 30 kB 23.3 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 18.0 MB/s eta 0:00:01[K     |█████████████████████▏          | 51 kB 18.2 MB/s eta 0:00:01[K     |█████████████████████████▍      | 61 kB 15.5 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71 kB 14.8 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 3.2 MB/s 
[?25hInstalling collected packages: patool
Successfully installed patool-1.12
Collecting split-folders
  Downloading split_folders-0.4.3-py3-none-any.whl (7.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.4.3


In [2]:
class Data_Setup():
  def __init__(self, train=0.7, test=0.1, val=0.2, painters=["Mondriaan", "Rubens","Rembrandt","Picasso"], resize_mode="none", sampling_method="none"):
    self.selected_painters = painters
    self.g_drive_path, self.g_painters, self.g_painters_npaintings, self.g_n_paintings, g_shape = self.__initialise_global_variables()
    self.__mount()
    self.__fetch_data_from_mount()
    self.set_data_ratio(train, test, val)
    self.__enumerate_paintings()
    if resize_mode is "auto":
      self.auto_resize_images()
    if resize_mode is "none":
      print("\nNo resize done.\n")
    self.g_samples_sizes = self.__prepare_folders()
    if sampling_method is "undersampling":
      self.undersample_training_data()
    elif sampling_method is "oversampling":
      self.oversample_training_data()
    if sampling_method is "none":
      print("\nNo sampling method selected, samples might not be balanced.\n")

  def __initialise_global_variables(self):
    g_drive_path = "gdrive/MyDrive/Peter Vantomme - DEEP LEARNING PAINTINGS/"
    g_painters = ["Mondriaan","Rubens","Rembrandt","Picasso"]
    g_painters = self.selected_painters
    g_painters_npaintings = {}
    g_n_paintings = 0
    g_shape = []
    return (g_drive_path, g_painters, g_painters_npaintings, g_n_paintings, g_shape)

  def __mount(self):
    drive.mount("/content/gdrive")

  def __fetch_data_from_mount(self):
    try:
      os.mkdir("Input")  
      for painter in self.g_painters:
        shutil.copyfile(self.g_drive_path+painter+".rar", painter+".rar")
        patoolib.extract_archive(painter+".rar", outdir="Input")
        os.remove(painter+".rar")
      shutil.rmtree("sample_data")
    except FileExistsError:
      print("Input folder already exists, continuing...")

  
  def __enumerate_paintings(self):
    os.chdir("/content")
    enumerated = False
    if enumerated == False:
      os.chdir("Input")
      for painter in self.g_painters:
        counter = 0
        self.g_painters_npaintings[painter] = len([name for name in os.listdir(painter)])
        for filename in os.listdir(painter):
          os.chdir(painter)
          # checking if it is a file
          if os.path.isfile(filename):
            os.rename(filename,str(counter)+".jpg")
          os.chdir(os.pardir)
          counter += 1
        enumerated=True
      os.chdir(os.pardir)
      
    self.g_n_paintings =  np.sum(np.array(list(self.g_painters_npaintings.values())))

  #Maakt de directory klaar met traint, test en validatieset.
  def __prepare_folders(self):
    #Dataset directory aanmaken
    #Dataset -> [global_painters]
    if os.path.isdir("Datasets"):
      shutil.rmtree("Datasets")

    os.mkdir("Datasets")
    dict_painter_samplesizes = {}
    #Vullen van data folders
    os.chdir("Input")
    for painter in self.g_painters:
      os.chdir(painter)
      #Sample sizes
      total_paintings = len([name for name in os.listdir()])
      sample_size = {"Train":int(np.round(total_paintings/sum(self.g_data_ratio.values())*self.g_data_ratio["train"],0)),
                    "Test":int(np.round(total_paintings/sum(self.g_data_ratio.values())*self.g_data_ratio["test"],0))}
      sample_size["Val"] = total_paintings-sum(sample_size.values())
      print("---- "+painter+" ----")
      print(sample_size)
      dict_painter_samplesizes[painter] = sample_size
      os.chdir(os.pardir)
    os.chdir(os.pardir)
    #Afbeeldingen nemen volgens sizes
    splitfolders.ratio("Input", output="Datasets", seed=1337, ratio=(self.g_data_ratio["train"],self.g_data_ratio["test"],self.g_data_ratio["val"])) 
    return dict_painter_samplesizes

  #Deze helper classes zorgen ervoor dat de get_shape methode de get_shape_avg methode werkt.
  def __helper_calculate_sizes(self,painter):
    os.chdir("Input")
    os.chdir(painter)
    print(painter)
    dict_img_size = {}
    for filename in os.listdir():
      try:
        img = Image.open(filename)
        width, height = img.size
        dict_img_size[filename] = [width, height]
      except IsADirectoryError:
        print(filename+" is not an Image, removed...")
        shutil.rmtree(filename)
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    return dict_img_size

  def __helper_calculate_avg_img_size(self,dict_img_size):
    widths = []
    heights = []
    for entry in dict_img_size:
      widths.append(dict_img_size[entry][0])
      heights.append(dict_img_size[entry][1])
    avg_width = np.average(widths)
    avg_height = np.average(heights)
    return [avg_width, avg_height]

  #Public classes voor de user
  def calculate_shape_avg(self):
    width = int(np.sum(np.array([self.__helper_calculate_avg_img_size(self.__helper_calculate_sizes(p))[0] for p in self.g_painters])/len(self.g_painters)))
    height = int(np.sum(np.array([self.__helper_calculate_avg_img_size(self.__helper_calculate_sizes(p))[1] for p in self.g_painters])/len(self.g_painters)))
    self.g_shape = [width,height]
  
  def get_shape_avg(self):
    try:
      return self.g_shape
    except AttributeError:
      self.calculate_shape_avg()
      return self.g_shape

  def auto_resize_images(self):
    self.calculate_shape_avg()
    os.chdir("Input")
    for painter in self.g_painters:
      os.chdir(painter)
      for filename in os.listdir():
        #Resize
        try:
          img = Image.open(filename)
          img = img.resize((self.get_shape_avg()[0], self.get_shape_avg()[1]))
          img = img.convert("RGB")
          img.save(filename)
        except IsADirectoryError:
          print(filename+" is not an Image, skipped...")
      os.chdir(os.pardir)
    os.chdir(os.pardir)

  def undersample_training_data(self):
    trainingsize = np.min([self.g_samples_sizes[painter]["Train"] for painter in self.g_painters])

    os.chdir("Datasets")
    os.chdir("train")
    for painter in self.g_painters:
       while len(os.listdir(painter)) >= trainingsize:
        try:
          os.chdir(painter)
          os.remove(random.choice(os.listdir(os.getcwd())))
          os.chdir(os.pardir)
        except FileNotFoundError:
          continue
    os.chdir(os.pardir)
    os.chdir(os.pardir)

  def oversample_training_data(self):
    trainingsize = np.max([self.g_samples_sizes[painter]["Train"] for painter in self.g_painters])

    os.chdir("Datasets")
    os.chdir("train")
    for painter in self.g_painters:
      os.chdir(painter)
      counter = len(os.listdir(os.getcwd()))+1
      while len(os.listdir(os.getcwd())) < trainingsize:
        try:
          shutil.copyfile(random.choice(os.listdir(os.getcwd())),os.getcwd()+"/"+str(counter))
          counter+=1
        except FileNotFoundError:
          continue
      os.chdir(os.pardir)
    os.chdir(os.pardir)
    os.chdir(os.pardir)

  def set_drive_path(self, path):
    self.g_drive_path = path

  def set_data_ratio(self, train=0.7, test=0.1, val=0.2):
    self.g_data_ratio = {"train":train,"test":test,"val":val} #Ratio van de data
  
  def add_painter(self, name):
    self.g_painters.append(name)

  def remove_painter(self, name):
    self.g_painers.remove(name)

  def get_data(self,artist,set_type):
    os.chdir("Datasets")
    try:
      os.chdir(set_type)
    except FileNotFoundError:
      print("Input parameters wrong. Make sure set_type is either train, val or test.")
      os.chdir(os.pardir)
    try:
      os.chdir(artist)
    except FileNotFoundError:
      print("Input parameters wrong. Make sure the artist's name is spelled correctly.")
      os.chdir(os.pardir)
      os.chdir(os.pardir)

    return list(os.listdir())
  
  def get_painters(self):
    return self.g_painters
  
  def get_drive_path(self):
    return self.g_drive_path
  
  def get_amount_of_painting_per_painter(self):
    return self.g_painters_npaintings

  def get_amount_of_paintings(self):
    return self.g_n_paintings

  def get_sample_sizes(self):
    return self.g_sample_sizes

In [3]:
#ds = Data_Setup(painters=["Picasso","Mondriaan"], resize_mode="none")

Mounted at /content/gdrive
patool: Extracting Picasso.rar ...
patool: running /usr/bin/unrar x -- /content/Picasso.rar
patool:     with cwd='Input'
patool: ... Picasso.rar extracted to `Input'.
patool: Extracting Mondriaan.rar ...
patool: running /usr/bin/unrar x -- /content/Mondriaan.rar
patool:     with cwd='Input'
patool: ... Mondriaan.rar extracted to `Input'.

No resize done.

---- Picasso ----
{'Train': 1070, 'Test': 153, 'Val': 306}
---- Mondriaan ----
{'Train': 231, 'Test': 33, 'Val': 66}


Copying files: 1859 files [00:00, 1881.77 files/s]


No sampling method selected, samples might not be balanced.






In [4]:
#ds.oversample_training_data()

In [5]:
#os.chdir("Datasets/train/Mondriaan")
#print(len(os.listdir(".")))
#os.chdir("/content")
#os.chdir("Datasets/train/Picasso")
#print(len(os.listdir(".")))
#os.chdir("/content")

1070
1070
