Image Preprocessing

In [None]:
import cv2
import imghdr
import matplotlib.pyplot as plt

In [None]:
data_dir = 'data/'

In [None]:
def split_dir_to_train_test_val(directory = data_dir,
                            train_size = 0.7,
                            test_size = 0.2,
                            val_size = 0.1):
  """
  Creates 3 folders for Train, Test and Validation data
  """
  import os
  import random
  import shutil

  # Set random seed
  rng = random.Random(42)

  for root, folders, files in os.walk(directory):
    for folder in folders:
      # Create list of the files
      list_of_files = []
      for file_name in os.listdir(root+folder+"/"):
        list_of_files.append(file_name)

      #  Shuffle the list
      rng.shuffle(list_of_files)

      # Create lists of files
      train_files = list_of_files[:int(len(list_of_files)*train_size)]
      test_files = list_of_files[int(len(list_of_files)*train_size) : int(len(list_of_files)*(train_size+test_size))]
      val_files = list_of_files[int(len(list_of_files)*(train_size+test_size)):]

      # Create folders and files for train data
      for one_file in train_files:

        # Copy  files
        dest_dir = "files/train/"+folder+"/"
        os.makedirs(dest_dir, exist_ok=True)

        shutil.copy2(src=(root+folder+"/"+one_file),
                    dst=(dest_dir+one_file))
      print(f"Folder {folder}. Train data copied. {len(train_files)} files")

      # Create folders and files for test data
      for one_file in test_files:
        # Copy  files
        dest_dir = "files/test/"+folder+"/"
        os.makedirs(dest_dir, exist_ok=True)

        shutil.copy2(src=(root+folder+"/"+one_file),
                    dst=(dest_dir+one_file))
      print(f"Folder {folder}. Test data copied. {len(test_files)} files")

      # Create folders and files for validation data
      for one_file in val_files:

        # Copy  files
        dest_dir = "files/validation/"+folder+"/"
        os.makedirs(dest_dir, exist_ok=True)

        shutil.copy2(src=(root+folder+"/"+one_file),
                    dst=(dest_dir+one_file))
      print(f"Folder {folder}. Validation data copied. {len(val_files)} files")




def get_class_names_from_folder(directory):
  """
  Get the classnames from train folder for example
  """
  import pathlib
  import numpy as np
  data_dir = pathlib.Path(directory)
  class_names = np.array(sorted([item.name for item in data_dir.glob("*")])) # Created a list of class names
  return class_names
  print(class_names)

In [16]:
split_dir_to_train_test_val(directory=data_dir,
                            train_size=0.7,
                            test_size=0.2,
                            val_size=0.1)

Folder 707-320. Train data copied. 70 files
Folder 707-320. Test data copied. 19 files
Folder 707-320. Validation data copied. 11 files
Folder 727-200. Train data copied. 70 files
Folder 727-200. Test data copied. 19 files
Folder 727-200. Validation data copied. 11 files
Folder 737-200. Train data copied. 70 files
Folder 737-200. Test data copied. 19 files
Folder 737-200. Validation data copied. 11 files
Folder 737-300. Train data copied. 70 files
Folder 737-300. Test data copied. 19 files
Folder 737-300. Validation data copied. 11 files
Folder 737-400. Train data copied. 70 files
Folder 737-400. Test data copied. 19 files
Folder 737-400. Validation data copied. 11 files
Folder 737-500. Train data copied. 70 files
Folder 737-500. Test data copied. 19 files
Folder 737-500. Validation data copied. 11 files
Folder 737-600. Train data copied. 70 files
Folder 737-600. Test data copied. 19 files
Folder 737-600. Validation data copied. 11 files
Folder 737-700. Train data copied. 70 files
Fold

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'files/train/A319/1879116.jpg'

In [None]:
class_names = get_class_names_from_folder(directory="files/train/")
class_names

In [None]:
len(class_names)

Data Parsing

In [None]:
# Making CSV in format image_name, class, label where label is the index of the class
import pandas as pd
import os

def make_csv_from_folder(directory, class_names, name):
    data_dir = os.path.join(directory)
    data = []
    for root, folders, files in os.walk(data_dir):
        for folder in folders:
            for file in os.listdir(root+folder+"/"):
                data.append([file, folder, class_names.tolist().index(folder)])
    df = pd.DataFrame(data, columns=["image_name", "class", "label"])
    df.to_csv("{directory}{folder_name}_data.csv".format(directory="files/", folder_name=name), index=False)
    print("CSV created at {directory}{folder_name}_data.csv".format(directory="files/", folder_name=name))

make_csv_from_folder(directory="files/train/", class_names=class_names, name="train")
make_csv_from_folder(directory="files/test/", class_names=class_names, name="test")
make_csv_from_folder(directory="files/validation/", class_names=class_names, name="validation")

In [None]:
def zip_files(directory):
  """
  Zips the files
  """
  import shutil
  shutil.make_archive(directory, 'zip', directory)

zip_files("files")

In [None]:
# Delete files directory
import shutil
shutil.rmtree("files")