# Dataset preparation

Once the images have been processed, the hard part begins: viewing and classifying them.
Images must be sorted manually into two folders: `accident` and `no_accident`.

When images are allocated in the right folder, it is time to run the next module '2_Dataset_preparation' to:

Split the labelled images into two groups, one for training and one for validation, each containing both accident and no-accident images. The module splits them randomly and can augment the accident images (which are fewer than the no-accident ones).

The split is 80% of the labelled image set for train and 20% for valid.





<a id="env_setup"> </a>
## 1. Environment setup and library import

In [0]:
import random                           # Generates random numbers
import os                               # Create directories, list files
from shutil import copyfile             # Copy files from Source to Destination
import cv2                              # To flip images in data augmentation
import matplotlib.pyplot as plt         # To save the images
import matplotlib.image as mpimg

<a id="folders_setup"> </a>
## 2. Folder checking and creation

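Checks whether the folder layout below exists and creates any missing folder. The contents of the four `accident` / `no_accident` folders are then deleted (`tmp` is created but not emptied).
    - train -- accident
            -- no_accident
    - valid -- accident
            -- no_accident
    - tmp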



In [0]:
# Root of the generated dataset (un-escaped path, for Python file APIs).
DATASET_DIR = "GDrive/My Drive/CarCrashDetection/Dataset"


def ensure_dir(path, label):
  """Create directory `path` if it is missing and print what happened.

  `label` is only used in the printed message. Missing parent directories
  are created too, so the cell also works when `Dataset` does not exist yet.
  """
  if os.path.isdir(path):
    print(f"Directory {label} already exist")
  else:
    os.makedirs(path)
    print(f"Directory {label} created")


def clear_dir(path):
  """Delete the visible files directly inside `path`.

  Mirrors `rm path/*`: hidden entries (leading '.') and sub-directories are
  left untouched. An already empty folder is not an error.
  """
  for name in os.listdir(path):
    entry = os.path.join(path, name)
    is_real_dir = os.path.isdir(entry) and not os.path.islink(entry)
    if name.startswith(".") or is_real_dir:
      continue
    os.remove(entry)


# Top-level folders.
for split in ("train", "valid", "tmp"):
  ensure_dir(os.path.join(DATASET_DIR, split), split)
print ("")
print (os.listdir("GDrive/My Drive/CarCrashDetection/Dataset/"))
print ("")

# One folder per class inside train and valid.
for split in ("train", "valid"):
  for label in ("accident", "no_accident"):
    ensure_dir(os.path.join(DATASET_DIR, split, label), f"{label} in {split}")
print ("")

# Start from empty class folders so images from a previous split do not leak
# into the new one.
for split in ("train", "valid"):
  for label in ("accident", "no_accident"):
    print (f"Deleting folder content {split} {label}")
    clear_dir(os.path.join(DATASET_DIR, split, label))

```
./dataset    
│
└─── models
│
└─── train      (80% of supervised image set)
│   └─  accident
│   │    └─ frame001.png
│   │       frame002.png
│   └─  no_accident
│        └─ frame001.png
│           frame002.png
│
└─── valid     (20% of supervised image set)
     └─  accident
     │    └─ frame001.png
     │       frame002.png
     └─  no_accident
          └─ frame001.png
             frame002.png

```

From `CarCrashDetection/Sources/frames`, copy 80% of the images to train (accident & no_accident) and 20% to valid (accident & no_accident).

In [0]:
# Shell-escaped form of the frames folder ("\ " stands for the space in
# "My Drive"): only meant to be interpolated into `!` shell commands.
# Raw string keeps the backslash literally instead of relying on the invalid
# "\ " escape sequence. Python file APIs must use the un-escaped path below.
PATH = r'GDrive/My\ Drive/CarCrashDetection/Sources/frames'
FRAMES_DIR = "GDrive/My Drive/CarCrashDetection/Sources/frames"

# File names (not full paths) of the manually classified frames.
filenames_accident = os.listdir(FRAMES_DIR + "/accident")
filenames_no_accident = os.listdir(FRAMES_DIR + "/no_accident")

print (os.listdir(FRAMES_DIR))
num_accident = len(filenames_accident)
num_no_accident = len(filenames_no_accident)

print ("Total images with accident ", num_accident)
print ("Total images without accident ", num_no_accident)

<a id="data_augmentation"> </a>
## 3.  Data augmentation of accident images set

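Augment the accident set by saving a horizontally mirrored copy (prefixed `_flip_`) of each accident image. Disabled by default (`data_augmentation = 0`).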

In [0]:
data_augmentation = 0    # set to 1 to (re)generate the mirrored accident images

if data_augmentation == 1:

  # Un-escaped path for Python file APIs. PATH holds the shell-escaped form
  # ("My\ Drive"), which matplotlib / os cannot open.
  accident_dir = "GDrive/My Drive/CarCrashDetection/Sources/frames/accident/"
  flip_prefix = "_flip_"

  print("Deleting old mirrored images")
  for fn in os.listdir(accident_dir):
    if fn.startswith(flip_prefix):
      os.remove(accident_dir + fn)

  print("Creating new mirrored images")
  # List the folder again after the clean-up: an earlier listing may still
  # contain the mirrored files that were just deleted.
  originals = [fn for fn in os.listdir(accident_dir) if not fn.startswith(flip_prefix)]
  for fn in originals:
    img = mpimg.imread(accident_dir + fn)
    rimg = cv2.flip(img, 1)    # flip code 1 = mirror left/right
    stem = os.path.splitext(fn)[0]    # file name without extension, whatever its length
    plt.imsave("%s%s%s.png" % (accident_dir, flip_prefix, stem), rimg) # saves images to frame folder

# Refresh the listings so the counts (and the split below) include any
# mirrored images. PATH stays shell-escaped; raw string keeps the backslash.
PATH = r'GDrive/My\ Drive/CarCrashDetection/Sources/frames'
filenames_accident = os.listdir ("GDrive/My Drive/CarCrashDetection/Sources/frames/accident")

print (os.listdir('GDrive/My Drive/CarCrashDetection/Sources/frames'))
num_accident = len(filenames_accident)
filenames_no_accident = os.listdir ("GDrive/My Drive/CarCrashDetection/Sources/frames/no_accident")
num_no_accident = len(filenames_no_accident)

print ("Total images with accident ", num_accident)
print ("Total images without accident ", num_no_accident)

<a id="split_in_train_valid"> </a>
## 4.  Split dataset in train and valid randomly

Randomly split the file lists and copy the selected images into the train and valid folders.

In [0]:
percentage_of_train = 0.8    # 80% = 0.8


def split_dataset_in_train_and_valid(filenames, percentage):
  """Randomly split `filenames` into a train list and a valid list.

  Args:
    filenames: sequence of file names to split.
    percentage: fraction (0..1) of the files that goes to the train list;
      the count is rounded down.

  Returns:
    A tuple `(images_train, images_valid)`. The train list is in random
    order; the valid list keeps the order of `filenames`.

  Raises:
    ValueError: if `percentage` is outside [0, 1].
  """
  if not 0 <= percentage <= 1:
    raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

  total_images = len(filenames)
  total_train = int(total_images * percentage)

  # Sample positions rather than names so duplicated file names are still
  # treated as separate items.
  train_positions = random.sample(range(total_images), total_train)
  picked = set(train_positions)

  images_train = [filenames[pos] for pos in train_positions]
  images_valid = [name for pos, name in enumerate(filenames) if pos not in picked]

  return (images_train, images_valid)

In [0]:
# Each result is a (train_filenames, valid_filenames) pair for one class.
accident_train_valid = split_dataset_in_train_and_valid(filenames_accident, percentage_of_train)
no_accident_train_valid = split_dataset_in_train_and_valid(filenames_no_accident, percentage_of_train)

In [0]:
## Copy files from sources to train
PATH2 = "GDrive/My\ Drive/CarCrashDetection/Dataset/"

## copy accident train set
print ("Copying accident train")
lenght = len (accident_train_valid[0])
for i in range(lenght):
  src = ("GDrive/My Drive/CarCrashDetection/Sources/frames/accident/" + accident_train_valid[0][i])
  dst = ("GDrive/My Drive/CarCrashDetection/Dataset/train/accident/" + accident_train_valid[0][i])
  copyfile(src, dst)
  
train_accident = !ls {PATH2}train/accident
print(f"Number of train images with accident copied: {len(train_accident)}")

## copy accident valid set
print ("Copying accident valid")
lenght = len (accident_train_valid[1])
for i in range(lenght):
  src = ("GDrive/My Drive/CarCrashDetection/Sources/frames/accident/" + accident_train_valid[1][i])
  dst = ("GDrive/My Drive/CarCrashDetection/Dataset/valid/accident/" + accident_train_valid[1][i])
  copyfile(src, dst)
  
valid_accident = !ls {PATH2}valid/accident 
print(f"Number of valid images with accident copied: {len(valid_accident)}")

## copy no_accident train set
print ("Copying no accident train")
lenght = len (no_accident_train_valid[0])
for i in range(lenght):
  src = ("GDrive/My Drive/CarCrashDetection/Sources/frames/no_accident/" + no_accident_train_valid[0][i])
  dst = ("GDrive/My Drive/CarCrashDetection/Dataset/train/no_accident/" + no_accident_train_valid[0][i])
  copyfile(src, dst)

train_no_accident = !ls {PATH2}train/no_accident
print(f"Number of train images with no_accident copied: {len(train_no_accident)}")

## copy accident valid set
print ("Copying no accident valid")
lenght = len (no_accident_train_valid[1])
for i in range(lenght):
  src = ("GDrive/My Drive/CarCrashDetection/Sources/frames/no_accident/" + no_accident_train_valid[1][i])
  dst = ("GDrive/My Drive/CarCrashDetection/Dataset/valid/no_accident/" + no_accident_train_valid[1][i])
  copyfile(src, dst) 
  
valid_no_accident = !ls {PATH2}valid/no_accident
print(f"Number of valid images with no_accident copied: {len(valid_no_accident)}")