# CNN - Preparation

# 1. Dataset

In [1]:
import os

def list_non_hidden_files(path):
    """Return the names of the entries in `path` that do not start with a dot.

    The names come back in whatever order ``os.listdir`` yields them, so
    callers that need a stable order have to sort the result themselves.
    """
    visible_entries = []
    for entry in os.listdir(path):
        if entry.startswith('.'):
            # dot-files (e.g. ".DS_Store") are skipped
            continue
        visible_entries.append(entry)
    return visible_entries

# Directory that holds the label CSV files.
LABEL_PATH = "/Users/ray/Desktop/26/Labels"

# Show the visible entries of the label directory; the helper already drops
# every name that starts with a dot.
list_non_hidden_files(LABEL_PATH)

['0001-1500.csv', '4501-6000.csv', '1501-3000.csv', '3001-4500.csv']

Load all label files into a single dataset:

In [2]:
import pandas as pd 
import numpy as np

# Name of the merged label file that a later cell writes into LABEL_PATH.
# It must be skipped here: otherwise re-running this cell after the export
# would load every label a second time and double the dataset.
COMBINED_LABELS_FILE = "labels.csv"

# Read each label CSV (in sorted file-name order, first column as the index)
# and stack the parts into one frame.
data_list = [
    pd.read_csv(os.path.join(LABEL_PATH, file_name), index_col=0)
    for file_name in sorted(list_non_hidden_files(LABEL_PATH))
    if file_name.endswith("csv") and file_name != COMBINED_LABELS_FILE
]
data = pd.concat(data_list)
data.head()

Unnamed: 0_level_0,crack_1,crack_2,crack_3
image_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00001.JPG,long,long,long
00002.JPG,none,none,none
00003.JPG,none,none,none
00004.JPG,none,none,none
00005.JPG,long,long,long


In [3]:
# (rows, columns) of the combined label table.
data.shape

(6000, 3)

In [4]:
# Flatten the whole label table into one long Series and tally how often each
# distinct label value occurs (most frequent first).
all_labels = pd.Series(data.to_numpy().ravel())
counts = all_labels.value_counts()
counts

none    14615
long     1543
lat       954
croc      645
diag      110
RAIl       78
rail       55
dtype: int64

In [5]:
# Relative frequency of each label; data.size is rows * columns, i.e. the
# total number of label cells.
counts / data.size

none    0.811944
long    0.085722
lat     0.053000
croc    0.035833
diag    0.006111
RAIl    0.004333
rail    0.003056
dtype: float64

Rename `RAIl` to `rail`.

In [6]:
# Lower-case every label column so that case variants of the same label
# (e.g. "RAIl" and "rail") are counted together, then recount.
for column_name in list(data):
    data[column_name] = data[column_name].str.lower()
flattened_labels = data.to_numpy().ravel()
counts = pd.Series(flattened_labels).value_counts()
counts

none    14615
long     1543
lat       954
croc      645
rail      133
diag      110
dtype: int64

In [7]:
# Relative frequency of each label after case normalisation; data.size is the
# total number of label cells (rows * columns).
counts / data.size

none    0.811944
long    0.085722
lat     0.053000
croc    0.035833
rail    0.007389
diag    0.006111
dtype: float64

Proportion of "crack" labels:

In [8]:
# Share of labels that mark some kind of crack, i.e. everything except "none".
# "none" is looked up by label: positional counts[0] only works while "none"
# happens to be the most frequent value, and integer-position access on a
# string-indexed Series is deprecated in pandas.
1 - counts["none"] / data.size

0.18805555555555553

Number of non-crack photos:

In [9]:
# Number of "none" labels, looked up by label rather than by position so the
# result does not depend on "none" being the most frequent value.
counts["none"]

14615

Number of crack photos:

In [10]:
# Total number of crack labels: every label except "none". Dropping "none" by
# name avoids relying on it being sorted first in the value counts.
counts.drop("none").sum()

3385

In [11]:
# Persist the merged, lower-cased label table as a single CSV.
# NOTE: the file lands in LABEL_PATH, the same directory the loading cell
# scans for "*csv" files, so the loader must not treat it as another input part.
data.to_csv(os.path.join(LABEL_PATH, "labels.csv"))

# 2. Reorganize Folder Structure for Training

The reason for doing this is that the total size of all compressed images is over 1 GB, so loading every image as an uncompressed NumPy array would require several GB of memory. The alternative approach is to load the data from a directory and train the model batch by batch.

In [12]:
import shutil

# One of the shuffled source folders (the "TH" variant).
SRC = "/Users/ray/Desktop/crack-images/TH-Shuffled"

# Image variants; each one gets its own pair of output roots.
NAMES = ["Origin", "CC", "KNN", "TH", "MOG"]

# "-Binary" roots hold none/cracked folders, "-Multi" roots hold one folder per crack type.
DEST_1_list = ["/Users/ray/Desktop/crack-images/{}-Binary".format(name) for name in NAMES]
DEST_2_list = ["/Users/ray/Desktop/crack-images/{}-Multi".format(name) for name in NAMES]

for dest_1, dest_2 in zip(DEST_1_list, DEST_2_list):
    for path in (dest_1, dest_2):
        # Wipe any previous output so the tree is rebuilt from scratch.
        if os.path.exists(path):
            shutil.rmtree(path)
        os.mkdir(path)

        # Pick the class folders for this root: the crack types taken from
        # the label counts for "Multi" roots, none/cracked otherwise.
        if path.endswith("Multi"):
            class_folders = list(counts.index)
            class_folders.remove("none")
        else:
            class_folders = ["none", "cracked"]

        for split_name in ("test", "train"):
            split_dir = os.path.join(path, split_name)
            os.mkdir(split_dir)
            for class_folder in class_folders:
                os.mkdir(os.path.join(split_dir, class_folder))

Split all images and save them to the corresponding folder. Shuffle all the split images, 75% to train and 25% to test.

For binary classification, make sure the two classes have the same number of photos, which means discarding
some portion of the "none" type photos.

Save all split images in train set first.

In [13]:
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import save_img

SRC_list = ["/Users/ray/Desktop/crack-images/{}-Shuffled".format(name) for name in NAMES]

# Target (height, width, channels) of the centre crop applied to every image.
# NOTE(review): the crop arithmetic below assumes each image is at least this
# large — confirm for all source folders.
ideal_shape = (1230, 410, 3)

# Row boundaries that cut the cropped image into three equally tall tiles.
# The last boundary is pushed one past the crop height; NumPy clamps slice
# ends, so this only guarantees the final row is never dropped.
y_space = np.linspace(0, ideal_shape[0], 4).astype(int)
y_space[-1] += 1

for src, dest_1, dest_2 in zip(SRC_list, DEST_1_list, DEST_2_list):
    print("Processing images in: {}".format(src))
    for image_name in data.index:
        # read image and crack information
        count = int(image_name.split(".")[0])
        image_path = os.path.join(src, image_name)
        types = data.loc[image_name, :]
        print(f"Now processing image: {image_name}", end="\r")

        # load image; if the file named in the label table cannot be read,
        # fall back to a ".png" file with the same stem. Only I/O errors
        # trigger the fallback, so Ctrl-C and programming errors still surface.
        try:
            image = img_to_array(load_img(image_path))
        except OSError:
            png_name = image_name.split(".")[0] + ".png"
            image_path = os.path.join(src, png_name)
            image = img_to_array(load_img(image_path))

        # crop the centre region and split it into tiles along the height
        crop_bound = {
            "xmin": int((image.shape[1] - ideal_shape[1]) / 2),
            "xmax": int((image.shape[1] + ideal_shape[1]) / 2 - 1),
            "ymin": int((image.shape[0] - ideal_shape[0]) / 2),
            "ymax": int((image.shape[0] + ideal_shape[0]) / 2 - 1)
        }
        images_cropped = image[crop_bound["ymin"]:crop_bound["ymax"] + 1, crop_bound["xmin"]:crop_bound["xmax"] + 1]
        images_split = [images_cropped[y_space[i]:y_space[i + 1],] for i in range(len(y_space) - 1)]

        # save each tile under its label; everything goes to "train" here
        folder = "train"
        for i, sub_image in enumerate(images_split):
            tile_name = str(count).zfill(5) + f'-{i + 1}' + ".jpeg"
            # positional lookup: tile i belongs to the i-th label column
            label = types.iloc[i]
            if label != "none":
                save_img(dest_1 + f"/{folder}/cracked/{tile_name}", sub_image)
                save_img(dest_2 + f"/{folder}/{label}/{tile_name}", sub_image)
            else:
                save_img(dest_1 + f"/{folder}/{label}/{tile_name}", sub_image)

Processing images in: /Users/ray/Desktop/crack-images/Origin-Shuffled
Processing images in: /Users/ray/Desktop/crack-images/CC-Shuffled
Processing images in: /Users/ray/Desktop/crack-images/KNN-Shuffled
Processing images in: /Users/ray/Desktop/crack-images/TH-Shuffled
Processing images in: /Users/ray/Desktop/crack-images/MOG-Shuffled
Now processing image: 06000.JPG

Remove redundant "none" type photos:

In [14]:
import random

# NOTE: this cell is not idempotent. Re-running it without rebuilding the
# folders deletes nothing new from a balanced "none" folder, but it moves a
# further 25% of the remaining train images into test.
for dest_1, dest_2 in zip(DEST_1_list, DEST_2_list):
    # calculate the number of each type
    all_crack_list = list_non_hidden_files(f"{dest_1}/train/cracked")
    # sorted first so the seeded shuffle below does not depend on the
    # arbitrary order in which os.listdir returns the files
    all_none_list = sorted(list_non_hidden_files(f"{dest_1}/train/none"))
    all_crack = len(all_crack_list)
    all_none = len(all_none_list)
    remove = max(all_none - all_crack, 0)

    # shuffle the none images in place. random.shuffle returns None and
    # works on the list it is given, so shuffling a throwaway sorted(...)
    # copy would leave the real list untouched.
    random.seed(100)
    random.shuffle(all_none_list)

    # remove redundant none images so both binary classes have equal size
    for image_name in all_none_list[:remove]:
        os.remove(f"{dest_1}/train/none/{image_name}")

    # train test split: move a seeded random 25% of every label folder
    for dest in [dest_1, dest_2]:
        for label in list_non_hidden_files(f"{dest}/train/"):
            image_list = sorted(list_non_hidden_files(f"{dest}/train/{label}"))
            random.seed(100)
            random.shuffle(image_list)
            to_move = int(len(image_list) * 0.25)
            for image_name in image_list[:to_move]:
                os.rename(f"{dest}/train/{label}/{image_name}", f"{dest}/test/{label}/{image_name}")

In [15]:
# Print the number of visible files in every <root>/<split>/<label> folder,
# with a separator line before each image variant.
for dest_1, dest_2 in zip(DEST_1_list, DEST_2_list):
    print("* * * * * * * * * * * * * * * * * * * * * * * *")
    for dest in (dest_1, dest_2):
        for data_set in list_non_hidden_files(dest):
            split_dir = f"{dest}/{data_set}/"
            for label in list_non_hidden_files(split_dir):
                label_dir = f"{dest}/{data_set}/{label}"
                number = len(list_non_hidden_files(label_dir))
                print(f"{label_dir}: {number}")

* * * * * * * * * * * * * * * * * * * * * * * *
/Users/ray/Desktop/crack-images/Origin-Binary/test/cracked: 846
/Users/ray/Desktop/crack-images/Origin-Binary/test/none: 846
/Users/ray/Desktop/crack-images/Origin-Binary/train/cracked: 2539
/Users/ray/Desktop/crack-images/Origin-Binary/train/none: 2539
/Users/ray/Desktop/crack-images/Origin-Multi/test/diag: 27
/Users/ray/Desktop/crack-images/Origin-Multi/test/croc: 161
/Users/ray/Desktop/crack-images/Origin-Multi/test/long: 385
/Users/ray/Desktop/crack-images/Origin-Multi/test/rail: 33
/Users/ray/Desktop/crack-images/Origin-Multi/test/lat: 238
/Users/ray/Desktop/crack-images/Origin-Multi/train/diag: 83
/Users/ray/Desktop/crack-images/Origin-Multi/train/croc: 484
/Users/ray/Desktop/crack-images/Origin-Multi/train/long: 1158
/Users/ray/Desktop/crack-images/Origin-Multi/train/rail: 100
/Users/ray/Desktop/crack-images/Origin-Multi/train/lat: 716
* * * * * * * * * * * * * * * * * * * * * * * *
/Users/ray/Desktop/crack-images/CC-Binary/test/cr