In [15]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import os
import cv2
import threading
from scipy import ndimage

from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array

# UTILITY

In [16]:
import os
dataset_dir = 'original_dataset'


def mk_new_dir(parent):
    os.mkdir(parent)

    for label in os.listdir(f'{dataset_dir}'):
        os.mkdir(f'{parent}/{label}')


def count_dataset(parent):
    for label in os.listdir(parent):
        label_dir = f'{parent}/{label}'
        print(f'{label}:{len (os.listdir(label_dir)) }')


# CROP AND RESIZE TO 255 X 255

In [17]:
def crop_resize(img):
    new_size = img.shape[0] if img.shape[0] < img.shape[1] else img.shape[1]

    # ketika citra landscape
    if img.shape[0] < img.shape[1]:
        starting_w_point = (img.shape[1]-new_size) // 2
        new_image = img[:,  starting_w_point:starting_w_point+new_size, :]

    # ketika citra potrait
    else:
        starting_h_point = (img.shape[1]-new_size) // 2
        new_image = img[starting_h_point:starting_h_point+new_size, :, :]

    new_image = cv2.resize(new_image, (255, 255), interpolation=cv2.INTER_AREA)
    return new_image


# DATA AUGMENTATION UTILITY

In [18]:
from tensorflow.keras import layers
import tensorflow as tf

# data augmentation
def data_augmentation(label, multiplier):
    print('Processing '+label+' thread started')
    data_aug = tf.keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"), #metode membalik citra
        layers.experimental.preprocessing.RandomRotation(0.2, fill_mode='constant', fill_value=255), #metode rotasi citra 
        layers.experimental.preprocessing.RandomTranslation(0.1, 0.1, fill_mode='constant', fill_value=255) #metode translasi
    ])
    # add image to batch
    images = []
    filenames = []
    for filename in os.listdir('original_dataset/'+label):
        img_dir = f'original_dataset/{label}/{filename}'
        img_pil = load_img(img_dir)
        img = img_to_array(img_pil)
        img = img.astype(np.uint8)
        # crop and resize
        img = crop_resize(img)

        images.append(img)
        filenames.append(filename)


    images = np.array(images)
    print(images.shape)

    for i in range(multiplier):
        aug_images = data_aug(images)
        for j in range(len(aug_images)):
            tf.keras.utils.save_img(
                'augmented_dataset/'+label+'/'+str(i)+'_' + filenames[j], aug_images[j])


In [19]:
# CEK JUMLAH DATASET MASING MASING LABEL
count_dataset('original_dataset')
count_dataset('augmented_dataset')

# kita target 500 gambar perlabel
target = 500

fresh_apple:182
stale_apple:253
fresh_apple:546
stale_apple:506


In [None]:
# inisialisasi folder
mk_new_dir('augmented_dataset')

In [20]:
import _thread as thread
for label in os.listdir('original_dataset'):
    mult = target//len(os.listdir('original_dataset/'+label)) + 1
    try:
        data_augmentation(label, mult)
    except Exception as e:
        print('error at thread ', label)
        print(e)

Processing fresh_apple thread started
(182, 255, 255, 3)
Processing stale_apple thread started
(253, 255, 255, 3)


# SEGMENTASI

In [None]:
# membuat folder untuk segmentasi dataset
mk_new_dir('segmented_dataset')

In [21]:
augmented_dir = 'augmented_dataset'

In [22]:
# Fungsi ini dibuat untuk melakukan segmentasi citra (memisahkan objek dengan backgroundnya) 

def segmentasi(label):
    for filename in os.listdir(f'{augmented_dir}/{label}'):
        # read image
        img = plt.imread(f'augmented_dataset/{label}/{filename}')

        # convert ke grb
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        # threshold
        img_binary = gray<160

        # cari mask
        mask = ndimage.binary_fill_holes(img_binary)
        mask_int = (mask*1).astype(np.uint8)
        mask_erode = cv2.erode(mask_int,
                        np.array([[1, 1, 1],
                                  [1, 1, 1],
                                  [1, 1, 1],
                                  ]), 2)

        # masking
        with_mask = img * np.repeat(mask_erode[...,None], 3, axis=2)
        
        # croping
        cnts = cv2.findContours(mask_erode, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = cnts[0] if len(cnts) == 2 else cnts[1]
        x,y,w,h = cv2.boundingRect(cnts[len(cnts)-1])
        with_mask_croped = with_mask[ y:y+h,x:x+w, :]

        plt.imsave( f'segmented_dataset/{label}/{filename}', with_mask_croped )

In [23]:
threads = []

for label in os.listdir(dataset_dir):
    t = threading.Thread(target=segmentasi, args=(label,))
    threads.append(t)

for thread in threads:
    thread.start()

In [24]:
def features(img):
    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)                      #ubah ke bentuk hsv
    hue = hsv[:,:,0]                                                #mengambil nilai hue dari format hsv
    hist = np.histogram(hue, bins=np.arange(256), density=True)
    return hist[0]

In [25]:
labels = {
    'fresh_apple': 0,
    'stale_apple': 1,
}

# EKSTRAKSI WARNA

In [27]:
all_features = []
for label in os.listdir('segmented_dataset'):
    for filename in os.listdir(f'segmented_dataset/{label}'):
        img_path = f'segmented_dataset/{label}/{filename}'
        img = plt.imread(img_path)
        feature = [filename, *features(img), labels[label]]
        all_features.append(feature)

# konversi ke numpy object
all_features = np.array(all_features)

# buat kolom untuk setiap nilai hue dan label di last column
column = ['img_ids', * np.arange(255), 'label']

# simpan dataframe
trains_df = pd.DataFrame(all_features, columns=column)
trains_df.to_csv('color_dataset.csv')

# MODEL BUILDING COLOR

In [28]:
dataframe = pd.read_csv('color_dataset.csv')
dataframe.head()

Unnamed: 0.1,Unnamed: 0,img_ids,0,1,2,3,4,5,6,7,...,246,247,248,249,250,251,252,253,254,label
0,0,0_Screen Shot 2018-06-08 at 4.59.36 PM.png,0.464249,0.065132,0.081295,0.077266,0.062982,0.048076,0.034349,0.025591,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,0_Screen Shot 2018-06-08 at 4.59.49 PM.png,0.500212,0.049599,0.037056,0.042468,0.043359,0.029628,0.021711,0.022263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1e-05,0
2,2,0_Screen Shot 2018-06-08 at 4.59.57 PM.png,0.789681,0.029041,0.028958,0.020588,0.008913,0.003515,0.003264,0.003557,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,0_Screen Shot 2018-06-08 at 5.00.03 PM.png,0.657682,0.054741,0.046521,0.063355,0.061896,0.012435,0.009378,0.008336,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,0_Screen Shot 2018-06-08 at 5.00.12 PM.png,0.504094,0.034468,0.04286,0.044064,0.058678,0.05722,0.027059,0.018378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [29]:
os.listdir(
    'segmented_dataset'
)

['fresh_apple', 'stale_apple']

In [30]:
clean_data = dataframe.drop(columns=['Unnamed: 0', 'img_ids'])
clean_data = clean_data.replace({
    'fresh_apple': 0,
    'stale_apple': 1,
})
clean_data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,label
0,0.464249,0.065132,0.081295,0.077266,0.062982,0.048076,0.034349,0.025591,0.018202,0.014061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.500212,0.049599,0.037056,0.042468,0.043359,0.029628,0.021711,0.022263,0.02082,0.022773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1e-05,0
2,0.789681,0.029041,0.028958,0.020588,0.008913,0.003515,0.003264,0.003557,0.003264,0.003348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.657682,0.054741,0.046521,0.063355,0.061896,0.012435,0.009378,0.008336,0.007873,0.007341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.504094,0.034468,0.04286,0.044064,0.058678,0.05722,0.027059,0.018378,0.015293,0.012732,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [31]:
len(clean_data.columns)

256

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


x = clean_data.iloc[:,:-1]
y = clean_data.iloc[:,-1:]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = MinMaxScaler()
x_train_scalled = scaler.fit_transform(x_train)
x_test_scalled = scaler.transform(x_test)

In [33]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", 
         "Linear SVM", 
         "RBF SVM", 
         "Decision Tree", 
         "Random Forest", 
         "Neural Network", 
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(max_iter=5000),
    GaussianNB(),
    ]

In [34]:
for name, model in zip(names, classifiers):
  model.fit(x_train_scalled, y_train.values.ravel())
  y_pred_model = model.predict(x_test_scalled)
  print(f'## MODEL NAME : {name}')
  print(confusion_matrix(y_test, y_pred_model))
  print(classification_report(y_test, y_pred_model))
  print('\n')

## MODEL NAME : Nearest Neighbors
[[ 86   9]
 [ 15 101]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        95
           1       0.92      0.87      0.89       116

    accuracy                           0.89       211
   macro avg       0.88      0.89      0.89       211
weighted avg       0.89      0.89      0.89       211



## MODEL NAME : Linear SVM
[[ 77  18]
 [ 10 106]]
              precision    recall  f1-score   support

           0       0.89      0.81      0.85        95
           1       0.85      0.91      0.88       116

    accuracy                           0.87       211
   macro avg       0.87      0.86      0.86       211
weighted avg       0.87      0.87      0.87       211



## MODEL NAME : RBF SVM
[[ 90   5]
 [ 11 105]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        95
           1       0.95      0.91      0.93       116

    accuracy            

In [35]:
correlation = {}
label = clean_data['label']
for col in clean_data.columns:
  correlation[col] = label.corr(clean_data[col])
sorted_correlation = sorted(correlation.items(), key=lambda x: x[1])
print(sorted_correlation)
len(sorted_correlation)

[('2', -0.3803849219079016), ('3', -0.3630540302388689), ('1', -0.36120411795860524), ('90', -0.3476533290230483), ('0', -0.3234773092014505), ('69', -0.30974609641011863), ('72', -0.3090792673262374), ('68', -0.3090583653363585), ('80', -0.2995090631730869), ('67', -0.2956565914770542), ('70', -0.2932132703014692), ('79', -0.29181685096749077), ('75', -0.28927610209945054), ('73', -0.2861862681904432), ('77', -0.28172509245488364), ('4', -0.28082415371044567), ('83', -0.28044185221798285), ('78', -0.28007748883866285), ('76', -0.2780501119451788), ('88', -0.27757799576063996), ('74', -0.2753576545530219), ('81', -0.27484667358951687), ('82', -0.2687093822584774), ('89', -0.26376267985138785), ('66', -0.26265132310862443), ('71', -0.25696225761108943), ('87', -0.2551071356622639), ('86', -0.24857605147095302), ('84', -0.2443178655700492), ('65', -0.2399649065554915), ('91', -0.22699549682449482), ('85', -0.22250378137501528), ('120', -0.21435697951202845), ('93', -0.2138789824632109), 

256

In [36]:
positive_corr = {}
negative_corr = {}
for (k,v) in sorted_correlation:
  if(v>0):
    positive_corr[k] = v
  else:
    negative_corr[k] = v
len(positive_corr)

126

In [37]:
from sklearn.decomposition import PCA

pca = PCA(n_components = len(positive_corr))
x_train_pca = pca.fit_transform(x_train_scalled)
x_test_pca = pca.transform(x_test_scalled)

In [38]:
for name, model in zip(names, classifiers):
  model.fit(x_train_pca, y_train.values.ravel())
  y_pred_model = model.predict(x_test_pca)
  try:
    print(f'## MODEL NAME : {name}')
    print(confusion_matrix(y_test, y_pred_model))
    print(classification_report(y_test, y_pred_model))
    print('\n')
  except:
    print("Hello!!!!")

## MODEL NAME : Nearest Neighbors
[[ 86   9]
 [ 15 101]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        95
           1       0.92      0.87      0.89       116

    accuracy                           0.89       211
   macro avg       0.88      0.89      0.89       211
weighted avg       0.89      0.89      0.89       211



## MODEL NAME : Linear SVM
[[ 77  18]
 [ 10 106]]
              precision    recall  f1-score   support

           0       0.89      0.81      0.85        95
           1       0.85      0.91      0.88       116

    accuracy                           0.87       211
   macro avg       0.87      0.86      0.86       211
weighted avg       0.87      0.87      0.87       211



## MODEL NAME : RBF SVM
[[ 89   6]
 [ 11 105]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.91        95
           1       0.95      0.91      0.93       116

    accuracy            