### 목표

- 품종, 등급 예측하기

In [1]:
import os
import shutil
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt, cv2
import matplotlib.font_manager as fm
import seaborn as sns
from PIL import Image
from tqdm import tqdm

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split

logging.getLogger("tensorflow").setLevel(logging.ERROR)
font_name = fm.FontProperties(fname="C:/Windows/Fonts/malgun.ttf").get_name()
plt.rc('font', family=font_name)
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'

In [2]:
df = pd.read_csv('apple.csv')
df.sample(5)

Unnamed: 0,파일이름,파일경로,사과_당도_품질_클래스,품종,착즙당도,비파괴당도,토양_전기전도도,토양_온도,토양_습도,토양_수분장력,온도,습도,일사량
14006,20210829_RGB_10.8_F01_HR_03_007_02_0_A.jpg,홍로/당도B등급,C,HR,,10.8,0.35,21.42,64.4,-0.1,22.49,90.38,70.0
20605,20210912_RGB_09.1_F06_HR_04_041_07_45_H.jpg,홍로/당도C등급,C,HR,,9.1,0.74,18.8,27.3,-0.1,20.99,90.26,546.0
18649,20210912_RGB_11.8_F04_HR_06_014_09_90_H.jpg,홍로/당도B등급,C,HR,13.0,11.8,0.43,20.98,28.21,-0.2,20.48,86.0,422.0
30169,20211030_RGB_14.8_F13_HJ_10_113_13_0_H.jpg,후지/당도A등급,A,HJ,,14.8,1.77,10.1,12.21,-51.0,10.55,85.18,256.0
34766,20210926_RGB_13.7_F14_HJ_07_001_03_0_A.jpg,후지/당도B등급,B,HJ,,13.7,0.86,19.92,12.6,-0.1,20.21,82.73,343.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48020 entries, 0 to 48019
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   파일이름          48020 non-null  object 
 1   파일경로          48020 non-null  object 
 2   사과_당도_품질_클래스  48020 non-null  object 
 3   품종            48020 non-null  object 
 4   착즙당도          10153 non-null  float64
 5   비파괴당도         48020 non-null  float64
 6   토양_전기전도도      48020 non-null  float64
 7   토양_온도         48020 non-null  float64
 8   토양_습도         48020 non-null  float64
 9   토양_수분장력       48020 non-null  float64
 10  온도            48020 non-null  float64
 11  습도            48020 non-null  float64
 12  일사량           48020 non-null  float64
dtypes: float64(9), object(4)
memory usage: 4.8+ MB


In [4]:
df.describe()

Unnamed: 0,착즙당도,비파괴당도,토양_전기전도도,토양_온도,토양_습도,토양_수분장력,온도,습도,일사량
count,10153.0,48020.0,48020.0,48020.0,48020.0,48020.0,48020.0,48020.0,48020.0
mean,13.369743,13.080877,1.020875,14.856226,20.762857,-27.926826,17.239501,79.11844,360.188317
std,1.358879,1.983554,0.567566,5.207573,10.247156,55.833452,5.984801,11.499,128.672372
min,7.0,9.0,0.03,7.28,0.01,-407.3,1.2,38.73,3.0
25%,12.5,11.6,0.61,9.66,12.21,-51.0,11.54,72.66,269.0
50%,13.4,13.0,0.98,17.83,21.58,-5.4,19.89,78.69,356.0
75%,14.3,14.3,1.33,19.26,28.59,-0.1,22.8,89.64,460.0
max,19.2,19.0,4.61,24.17,76.64,-0.1,28.91,99.99,798.0


In [5]:
corr = df.corr()
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr.style.background_gradient(cmap='coolwarm', axis=None,
                                vmin=-1, vmax=1).highlight_null(
                                    null_color='#f1f1f1')  # Color NaNs grey
 .set_precision(2))

  (corr.style.background_gradient(cmap='coolwarm', axis=None,


Unnamed: 0,착즙당도,비파괴당도,토양_전기전도도,토양_온도,토양_습도,토양_수분장력,온도,습도,일사량
착즙당도,,,,,,,,,
비파괴당도,0.49,,,,,,,,
토양_전기전도도,0.18,-0.03,,,,,,,
토양_온도,-0.14,-0.51,0.12,,,,,,
토양_습도,-0.03,-0.13,-0.19,0.13,,,,,
토양_수분장력,-0.1,-0.17,0.16,0.28,0.42,,,,
온도,-0.09,-0.42,0.22,0.9,0.0,0.15,,,
습도,-0.22,-0.27,-0.18,0.4,0.21,0.32,0.17,,
일사량,-0.05,-0.13,0.21,0.25,0.13,0.3,0.35,-0.2,


In [6]:
df = df.drop(['착즙당도', '온도'], axis=1)

In [7]:
df = pd.get_dummies(df, columns=['사과_당도_품질_클래스', '품종'])

In [8]:
X = df[[
    '파일이름', '파일경로', '비파괴당도', '토양_전기전도도', '토양_온도', '토양_습도', '토양_수분장력', '습도',
    '일사량'
]]
y = df[[
    '사과_당도_품질_클래스_A', '사과_당도_품질_클래스_B', '사과_당도_품질_클래스_C', '품종_AR', '품종_HJ',
    '품종_HR', '품종_SG'
]]

In [9]:
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X, y, test_size=0.2, random_state=527, stratify=y)

In [10]:
def create_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)


def copy_file_to_target(x, target):
    cnt = 1
    for path, name in tqdm(zip(x['파일경로'], x['파일이름'])):
        source_path = f'apple_bbox/{path}/{name}'
        target_path = f'apple_bbox/{target}'

        create_dir(target_path)
        shutil.copy(source_path, target_path)
        filename = str(cnt).rjust(5, '0')
        os.rename(f'{target_path}/{name}', f'{target_path}/{filename}.jpg')

        cnt += 1

In [11]:
try:
    copy_file_to_target(X_train_df, 'train')
    copy_file_to_target(X_test_df, 'validation')
except:
    pass

0it [00:00, ?it/s]


In [12]:
X_train_df = X_train_df.drop(['파일이름', '파일경로'], axis=1)
X_test_df = X_test_df.drop(['파일이름', '파일경로'], axis=1)

In [13]:
IMG_SIZE = 220
BATCH_SIZE = 128

X_train_img = tf.keras.utils.image_dataset_from_directory(
    'apple_bbox/train',
    label_mode=None,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE)

X_test_img = tf.keras.utils.image_dataset_from_directory(
    'apple_bbox/validation',
    label_mode=None,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE)

Found 38417 files belonging to 1 classes.
Found 9604 files belonging to 1 classes.


In [14]:
X_train_df = tf.data.Dataset.from_tensor_slices(X_train_df).batch(BATCH_SIZE)
X_test_df = tf.data.Dataset.from_tensor_slices(X_test_df).batch(BATCH_SIZE)
y_train_df = tf.data.Dataset.from_tensor_slices(y_train_df).batch(BATCH_SIZE)
y_test_df = tf.data.Dataset.from_tensor_slices(y_test_df).batch(BATCH_SIZE)

In [15]:
img_data_shape = (IMG_SIZE, IMG_SIZE, 3)
csv_data_shape = (7, )
num_classes = 7

# define two inputs layers
img_input = layers.Input(shape=img_data_shape, name="image")
csv_input = layers.Input(shape=csv_data_shape, name="csv")

In [16]:
# define layers for image data
x1 = layers.experimental.preprocessing.Rescaling(1. / 255)(img_input)
x1 = layers.Conv2D(16, 3, activation='relu', name="conv1_img")(x1)
x1 = layers.MaxPooling2D(name="mxp1_img")(x1)
x1 = layers.Conv2D(32, 3, activation='relu', name="conv2_img")(x1)
x1 = layers.MaxPooling2D(name="mxp2_img")(x1)
x1 = layers.Conv2D(64, 3, activation='relu', name="conv3_img")(x1)
x1 = layers.MaxPooling2D(name="mxp3_img")(x1)
x1 = layers.Flatten(name="flatten_img")(x1)

In [17]:
# define layers for csv data
x2 = layers.Flatten(name="flatten_csv")(csv_input)
x2 = layers.Dense(16, activation='relu', name="dense1_csv")(x2)
x2 = layers.Dense(32, activation='relu', name="dense2_csv")(x2)
x2 = layers.Dense(64, activation='relu', name="dense3_csv")(x2)

In [18]:
# merge layers
x = layers.concatenate([x1, x2], name="concat_csv_img")
x = layers.Dense(128, activation='relu', name="dense1_csv_img")(x)
output = layers.Dense(num_classes, name="softmax")(x)

In [19]:
# make model with 2 inputs and 1 output
model = tf.keras.models.Model(inputs=[img_input, csv_input], outputs=output)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image (InputLayer)             [(None, 220, 220, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling (Rescaling)          (None, 220, 220, 3)  0           ['image[0][0]']                  
                                                                                                  
 conv1_img (Conv2D)             (None, 218, 218, 16  448         ['rescaling[0][0]']              
                                )                                                                 
                                                                                              

In [21]:
len(X_train_img), len(X_train_df), len(y_train_df)

(301, 301, 301)

In [22]:
def my_gen(subset):
    while True:
        if subset == "training":
            for i in X_train_img.take(1):
                img_batch = i
            for j in X_train_df.take(1):
                csv_batch = j
            for k in y_train_df.take(1):
                labels_batch = k
        else:
            for i in X_test_img.take(1):
                img_batch = i
            for j in X_test_df.take(1):
                csv_batch = j
            for k in y_test_df.take(1):
                labels_batch = k

        yield ((img_batch, csv_batch), labels_batch)


gen_train = my_gen("training")
gen_valid = my_gen("validation")

In [23]:
modelpath = "./mpg/model/all/{epoch:03d}-{accuracy:.4f}.hdf5"
checkpointer = ModelCheckpoint(filepath=modelpath, verbose=1)

hist = model.fit(gen_train,
                 epochs=10,
                 steps_per_epoch=301,
                 verbose=1,
                 callbacks=[checkpointer])

Epoch 1/10
Epoch 1: saving model to ./mpg/model/all\001-0.3979.hdf5
Epoch 2/10
Epoch 2: saving model to ./mpg/model/all\002-0.3984.hdf5
Epoch 3/10
Epoch 3: saving model to ./mpg/model/all\003-0.3984.hdf5
Epoch 4/10
Epoch 4: saving model to ./mpg/model/all\004-0.3984.hdf5
Epoch 5/10
Epoch 5: saving model to ./mpg/model/all\005-0.3984.hdf5
Epoch 6/10
Epoch 6: saving model to ./mpg/model/all\006-0.3984.hdf5
Epoch 7/10
Epoch 7: saving model to ./mpg/model/all\007-0.3984.hdf5
Epoch 8/10
Epoch 8: saving model to ./mpg/model/all\008-0.3984.hdf5
Epoch 9/10
Epoch 9: saving model to ./mpg/model/all\009-0.3984.hdf5
Epoch 10/10
Epoch 10: saving model to ./mpg/model/all\010-0.3984.hdf5


In [26]:
model.predict(gen_valid, steps=76)



array([[-317.1862 ,  391.2189 , -311.76593, ..., -204.22757, -224.17372,
         251.41994],
       [-420.60403,  507.599  , -404.0326 , ..., -264.87897, -279.5936 ,
         319.1264 ],
       [-234.17801,  282.6082 , -229.94583, ..., -154.24205, -164.4489 ,
         174.58665],
       ...,
       [-292.74985,  350.5939 , -284.89893, ..., -189.71933, -198.47758,
         214.93901],
       [-433.363  ,  537.3455 , -437.95782, ..., -290.25513, -320.7369 ,
         349.22824],
       [-342.0918 ,  418.60858, -338.93155, ..., -224.80486, -242.52678,
         266.26505]], dtype=float32)