# 使用 tf.data.Dataset 進行完整的訓練

In [None]:
# upload Data
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/CVCNN_Data/cat_dog.zip
!unzip -q cat_dog

## 匯入套件

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import glob  # 讀取特定格式路徑

import tensorflow as tf
from tensorflow.keras import layers

## 讀取資料

In [None]:
def label_from_path(path):
    """Return the numeric class encoded in an image file name.

    The class is read from the first three characters of the file name,
    e.g. ``cat_dog/train/cat.11996.jpg`` -> ``'cat'``.

    Args:
        path: Image path using ``/`` as separator.

    Returns:
        0 for a ``cat`` file, 1 for a ``dog`` file, ``None`` for anything else.
    """
    prefix = path.split('/')[-1][:3]
    return {'cat': 0, 'dog': 1}.get(prefix)


# Collect image paths and their numeric labels (0 = cat, 1 = dog).
data_dict = {'file_name': [], 'type': []}
# Only the .jpg files under train/ are used. glob returns them in arbitrary
# (filesystem) order, so sort first: otherwise the seeded shuffle applied
# later would still give a different split from machine to machine.
for img_path in sorted(glob.glob('cat_dog/train/*.jpg')):
    label = label_from_path(img_path)
    if label is None:
        # Unknown prefix: report the file and skip it completely so that both
        # lists keep the same length (pd.DataFrame requires equal lengths).
        print(img_path)
        continue
    data_dict['file_name'].append(img_path)
    data_dict['type'].append(label)

In [None]:
# Turn the collected lists into a two-column table (file_name, type), then
# shuffle all rows once using a fixed seed.
datalist = pd.DataFrame.from_dict(data_dict)
shuffled_df = datalist.sample(frac=1.0, random_state=2)

In [None]:
# Preview the first rows of the shuffled table (file path and numeric label).
shuffled_df.head()

In [None]:
# Number of labelled images collected; the split below slices rows 0-2999.
len(shuffled_df)

## 切分訓練/驗證/測試集

In [None]:
# Carve the shuffled table into three consecutive, non-overlapping row ranges
# (selected by position): rows 0-499 for training, 500-999 for validation and
# 1000-2999 for testing.
train_data = shuffled_df.iloc[:500]
val_data = shuffled_df.iloc[500:1000]
test_data = shuffled_df.iloc[1000:3000]

## 資料前處理：

In [None]:
import random
def my_preprocess(img_path, img_label):
    """Load one JPEG from disk and pair it with a one-hot label.

    Args:
        img_path: Path of the image file to read.
        img_label: Integer class index of the image.

    Returns:
        A tuple ``(image, one_hot_label)``: the image decoded with 3 channels
        and resized to 256x256, and ``img_label`` one-hot encoded with depth 2.
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [256, 256])
    one_hot_label = tf.one_hot(img_label, depth=2)  # depth = number of classes
    return resized, one_hot_label

In [None]:
# Build the training tf.data.Dataset: pair every file path with its label,
# then run my_preprocess on each pair with an auto-tuned level of parallelism.
train_path = train_data['file_name']
train_label = train_data['type']
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_path, train_label))
    .map(my_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
)

In [None]:
# Build the validation tf.data.Dataset from (file path, label) pairs.
val_path = val_data['file_name']
val_label = val_data['type']
val_dataset = tf.data.Dataset.from_tensor_slices((val_path, val_label))
# Decode/resize in parallel, the same way the training pipeline does; the
# element order is unaffected because map is deterministic by default.
val_dataset = val_dataset.map(my_preprocess,
                              num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
def my_plot(datas, n_cols=4):
    """Show the images of ``datas`` in a grid, titled with their labels.

    Args:
        datas: Iterable of ``(image, label)`` pairs, e.g. ``dataset.take(8)``.
            ``image`` must provide ``.numpy()``; it is cast to ``uint8`` for
            display.
        n_cols: Number of grid columns. The number of rows is derived from
            the item count, so any number of items can be shown (8 items give
            the 2x4 grid and 13x7 figure this notebook uses).
    """
    samples = list(datas)
    if not samples:
        # Nothing to draw; avoid creating an empty figure.
        return
    n_rows = (len(samples) + n_cols - 1) // n_cols  # ceil division
    plt.figure(figsize=(13, 3.5 * n_rows))
    for i, data in enumerate(samples):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(data[0].numpy().astype('uint8'))
        # Show the label values rather than the verbose tf.Tensor(...) repr.
        label = data[1].numpy() if hasattr(data[1], 'numpy') else data[1]
        plt.title("Label: {}".format(label), fontsize=16)
    plt.show()

# Sanity check: display the first 8 preprocessed training images with their labels.
my_plot(train_dataset.take(8))

## 建立神經網路

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import (Input, Dense, Dropout, Activation,
                                     BatchNormalization, Flatten,
                                     Conv2D, MaxPooling2D)

In [None]:
# Build the CNN with the Keras functional API.
inputs = Input(shape=(256, 256, 3))  # input image shape: height, width, channels
# The input pipeline delivers raw pixel values in [0, 255]; rescale them to
# [0, 1] inside the model so fit() and predict() see identically scaled data.
x = layers.Rescaling(1.0 / 255)(inputs)
# Augmentation: random rotation (Keras applies it only in training mode).
x = layers.RandomRotation(factor=(-0.3, 0.3),
                          fill_mode="reflect")(x)

# Block 1: two 3x3 convolutions with 32 filters (ReLU), then 2x2 max pooling.
x = Conv2D(32, (3, 3), activation='relu')(x)
x = Conv2D(32, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)

# Block 2: two 3x3 convolutions with 64 filters (ReLU), then 2x2 max pooling.
x = Conv2D(64, (3, 3), activation='relu')(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)

# Classifier head: flatten, then a 2-unit softmax (one unit per class).
x = Flatten()(x)
outputs = Dense(2, activation='softmax')(x)

cnn_model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Print the model's layers, output shapes and parameter counts.
cnn_model.summary()

In [None]:
# categorical_crossentropy expects one-hot targets, which matches the
# tf.one_hot(..., depth=2) labels produced by my_preprocess.
cnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

## 開始訓練

In [None]:
batch_size = 128
# Batch both pipelines (dataset usage is covered in the course material
# "DL par4 Custom_dataset"). prefetch prepares the next batch in the
# background while the current one is being consumed.
train_dataset_batch = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset_batch = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Steps per epoch = ceil(number of training images / batch_size);
# with the 500-row training split that is ceil(500 / 128) = 4 steps.
# Keep the returned History so loss/accuracy per epoch can be inspected later.
history = cnn_model.fit(train_dataset_batch,
                        validation_data=val_dataset_batch,
                        epochs=30)

## 測試資料 (模擬沒有答案的測試資料)

In [None]:
import random
def my_preprocess_test(img_path):
    """Load one unlabeled JPEG for inference.

    Applies the same decoding and resizing steps as ``my_preprocess`` but
    returns the image only, since test data carries no label.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The image decoded with 3 channels and resized to 256x256.
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(decoded, [256, 256])

In [None]:
# Build the label-free test tf.data.Dataset from the file paths only.
test_path = test_data['file_name']
test_dataset = tf.data.Dataset.from_tensor_slices(test_path)
# Decode/resize in parallel like the training pipeline; the output order still
# follows test_path, so predictions line up with the file names.
test_dataset = test_dataset.map(my_preprocess_test,
                                num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# NOTE: take(10) returns another Dataset object, so this cell only displays
# that object's repr, not the image data; iterate over it to see the tensors.
test_dataset.take(10)

In [None]:
# Run inference on the unlabeled test images in batches of 128.
# The model ends in a 2-unit softmax, so each row of pred should hold the
# two class probabilities for one image.
pred = cnn_model.predict(test_dataset.batch(128))

In [None]:
# Peek at the model outputs for the first five test images.
print(pred[:5])

## 解析模型預測結果，並填入Dataframe中

In [None]:
# Pick the index of the largest score in each row (0 = cat, 1 = dog).
# NOTE: the name is a misspelling of "predict_label"; it is kept because the
# cell that builds test_df reads this exact name.
preditc_label = pred.argmax(axis=1)

In [None]:
# Pair each test file path with its predicted class index (0 = cat, 1 = dog).
test_df = pd.DataFrame({'file_name': test_path, 'prediction': preditc_label})


In [None]:
test_df.head()