In [1]:
# Import các thư viện cần thiết
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.utils import class_weight

# Đường dẫn đến dữ liệu
train_dir = '/kaggle/input/ai-training-challenge-hutech-orange-classifier/old_oranges_data_1/old_oranges_data/train_set'  
test_dir = '/kaggle/input/ai-training-challenge-hutech-orange-classifier/old_oranges_data_1/old_oranges_data/test_set'  

# Tiền xử lý dữ liệu
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
    validation_split=0.2
)

test_datagen = ImageDataGenerator(rescale=1./255)

# Tạo generator cho tập train và validation
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='validation'
)

# Tạo generator cho tập test
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    shuffle=False
)

# Tính toán class weights để xử lý mất cân bằng dữ liệu
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_generator.classes),
    y=train_generator.classes
)
class_weights = dict(enumerate(class_weights))

# Sử dụng EfficientNetB0 làm mô hình cơ sở
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Đóng băng các lớp của mô hình cơ sở

# Thêm các lớp tùy chỉnh
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
predictions = Dense(1, activation='sigmoid')(x)

# Tạo mô hình cuối cùng
model = Model(inputs=base_model.input, outputs=predictions)

# Biên dịch mô hình
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

# Huấn luyện mô hình
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // validation_generator.batch_size,
    epochs=30,
    callbacks=[early_stopping, checkpoint, reduce_lr],
    class_weight=class_weights
)

# Fine-tuning
for layer in base_model.layers[-20:]:
    layer.trainable = True

model.compile(optimizer=Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])

fine_tune_history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // validation_generator.batch_size,
    epochs=10,
    callbacks=[early_stopping, checkpoint, reduce_lr],
    class_weight=class_weights
)

# Đánh giá mô hình trên tập test
test_loss, test_acc = model.evaluate(test_generator)
print(f'Test Accuracy: {test_acc:.4f}')

# Dự đoán trên tập test và tạo file submission.csv
predictions = model.predict(test_generator)
predictions = (predictions > 0.5).astype(int).flatten()

filenames = test_generator.filenames
results = pd.DataFrame({
    'image_name': filenames,
    'label': predictions
})

results.to_csv('submission.csv', index=False)
print("File submission.csv đã được tạo thành công!")

Found 1761 images belonging to 2 classes.
Found 440 images belonging to 2 classes.
Found 400 images belonging to 2 classes.
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/30


  self._warn_if_super_not_called()


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 890ms/step - accuracy: 0.4709 - loss: 1.0776 - val_accuracy: 0.4159 - val_loss: 0.6947 - learning_rate: 1.0000e-04
Epoch 2/30
[1m 1/55[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 32ms/step - accuracy: 0.6562 - loss: 0.7035

  self.gen.throw(typ, value, traceback)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 102ms/step - accuracy: 0.6562 - loss: 0.7035 - val_accuracy: 0.5000 - val_loss: 0.6932 - learning_rate: 1.0000e-04
Epoch 3/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 498ms/step - accuracy: 0.5134 - loss: 0.9299 - val_accuracy: 0.4231 - val_loss: 0.6987 - learning_rate: 1.0000e-04
Epoch 4/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5000 - loss: 0.8595 - val_accuracy: 0.3750 - val_loss: 0.7025 - learning_rate: 1.0000e-04
Epoch 5/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 494ms/step - accuracy: 0.5003 - loss: 0.9432 - val_accuracy: 0.4255 - val_loss: 0.7107 - learning_rate: 1.0000e-04
Epoch 6/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4688 - loss: 1.2312 - val_accuracy: 0.3333 - val_loss: 0.7276 - learning_rate: 2.0000e-05
Epoch 7/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m