In [1]:
import os
import numpy as np
from tensorflow.keras.applications import VGG16, ResNet152
# from tensorflow.keras.applications import DenseNet121,DenseNet201,EfficientNetB4,InceptionV3,ResNet152,VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
from PIL import Image
from tensorflow.keras.layers import GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from keras.utils import to_categorical
from sklearn.metrics import roc_curve, auc
from keras.backend import clear_session

In [2]:
#Mount google drive
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Progression_DR/Improvement_Worsening')

Mounted at /content/drive


In [3]:
image_paths = []
labels = []
file_paths = []

In [4]:
df = pd.read_csv('DR_progress_data_final_Worsen_aug.csv')

In [5]:
all_relevant_rows = []
# Iterate over directories and files
for i in range(-1, 2):
    directory_path = os.path.join('DR_Worsening_aug_final', str(i))

    # Use a list comprehension to get the image files in the directory
    image_files = [file_ for file_ in os.listdir(directory_path) if file_.endswith(('.jpg', '.png', '.jpeg'))]

    # Use a list comprehension to get the file IDs from the image files
    file_ids = [file_.split('.')[0] for file_ in image_files]

    # Filter the DataFrame to get the rows corresponding to the file IDs
    relevant_rows = df[df['ID1'].isin(file_ids)]

    for file_ in image_files:
        file_id = file_.split('.')[0]

        # Retrieve the label for the current file_id
        filtered_rows = relevant_rows.loc[relevant_rows['ID1'] == file_id]
        label = filtered_rows['y_od_os'].values

        if label.size > 0:
            labels.append(label[0])
            all_relevant_rows.append(filtered_rows.iloc[0])
            file_path = os.path.join(directory_path, file_)
            image_paths.append(file_path)
        else:
            print(f'No label found for file ID: {file_id}')

# Verify the lengths match
assert len(image_paths) == len(labels), "Mismatch between image paths and labels"

In [6]:
print(len(labels))
print(len(image_paths))

30000
30000


In [9]:
# Ensure train_labels are one-hot encoded
labels = to_categorical(labels, num_classes=3, dtype='uint8')
print(labels[:10])
print(len(labels))

[[0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]]
30000


In [10]:
# First split: train+validation and test
train_val_paths, test_image_paths, _, test_rows, train_val_labels, test_labels = train_test_split(image_paths, all_relevant_rows, labels, test_size=0.3, random_state=42)

# Second split: train and validation
train_paths, val_paths, train_labels, val_labels = train_test_split(train_val_paths, train_val_labels, test_size=0.25, random_state=42)

# train_image_paths, test_image_paths, _, test_rows, train_labels, test_labels = train_test_split(image_paths, all_relevant_rows, labels, test_size=0.3, random_state=42)
print(len(train_val_paths))
print(len(train_val_labels))
print("len(val_paths)", len(val_paths))
print("len(val_labels)", len(val_labels))

21000
21000
len(val_paths) 5250
len(val_labels) 5250


In [11]:
new_df = pd.DataFrame(columns = df.columns)
relevant_rows_df = pd.DataFrame(test_rows)
new_df = pd.concat([new_df, relevant_rows_df], ignore_index=True)

In [12]:
# print('.'.join(test_image_paths[0].split('/')[-1].split('.')[:-1]))
new_df['predictions'] = ''
new_df.to_csv('predicted_results.csv', sep=',', encoding='utf-8')

In [21]:
from tensorflow.keras.utils import Sequence
import cv2

class DataGenerator(Sequence):
    def __init__(self, image_paths, labels, batch_size=32, target_size=(256, 256), rescale=1./255, num_classes = 3, shuffle=True):
        self.image_paths = image_paths
        self.labels = labels
        self.batch_size = batch_size
        self.target_size = target_size
        self.rescale = rescale
        self.num_classes = num_classes
        self.shuffle = shuffle
        self.indices = np.arange(len(self.image_paths))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self.__data_generation(batch_indices)
        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_indices):
        X = np.empty((len(batch_indices), *self.target_size, 3), dtype=np.float32)

        y = np.empty((len(batch_indices), self.num_classes), dtype=np.float32)  # Modify this according to how your labels are structured

        for i, idx in enumerate(batch_indices):
            image = cv2.imread(self.image_paths[idx])
            image = cv2.resize(image, self.target_size)
            image = image.astype('float32') * self.rescale
            X[i, ] = image
            y[i] = self.labels[idx]

        return X, y

In [None]:
num_folds = 3

kf = KFold(n_splits=num_folds, shuffle=True)

clear_session()

# base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

# Freeze the base model layers
for layer in base_model.layers:
    layer.trainable = False

# Optionally, unfreeze the last few layers for fine-tuning
for layer in base_model.layers[-10:]:
    layer.trainable = True

# # Add the top layers
model = Sequential()
model.add(base_model)
model.add(GlobalAveragePooling2D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model for classification
model.compile(optimizer=Adam(learning_rate=0.00001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, min_lr=0.000001)

# Define callbacks
checkpoint_filepath = os.path.join('./checkpoints', f'vgg16_fold_final_checkpoint.keras')
checkpoint = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='min'
)

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=8,
    verbose=1,
    mode='min'
)

# Create data generators for this fold
train_generator = DataGenerator(train_paths, train_labels, batch_size=32, target_size=(256, 256), rescale=1./255)
val_generator = DataGenerator(val_paths, val_labels, batch_size=32, target_size=(256, 256), rescale=1./255, shuffle=False)

# Train the model on the current fold
model.fit(
    train_generator,
    epochs=1,
    validation_data=val_generator,
    callbacks=[checkpoint, early_stopping, reduce_lr],
    batch_size=32,
    )

# Calculate training loss and accuracy for the current fold
train_loss, train_accuracy = model.evaluate(train_generator, verbose=1)

# Calculate validation loss and accuracy for the current fold
val_loss, val_accuracy = model.evaluate(val_generator, verbose=1)

model.save("checkpoints/vgg16_final.keras")

print(f"\nAverage Training Loss: {train_loss}")
print(f"Average Training Accuracy: {train_accuracy}")

print(f"\nAverage Validation Loss: {val_loss}")
print(f"Average Validation Accuracy: {val_accuracy}")

 78/493 [===>..........................] - ETA: 1:11:33 - loss: 1.1692 - accuracy: 0.3486

In [15]:
# prompt: Load and test the saved model using test_image_paths
predicted_df = pd.read_csv('predicted_results.csv')
predicted_df.columns

# Load the saved model
model = keras.models.load_model('checkpoints/vgg16_final.keras')

# Create a data generator for the test images
test_generator = DataGenerator(test_image_paths, test_labels, batch_size=32, target_size=(256, 256), rescale=1./255, shuffle=False)

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(test_generator, verbose=1)

# Print the results
print(f"\nAverage Test Loss: {test_loss}")
print(f"Average Test Accuracy: {test_accuracy}")

# Make predictions on the test data
test_predictions = model.predict(test_generator)

# Save the predictions to a CSV file
# predicted_ages = []
for i, test_image_path in enumerate(test_image_paths):
    file_id = test_image_path.split('/')[-1].split('.')[0]
    predicted_age = test_predictions[i][0]
    predicted_df.loc[df['ID1'] == file_id, 'predicted_age'] = predicted_age
    # predicted_ages.append({'ID1': file_id, 'predicted_age': predicted_age})

predicted_df.to_csv('predicted_results_1.csv', sep=',', encoding='utf-8', index=False)



NameError: name 'keras' is not defined