In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# --- CONFIGURATION ---
# Input locations.
# ⚠️ IMPORTANT: IMAGE_DIR must point at the folder holding ALL of your fundus images.
CSV_FILE = '/content/drive/MyDrive/thesis data/full_df.csv'
IMAGE_DIR = '/content/drive/MyDrive/thesis data/preprocessed_images'

# Model / training settings.
NUM_CLASSES = 3  # 0: Non-Myopia, 1: Low/Mild Myopia, 2: High/Pathological Myopia
IMG_SIZE = 224  # Square input edge in pixels; the standard size for ResNet50
BATCH_SIZE = 32
EPOCHS = 10  # Starting point; tune as needed

# --- END CONFIGURATION ---

# 1. Define Classification Mapping based on ODIR-5K Keywords
def classify_myopia(keywords):
    """Map a diagnostic-keyword string to a myopia class index.

    Args:
        keywords: Free-text diagnostic keywords for one eye, or a missing
            value (NaN/None) when no annotation is available.

    Returns:
        2 if the text mentions 'pathological myopia' or 'high myopia',
        1 if it otherwise mentions 'myopia' or 'refractive error',
        0 in every other case, including missing keywords.
    """
    # Missing annotations (NaN/None or the literal string 'nan') count as Non-Myopia.
    if pd.isna(keywords) or keywords == 'nan':
        return 0

    text = keywords.lower()

    # Ordered from most to least severe: 'high myopia' also contains 'myopia',
    # so the stronger class has to be tested first.
    severity_terms = (
        (2, ('pathological myopia', 'high myopia')),  # Strong/Very Strong Myopia (M2)
        (1, ('myopia', 'refractive error')),          # Mild/Low Myopia (M1)
    )
    for class_index, terms in severity_terms:
        if any(term in text for term in terms):
            return class_index

    # No myopia-related term found: Non-Myopia (N).
    return 0

# 2. Data Preparation and Label Generation
print("Starting Data Preparation...")
df = pd.read_csv(CSV_FILE)

# Reshape the data to have one row per eye (Left and Right)
data_left = df[['Left-Fundus', 'Left-Diagnostic Keywords']].rename(
    columns={'Left-Fundus': 'filename', 'Left-Diagnostic Keywords': 'keywords'}
)
data_right = df[['Right-Fundus', 'Right-Diagnostic Keywords']].rename(
    columns={'Right-Fundus': 'filename', 'Right-Diagnostic Keywords': 'keywords'}
)

full_data = pd.concat([data_left, data_right], ignore_index=True)

# Keep only rows whose filename is a real image name. Placeholders and
# missing values (NaN is a float, not a str) are dropped instead of crashing
# os.path.splitext, and the extension check ignores case ('.JPG' == '.jpg').
has_image_ext = full_data['filename'].apply(
    lambda name: isinstance(name, str)
    and os.path.splitext(name)[1].lower() in ('.jpg', '.png')
)

# Drop repeated filenames: if the CSV lists the same image more than once,
# the copies would otherwise be scattered over train/validation/test and
# leak test images into training. The first occurrence is kept.
# reset_index also yields a fresh frame, so the column assignments below do
# not write into a filtered slice (no SettingWithCopyWarning).
full_data = (
    full_data[has_image_ext]
    .drop_duplicates(subset='filename')
    .reset_index(drop=True)
)

# Generate the numerical class labels
full_data['myopia_class'] = full_data['keywords'].apply(classify_myopia)

# Convert the label column to string for Keras flow_from_dataframe
full_data['myopia_class_str'] = full_data['myopia_class'].astype(str)

print(f"Total labeled images: {len(full_data)}")
print(f"Myopia Class Distribution:\n{full_data['myopia_class_str'].value_counts()}")

# 3. Data Split
# Two stratified passes with a fixed seed:
#   pass 1 holds out 20% of all rows as the test set;
#   pass 2 takes 15% of the remaining rows (~12% of the total) for validation.
train_val_df, test_df = train_test_split(
    full_data,
    test_size=0.2,
    random_state=42,
    stratify=full_data['myopia_class_str'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,
    random_state=42,
    stratify=train_val_df['myopia_class_str'],
)

print(f"Train/Validation/Test Split: {len(train_df)} / {len(val_df)} / {len(test_df)}")


# 4. Image Loading and Preprocessing (Data Generators)
# The ImageNet weights of ResNet50 were trained on inputs prepared by
# resnet50.preprocess_input, not on [0, 1]-rescaled pixels. Because the
# convolutional base is frozen, its inputs must be prepared the same way,
# so this function replaces the former `rescale=1./255`.
resnet_preprocess = tf.keras.applications.resnet50.preprocess_input

# Data augmentation for training to prevent overfitting
train_datagen = ImageDataGenerator(
    preprocessing_function=resnet_preprocess,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Only the ResNet50 input preprocessing (no augmentation) for validation and test
val_test_datagen = ImageDataGenerator(preprocessing_function=resnet_preprocess)

# Arguments shared by all three generators
flow_kwargs = dict(
    directory=IMAGE_DIR,
    x_col='filename',
    y_col='myopia_class_str',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

# Create Generators
# NOTE: flow_from_dataframe silently skips filenames it cannot find in
# IMAGE_DIR, so the "Found N validated image filenames" counts can be lower
# than the dataframe sizes.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    shuffle=True,
    **flow_kwargs
)

# Validation generator (order is irrelevant for metrics; keep it deterministic)
validation_generator = val_test_datagen.flow_from_dataframe(
    dataframe=val_df,
    shuffle=False,
    **flow_kwargs
)

# Test generator (do not shuffle, so predictions line up with `.classes`)
test_generator = val_test_datagen.flow_from_dataframe(
    dataframe=test_df,
    shuffle=False,
    **flow_kwargs
)

# 5. Model Building (Transfer Learning with ResNet50)
print("\nBuilding CNN Model...")

# Convolutional base: ResNet50 with ImageNet weights and without its
# original classification top.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)

# Freeze every layer of the base so only the new head is trained.
for frozen_layer in base_model.layers:
    frozen_layer.trainable = False

# Classification head: global average pooling -> 1024-unit ReLU layer ->
# softmax over NUM_CLASSES.
pooled = GlobalAveragePooling2D()(base_model.output)
hidden = Dense(1024, activation='relu')(pooled)
predictions = Dense(NUM_CLASSES, activation='softmax')(hidden)

# Full model: base input through to the softmax output.
model = Model(inputs=base_model.input, outputs=predictions)

# 6. Training and Evaluation
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the class distribution is heavily skewed towards class 0, so
# plain accuracy mostly reflects the majority class. Read the per-class
# recall in the report below, and consider class weighting or resampling.

# Train the model.
# The generators know their own length, so no steps_per_epoch /
# validation_steps are passed. The former `samples // BATCH_SIZE` values
# dropped the last partial batch, and in the recorded run every second epoch
# finished after a single batch because the generator had run out of data.
print("\nStarting Model Training...")
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator
)

# Evaluate on the full Test Set (no `steps`, so the last partial batch is included)
print("\nEvaluating Model on Test Set...")
test_loss, test_accuracy = model.evaluate(test_generator)

print(f"\nFinal Test Set Accuracy: {test_accuracy*100:.2f}%")

# Generate detailed classification report.
# test_generator was built with shuffle=False, so one full pass yields
# predictions in the same order as test_generator.classes.
test_generator.reset()
Y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(Y_pred, axis=1)

# Class names ordered by the index Keras assigned to them
class_indices = test_generator.class_indices
class_labels = sorted(class_indices, key=class_indices.get)
label_ids = list(range(len(class_labels)))

y_true_indices = np.asarray(test_generator.classes)
if len(y_true_indices) != len(y_pred_classes):
    # Fail loudly instead of silently truncating and comparing misaligned labels.
    raise RuntimeError(
        f"Prediction count ({len(y_pred_classes)}) does not match the number "
        f"of test labels ({len(y_true_indices)})."
    )
# Kept under its previous name for any later code that refers to it.
y_true_limited = y_true_indices

# Calculate final accuracy and report.
# `labels` pins the full class list so the report and the matrix keep their
# shape even if a rare class is absent from the test labels or predictions.
# zero_division=0 reports 0.0 for classes that are never predicted instead of
# emitting undefined-metric warnings.
final_accuracy = accuracy_score(y_true_limited, y_pred_classes)
report = classification_report(
    y_true_limited,
    y_pred_classes,
    labels=label_ids,
    target_names=[f'Class {c}' for c in class_labels],
    zero_division=0
)
conf_matrix = confusion_matrix(y_true_limited, y_pred_classes, labels=label_ids)

print("\n--- Detailed Results ---")
print(f"Final Model Classification Accuracy: {final_accuracy*100:.2f}%")
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix (True Labels vs. Predicted Labels):\n", conf_matrix)

Starting Data Preparation...
Total labeled images: 12784
Myopia Class Distribution:
myopia_class_str
0    12305
2      457
1       22
Name: count, dtype: int64
Train/Validation/Test Split: 8692 / 1535 / 2557
Found 8444 validated image filenames belonging to 3 classes.




Found 1487 validated image filenames belonging to 3 classes.




Found 2481 validated image filenames belonging to 3 classes.





Building CNN Model...

Starting Model Training...


  self._warn_if_super_not_called()


Epoch 1/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 579ms/step - accuracy: 0.9378 - loss: 0.2689 - val_accuracy: 0.9606 - val_loss: 0.1682
Epoch 2/10
[1m  1/263[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 83ms/step - accuracy: 0.9688 - loss: 0.1411



[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.9688 - loss: 0.1411 - val_accuracy: 0.9606 - val_loss: 0.1702
Epoch 3/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 548ms/step - accuracy: 0.9631 - loss: 0.1676 - val_accuracy: 0.9613 - val_loss: 0.1675
Epoch 4/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9688 - loss: 0.1367 - val_accuracy: 0.9613 - val_loss: 0.1653
Epoch 5/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 556ms/step - accuracy: 0.9629 - loss: 0.1647 - val_accuracy: 0.9613 - val_loss: 0.1622
Epoch 6/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.9688 - loss: 0.1237 - val_accuracy: 0.9613 - val_loss: 0.1643
Epoch 7/10
[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 554ms/step - accuracy: 0.9646 - loss: 0.1552 - val_accuracy: 0.9613 - val_loss: 0.1532
Epoch 8/10
[1m263/263[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
