# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Image folders, one per class (sub-folder 0 -> benign, sub-folder 1 -> malignant)
image_dir_benign = 'C:\\Users\\LENOVO\\OneDrive\\Desktop\\breast-cancer-detection-challenge\\data\\train\\0'
image_dir_malignant = 'C:\\Users\\LENOVO\\OneDrive\\Desktop\\breast-cancer-detection-challenge\\data\\train\\1'

# Folder that holds the CSV files, and the training CSV inside it
base_dir = 'C:\\Users\\LENOVO\\OneDrive\\Desktop'
csv_path = os.path.join(base_dir, 'train_data.csv')

# Read the training table (its file_name / label columns are used below)
train_df = pd.read_csv(filepath_or_buffer=csv_path)

# Function to load images from both folders with debug prints
def load_images(df, dir_benign, dir_malignant, target_size=(224, 224)):
    """Load the images listed in ``df`` together with their labels.

    Each name in ``df['file_name']`` is looked up in ``dir_benign`` first and
    then in ``dir_malignant``. Names found in neither folder are reported and
    skipped, and the label of a skipped row is dropped as well, so the two
    returned arrays always have the same length and stay row-aligned.

    Args:
        df: DataFrame with a ``file_name`` column and a ``label`` column.
        dir_benign: Folder searched first for each file name.
        dir_malignant: Folder searched when the file is not in ``dir_benign``.
        target_size: Forwarded to ``load_img`` as its ``target_size``.

    Returns:
        A ``(images, labels)`` tuple. ``images`` is ``np.array`` over the
        ``img_to_array`` results; ``labels`` contains the ``label`` values of
        the rows whose image was loaded, in the original row order.
    """
    all_labels = df['label'].values
    # Marks the rows whose image was actually loaded; used to filter the labels
    # so that a missing file cannot shift labels onto the wrong image.
    loaded = np.zeros(len(all_labels), dtype=bool)
    images = []
    for row, img_name in enumerate(df['file_name']):
        img_path_benign = os.path.join(dir_benign, img_name)
        img_path_malignant = os.path.join(dir_malignant, img_name)
        if os.path.isfile(img_path_benign):
            img_path = img_path_benign
        elif os.path.isfile(img_path_malignant):
            img_path = img_path_malignant
        else:
            print(f'File not found: {img_name}')
            continue
        print(f'Loading: {img_path}')  # Debug print
        img = load_img(img_path, target_size=target_size)
        images.append(img_to_array(img))
        loaded[row] = True
    return np.array(images), all_labels[loaded]

# Load images
train_images, train_labels = load_images(train_df, image_dir_benign, image_dir_malignant)
print(f'Loaded {len(train_images)} images.')

# Stop early when nothing could be loaded: every step below, and the
# prediction code later in this file, needs a trained `model`. Falling through
# would only fail later with a NameError.
if len(train_images) == 0:
    print('No images loaded. Please check your image directory and file paths.')
    raise SystemExit(1)

# Normalize images (divide by 255; assumes 8-bit pixel values).
# This is the ONLY place the training data is scaled: the generators below must
# not rescale again, otherwise the network is trained on inputs 255x smaller
# than the ones it receives at prediction time.
train_images = train_images / 255.0

# Convert labels to one-hot encoding. num_classes is pinned to 2 so the label
# shape always matches the 2-unit softmax head, even if only one class occurs.
train_labels = to_categorical(train_labels, num_classes=2)

# Split data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    train_images, train_labels, test_size=0.2, random_state=42)

# ImageDataGenerator for Data Augmentation (training data only).
# No `rescale` here: the arrays were already divided by 255 above.
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
# Validation data is passed through unchanged (no augmentation, no rescale).
test_datagen = ImageDataGenerator()

# Wrap the arrays in batch iterators; validation order is kept fixed.
train_generator = train_datagen.flow(X_train, y_train, batch_size=32)
validation_generator = test_datagen.flow(X_test, y_test, batch_size=32, shuffle=False)

# Load the VGG16 model (ImageNet weights, without its classifier head)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers on top
x = base_model.output
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(2, activation='softmax')(x)

# Define the model
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the base model layers so only the new head is trained
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks for early stopping and reducing learning rate
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)

# Train the model
history = model.fit(train_generator, epochs=10, validation_data=validation_generator,
                    callbacks=[early_stopping, reduce_lr])

# Evaluate the model on the held-out validation split
loss, accuracy = model.evaluate(validation_generator)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')
# The test listing lives in the same folder as the training CSV
test_csv_path = os.path.join(base_dir, 'test_data.csv')

# Read the test table
test_df = pd.read_csv(filepath_or_buffer=test_csv_path)

# Function to load and preprocess images for the test set
def load_test_images(df, dir_benign, dir_malignant, target_size=(224, 224)):
    """Load the images listed in ``df`` together with their file names.

    Each name in ``df['file_name']`` is looked up in ``dir_benign`` first and
    then in ``dir_malignant``. Names found in neither folder are reported and
    skipped, and they are also left out of the returned file names, so the
    i-th returned name always belongs to the i-th returned image.

    Args:
        df: DataFrame with a ``file_name`` column.
        dir_benign: Folder searched first for each file name.
        dir_malignant: Folder searched when the file is not in ``dir_benign``.
        target_size: Forwarded to ``load_img`` as its ``target_size``.

    Returns:
        A ``(images, filenames)`` tuple. ``images`` is ``np.array`` over the
        ``img_to_array`` results; ``filenames`` contains the names of the
        images that were loaded, in the original row order.
    """
    filenames = df['file_name'].values
    # Marks the rows whose image was actually loaded; used to filter the names
    # so predictions made on `images` can be paired with them one-to-one.
    loaded = np.zeros(len(filenames), dtype=bool)
    images = []
    for row, img_name in enumerate(filenames):
        img_path_benign = os.path.join(dir_benign, img_name)
        img_path_malignant = os.path.join(dir_malignant, img_name)
        if os.path.isfile(img_path_benign):
            img_path = img_path_benign
        elif os.path.isfile(img_path_malignant):
            img_path = img_path_malignant
        else:
            print(f'File not found: {img_name}')
            continue
        print(f'Loading: {img_path}')  # Debug print
        img = load_img(img_path, target_size=target_size)
        images.append(img_to_array(img))
        loaded[row] = True
    return np.array(images), filenames[loaded]

# Load test images
test_images, test_filenames = load_test_images(test_df, image_dir_benign, image_dir_malignant)

# Fail early with a clear message instead of an opaque error from
# model.predict (empty input) or pd.DataFrame (columns of unequal length).
if len(test_images) == 0:
    raise ValueError('No test images loaded. Please check your image directory and file paths.')
if len(test_filenames) != len(test_images):
    raise ValueError(
        f'Got {len(test_filenames)} file names but {len(test_images)} loaded images; '
        'predictions cannot be matched to file names.'
    )

# Normalize test images (same single division by 255 as the training data)
test_images = test_images / 255.0

# Predict labels for the test set: the index of the highest softmax output
test_predictions = model.predict(test_images)
predicted_labels = np.argmax(test_predictions, axis=1)

# Create a submission DataFrame
submission_df = pd.DataFrame({
    'file_name': test_filenames,
    'label': predicted_labels
})

# Path to the submission CSV file
submission_csv_path = os.path.join(base_dir, 'submission.csv')

# Save the submission DataFrame to a CSV file (without the index column)
submission_df.to_csv(submission_csv_path, index=False)

print(f'Submission file saved to {submission_csv_path}')
