In [None]:
# Project 3
# Oneza Vhora and Shalini Sundar

#Project Description: You are given a dataset which contains satellite images from Texas after Hurricane Harvey. There are damaged and non-damaged building images organized into respective folders.
#Your goal is to build multiple neural networks based on different architectures to classify images as containing buildings that are either damaged or not damaged. You will evaluate each of the networks you develop and produce and select the “best” network to “deploy”. Note that this is a binary classification problem, where the goal it to classify whether the structure in the image has damage or does not have damage.
#Part 1: (3 points) Data preprocessing and visualization
#You will need to perform data analysis and pre-processing to prepare the images for training. At a minimum, you should:
#Write code to load the data into Python data structures
#Investigate the datasets to determine basic attributes of the images
#Ensure data is split for training, validation and testing and perform any additional preprocessing (e.g., rescaling, normalization, etc.) so that it can be used for training/evaluation of the neural networks you will build in Part 2.
#Part 2: (10 points) Model design, training and evaluation
#You will explore different model architectures that we have seen in class, including:
#A dense (i.e., fully connected) ANN
#The Lenet-5 CNN architecture
#Alternate-Lenet-5 CNN architecture, described in the following paper (Table 1, Page 12 of the research paper https://arxiv.org/pdf/1807.01688.pdf, but note that the dataset is not the same as that analyzed in the paper.)
#You are free to experiment with different variants on all three architectures above. For example, for the fully connected ANN, feel free to experiment with different numbers of layers and perceptrons. Train and evaluate each model you build,and select the “best” performing model.
#Note that the input and output dimensions are fixed, as the inputs (images) and the outputs (labels) have been given. These have important implications for your architecture. Make sure you understand the constraints these impose before beginning to design and implement your networks. Failure to implement these correctly will lead to incorrect architectures and significant penalty on the project grade.

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np
import shutil
import os
from pathlib import Path
import random

# Define the base directory for the dataset
base_dir = 'coe379L-sp25/datasets/unit03/Project3'

# Verify the directory structure
damage_dir = os.path.join(base_dir, 'damage')
no_damaged_dir = os.path.join(base_dir, 'no damage')

# Make sure directories are clean before running the code
try :
    shutil.rmtree('train')
    shutil.rmtree('test')
except:
    pass

# Create directories for training and testing. Ensure directories exist.
Path('train').mkdir(parents=True, exist_ok=True)
Path('test').mkdir(parents=True, exist_ok=True)

# Split data into training and testing sets
def filter_images_files(base_dir):
    """Filter out non-image files from the dataset."""
    all_files = os.listdir(base_dir)
    return [f for f in all_files if f.endswith(('.jpg', '.jpeg', '.png'))]  

# Get all image files from damaged and undamaged directories
damage_files = filter_images_files(damage_dir)
no_damage_files = filter_images_files(no_damaged_dir)

# Combine all image files and shuffle them
all_files = [os.path.join(damage_dir, f) for f in damage_files] + \
    [os.path.join(no_damaged_dir, f) for f in no_damage_files]
random.shuffle(all_files)

#split
split_idx = int(len(all_files) * 0.8)
train_files = all_files[:split_idx]
test_files = all_files[split_idx:]

# copy files to train and test directories
for f in train_files:
    shutil.copy(f, 'train/')
for f in test_files:
    shutil.copy(f, 'test/')

# log the number of images in each directory
print(f"Total images: {sum(len(all_files))}")
print(f"Training images: {sum(len(train_files))}")
print(f"Testing images: {sum(len(test_files))}")    

FileNotFoundError: [Errno 2] No such file or directory: 'coe379L-sp25/datasets/unit03/Project3/no damage'

In [17]:
# Preprocess data
datagen = ImageDataGenerator(rescale=1.0/255, validation_split=0.2)

train_generator = datagen.flow_from_directory(
    "train",
    target_size=(150, 150),
    batch_size=32,
    class_mode="binary",
    subset="training"
)

validation_generator = datagen.flow_from_directory(
    "train",
    target_size=(150, 150),
    batch_size=32,
    class_mode="binary",
    subset="validation"
)

test_datagen = ImageDataGenerator(rescale=1.0/255)

test_generator = test_datagen.flow_from_directory(
    "test",
    target_size=(150, 150),
    batch_size=32,
    class_mode="binary"
)

Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.


In [None]:
# Define the model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Single output neuron for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=10,  # Adjust the number of epochs as needed
    steps_per_epoch=len(train_generator),
    validation_steps=len(validation_generator)
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")