<a href="https://colab.research.google.com/github/RonnyMuthomi/Auth/blob/main/GalaxyZoo12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import mixed_precision



In [None]:
from google.colab import drive
import os
import zipfile

# Mount Google Drive
drive.mount('/content/drive')

# Define paths in Google Drive
drive_dataset_path = "/content/drive/MyDrive/Attach/my_galaxy_dataset"
drive_images_zip_path = "/content/drive/MyDrive/Attach/images_training_rev1.zip"
drive_solutions_zip_path = "/content/drive/MyDrive/Attach/training_solutions_rev1.zip"

# Create the dataset folder in Google Drive
os.makedirs(drive_dataset_path, exist_ok=True)
os.makedirs(os.path.join(drive_dataset_path, "images"), exist_ok=True)

# Extract the images
with zipfile.ZipFile(drive_images_zip_path, 'r') as zip_ref:
    zip_ref.extractall(os.path.join(drive_dataset_path, "images"))

# Extract the training solutions
with zipfile.ZipFile(drive_solutions_zip_path, 'r') as zip_ref:
    zip_ref.extractall(drive_dataset_path)

# Locate the CSV file and rename it to metadata.csv
csv_path = os.path.join(drive_dataset_path, "training_solutions_rev1.csv")
if os.path.exists(csv_path):
    os.rename(csv_path, os.path.join(drive_dataset_path, "metadata.csv"))
else:
    # If the CSV file is inside a subfolder (e.g., training_solutions_rev1)
    subfolder_path = os.path.join(drive_dataset_path, "training_solutions_rev1")
    if os.path.exists(subfolder_path):
        csv_path = os.path.join(subfolder_path, "training_solutions_rev1.csv")
        if os.path.exists(csv_path):
            os.rename(csv_path, os.path.join(drive_dataset_path, "metadata.csv"))
        else:
            print("Error: Could not find training_solutions_rev1.csv in the subfolder.")
    else:
        print("Error: Could not find training_solutions_rev1.csv or its subfolder.")

print("Files extracted and metadata.csv created successfully!")

# Verify the final structure
print("\nFinal contents of /content/drive/MyDrive/Attach/my_galaxy_dataset:")
print(os.listdir(drive_dataset_path))

print("\nContents of /content/drive/MyDrive/Attach/my_galaxy_dataset/images:")
print(os.listdir(os.path.join(drive_dataset_path, "images")))

'/content/training_solutions_rev1.zip'

Files extracted and metadata.csv created successfully!

Final contents of /content/my_galaxy_dataset:
['images', 'metadata.csv']

Contents of /content/my_galaxy_dataset/images:
['images_training_rev1']


In [10]:

# Load the metadata
metadata_path = "/content/my_galaxy_dataset/metadata.csv"
metadata = pd.read_csv(metadata_path)

# Display the first few rows
print(metadata.head())

   GalaxyID  Class1.1  Class1.2  Class1.3  Class2.1  Class2.2  Class3.1  \
0    100008  0.383147  0.616853  0.000000  0.000000  0.616853  0.038452   
1    100023  0.327001  0.663777  0.009222  0.031178  0.632599  0.467370   
2    100053  0.765717  0.177352  0.056931  0.000000  0.177352  0.000000   
3    100078  0.693377  0.238564  0.068059  0.000000  0.238564  0.109493   
4    100090  0.933839  0.000000  0.066161  0.000000  0.000000  0.000000   

   Class3.2  Class4.1  Class4.2  ...  Class9.3  Class10.1  Class10.2  \
0  0.578401  0.418398  0.198455  ...  0.000000   0.279952   0.138445   
1  0.165229  0.591328  0.041271  ...  0.018764   0.000000   0.131378   
2  0.177352  0.000000  0.177352  ...  0.000000   0.000000   0.000000   
3  0.129071  0.189098  0.049466  ...  0.000000   0.094549   0.000000   
4  0.000000  0.000000  0.000000  ...  0.000000   0.000000   0.000000   

   Class10.3  Class11.1  Class11.2  Class11.3  Class11.4  Class11.5  Class11.6  
0   0.000000   0.000000   0.092886 

In [11]:
# Step 4: Verify GPU availability and enable mixed precision
print("GPU Available:",
len(tf.config.list_physical_devices('GPU')) > 0)

GPU Available: False


In [12]:
# Enable mixed precision for faster training on GPUs
policy = mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print("Mixed precision enabled:", policy)

Mixed precision enabled: <DTypePolicy "mixed_float16">


In [13]:
# Step 5: Set the path to the dataset
path_to_dataset = "/content/my_galaxy_dataset"

In [14]:
# Step 6: Load metadata
metadata = pd.read_csv(f"{path_to_dataset}/metadata.csv")

In [15]:
# Create the 'filename' column
metadata['filename'] = metadata['GalaxyID'].astype(str) + '.jpg'

# Display the updated DataFrame
print(metadata[['GalaxyID', 'filename']].head())

   GalaxyID    filename
0    100008  100008.jpg
1    100023  100023.jpg
2    100053  100053.jpg
3    100078  100078.jpg
4    100090  100090.jpg


In [16]:
# List the first 5 image files in the subfolder
subfolder_path = "/content/my_galaxy_dataset/images/images_training_rev1"
image_files = os.listdir(subfolder_path)
print(image_files[:5])

['113692.jpg', '251280.jpg', '403430.jpg', '265661.jpg', '725080.jpg']


In [17]:
# Update the 'filename' column to include the subfolder path
metadata['filename'] = "/content/my_galaxy_dataset/images/images_training_rev1/" + metadata['GalaxyID'].astype(str) + '.jpg'

# Verify the updated filenames
print(metadata['filename'].head())

0    /content/my_galaxy_dataset/images/images_train...
1    /content/my_galaxy_dataset/images/images_train...
2    /content/my_galaxy_dataset/images/images_train...
3    /content/my_galaxy_dataset/images/images_train...
4    /content/my_galaxy_dataset/images/images_train...
Name: filename, dtype: object


In [None]:
# Split the data into training and validation sets
train_df, val_df = train_test_split(metadata, test_size=0.2, random_state=42)



# Update the label column to categorical format
train_df['label'] = train_df['Class1.1'].apply(lambda x: 1 if x > threshold else 0)  # Binary labels (0 or 1)
val_df['label'] = val_df['Class1.1'].apply(lambda x: 1 if x > threshold else 0)  # Binary labels (0 or 1)

# Ensure the label column is categorical
train_df['label'] = train_df['label'].astype(str)
val_df['label'] = val_df['label'].astype(str)

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)


# Update the generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,  # Set to None since filenames contain full paths
    x_col="filename",
    y_col="label",  # Use the 'label' column
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'  # Use 'categorical' for classification
)

# Get the number of classes
num_classes = len(train_generator.class_indices)

val_datagen = ImageDataGenerator(rescale=1./255)


val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=None,  # Set to None since filenames contain full paths
    x_col="filename",
    y_col="label",  # Use the 'label' column
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'  # Use 'categorical' for classification
)

# Verify the generators
print(f"Found {train_generator.samples} training samples.")
print(f"Found {val_generator.samples} validation samples.")
print(f"Class indices: {train_generator.class_indices}")

# Build the model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)  # num_classes should be 2 for binary classification

model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


# Step 10: Define the checkpoint callback
checkpoint_path ='/content/drive/MyDrive/checkpoints/galaxy_model_checkpoint.h5'
os.makedirs('/content/drive/MyDrive/checkpoints',
exist_ok=True) # Ensure directory exists

checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max', verbose=1
)

# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=val_generator,
    validation_steps=val_generator.samples // val_generator.batch_size,
    epochs=10,
    callbacks=[checkpoint]
)

Found 49262 validated image filenames belonging to 2 classes.
Found 12316 validated image filenames belonging to 2 classes.
Found 49262 training samples.
Found 12316 validation samples.
Class indices: {'0': 0, '1': 1}


  self._warn_if_super_not_called()


Epoch 1/10
[1m1539/1539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5454 - loss: 0.6992
Epoch 1: val_accuracy improved from -inf to 0.58626, saving model to /content/drive/MyDrive/checkpoints/galaxy_model_checkpoint.h5




[1m1539/1539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3913s[0m 3s/step - accuracy: 0.5454 - loss: 0.6992 - val_accuracy: 0.5863 - val_loss: 0.6778
Epoch 2/10
[1m   1/1539[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m56:29[0m 2s/step - accuracy: 0.5000 - loss: 0.6974




Epoch 2: val_accuracy improved from 0.58626 to 0.58634, saving model to /content/drive/MyDrive/checkpoints/galaxy_model_checkpoint.h5




[1m1539/1539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m822s[0m 533ms/step - accuracy: 0.5000 - loss: 0.6974 - val_accuracy: 0.5863 - val_loss: 0.6778
Epoch 3/10
[1m 509/1539[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m35:06[0m 2s/step - accuracy: 0.5809 - loss: 0.6820

In [None]:
# Step 12: Evaluate the model
loss, accuracy = model.evaluate(val_generator)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

In [None]:
# Step 13: Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Step 14: Save the final model
model.save('/content/drive/MyDrive/galaxy_classification_model.h5')

In [None]:
# Step 15: Make predictions on a sample image
from tensorflow.keras.preprocessing import image

img_path = f"{path_to_dataset}/images/image1.jpg" # Replace with an actual filename
if os.path.exists(img_path):
  img = image.load_img(img_path, target_size=(128, 128))
  img_array = image.img_to_array(img)
  img_array = np.expand_dims(img_array, axis=0)
  img_array /= 255.0

  prediction = model.predict(img_array)
  predicted_class = np.argmax(prediction, axis=1)

  class_labels = list(train_generator.class_indices.keys())
  print(f"Predicted Class:{class_labels[predicted_class[0]]}")
else:
  print(f"Error: The file {img_path} does not exist.")

Found 49262 validated image filenames belonging to 2 classes.


TypeError: If class_mode="categorical", y_col="Class1.1" column values must be type string, list or tuple.

Found 49262 validated image filenames belonging to 2 classes.
Found 12316 validated image filenames.
Found 49262 validated image filenames.


  self._warn_if_super_not_called()


Epoch 1/10
[1m1539/1539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5553 - loss: 0.6983

ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 2)

ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 2)