<a href="https://colab.research.google.com/github/RonnyMuthomi/Auth/blob/main/GalaxyZoo1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import mixed_precision



In [9]:
from google.colab import files
import os
import zipfile

# Upload the files
uploaded = files.upload()

# Extract the training images
zip_path = "/content/images_training_rev1.zip"
extract_path = "/content/my_galaxy_dataset/images"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Images extracted successfully!")

Saving training_solutions_rev1.zip to training_solutions_rev1.zip
Images extracted successfully!


In [19]:
import os

# List the contents of the dataset folder
print("Contents of /content/my_galaxy_dataset:")
print(os.listdir("/content/my_galaxy_dataset"))

# Rename the CSV file to metadata.csv
!mv /content/my_galaxy_dataset/training_solutions_rev1.csv /content/my_galaxy_dataset/metadata.csv

# Verify the final structure
print("\nFinal contents of /content/my_galaxy_dataset:")
print(os.listdir("/content/my_galaxy_dataset"))

# List the contents of the images directory
print("\nContents of /content/my_galaxy_dataset/images:")
print(os.listdir("/content/my_galaxy_dataset/images"))

In [16]:
import os

# List the contents of the dataset folder
print(os.listdir("/content/my_galaxy_dataset"))

['training_solutions_rev1.csv', 'images']


Contents of /content/my_galaxy_dataset:
['training_solutions_rev1.csv', 'images']

Final contents of /content/my_galaxy_dataset:
['images', 'metadata.csv']

Contents of /content/my_galaxy_dataset/images:
['images_training_rev1']


In [18]:

# Load the metadata
metadata_path = "/content/my_galaxy_dataset/metadata.csv"
metadata = pd.read_csv(metadata_path)

# Display the first few rows
print(metadata.head())

   GalaxyID  Class1.1  Class1.2  Class1.3  Class2.1  Class2.2  Class3.1  \
0    100008  0.383147  0.616853  0.000000  0.000000  0.616853  0.038452   
1    100023  0.327001  0.663777  0.009222  0.031178  0.632599  0.467370   
2    100053  0.765717  0.177352  0.056931  0.000000  0.177352  0.000000   
3    100078  0.693377  0.238564  0.068059  0.000000  0.238564  0.109493   
4    100090  0.933839  0.000000  0.066161  0.000000  0.000000  0.000000   

   Class3.2  Class4.1  Class4.2  ...  Class9.3  Class10.1  Class10.2  \
0  0.578401  0.418398  0.198455  ...  0.000000   0.279952   0.138445   
1  0.165229  0.591328  0.041271  ...  0.018764   0.000000   0.131378   
2  0.177352  0.000000  0.177352  ...  0.000000   0.000000   0.000000   
3  0.129071  0.189098  0.049466  ...  0.000000   0.094549   0.000000   
4  0.000000  0.000000  0.000000  ...  0.000000   0.000000   0.000000   

   Class10.3  Class11.1  Class11.2  Class11.3  Class11.4  Class11.5  Class11.6  
0   0.000000   0.000000   0.092886 

In [20]:
# Step 4: Verify GPU availability and enable mixed precision
print("GPU Available:",
len(tf.config.list_physical_devices('GPU')) > 0)

GPU Available: False


In [21]:
# Enable mixed precision for faster training on GPUs
policy = mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)
print("Mixed precision enabled:", policy)

Mixed precision enabled: <DTypePolicy "mixed_float16">


In [22]:
# Step 5: Set the path to the dataset
path_to_dataset = "/content/my_galaxy_dataset"

In [23]:
# Step 6: Load metadata
metadata = pd.read_csv(f"{path_to_dataset}/metadata.csv")

In [34]:
# Create the 'filename' column
metadata['filename'] = metadata['GalaxyID'].astype(str) + '.jpg'

# Display the updated DataFrame
print(metadata[['GalaxyID', 'filename']].head())

   GalaxyID    filename
0    100008  100008.jpg
1    100023  100023.jpg
2    100053  100053.jpg
3    100078  100078.jpg
4    100090  100090.jpg


In [42]:
# List the first 5 image files in the subfolder
subfolder_path = "/content/my_galaxy_dataset/images/images_training_rev1"
image_files = os.listdir(subfolder_path)
print(image_files[:5])

['113692.jpg', '251280.jpg', '403430.jpg', '265661.jpg', '725080.jpg']


In [43]:
# Update the 'filename' column to include the subfolder path
metadata['filename'] = "/content/my_galaxy_dataset/images/images_training_rev1/" + metadata['GalaxyID'].astype(str) + '.jpg'

# Verify the updated filenames
print(metadata['filename'].head())

0    /content/my_galaxy_dataset/images/images_train...
1    /content/my_galaxy_dataset/images/images_train...
2    /content/my_galaxy_dataset/images/images_train...
3    /content/my_galaxy_dataset/images/images_train...
4    /content/my_galaxy_dataset/images/images_train...
Name: filename, dtype: object


In [57]:

# Load the metadata
metadata_path = "/content/my_galaxy_dataset/metadata.csv"
metadata = pd.read_csv(metadata_path)

# Update the 'filename' column to include the subfolder path
metadata['filename'] = "/content/my_galaxy_dataset/images/images_training_rev1/" + metadata['GalaxyID'].astype(str) + '.jpg'


# Split the data into training and validation sets
train_df, val_df = train_test_split(metadata, test_size=0.2, random_state=42)

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator(rescale=1./255)

# Add a new column with categorical labels
threshold = 0.5  # Adjust the threshold as needed
train_df['label'] = train_df['Class1.1'].apply(lambda x: 'spiral' if x > threshold else 'elliptical')


train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,  # Set to None since filenames contain full paths
    x_col="filename",
    y_col="label",  # Use the new 'label' column
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'  # Use 'categorical' for classification
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=None,  # Set to None since filenames contain full paths
    x_col="filename",
    y_col="Class1.1",  # Replace with the appropriate label column
    target_size=(128, 128),
    batch_size=32,
    class_mode='raw'  # Use 'raw' for regression or 'categorical' for classification
)

# Verify the fix
print(f"Found {train_generator.samples} validated image filenames.")

Found 49262 validated image filenames belonging to 2 classes.
Found 12316 validated image filenames.
Found 49262 validated image filenames.


In [58]:
# Get the number of classes
num_classes = len(train_generator.class_indices)









In [59]:
# Step 9: Build the model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam',
loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [60]:
# Step 10: Define the checkpoint callback
checkpoint_path ='/content/drive/MyDrive/checkpoints/galaxy_model_checkpoint.h5'
os.makedirs('/content/drive/MyDrive/checkpoints',
exist_ok=True) # Ensure directory exists

checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max', verbose=1
)

In [None]:
# Step 11: Train the model with checkpoints
history = model.fit(train_generator, steps_per_epoch=train_generator.samples // train_generator.batch_size,
                    validation_data=val_generator, validation_steps=val_generator.samples // val_generator.batch_size,
                    epochs=10, callbacks=[checkpoint])

  self._warn_if_super_not_called()


Epoch 1/10
[1m 271/1539[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:05:40[0m 3s/step - accuracy: 0.5461 - loss: 0.7110