1. Load and Explore the Dataset

In [2]:
import pandas as pd

# Load the dataset
train_df = pd.read_csv('train.csv')

# Display the first few rows of the dataframe
print(train_df.head())

# Print class distribution
print(train_df['Class'].value_counts())


  File Name        Class
0     1.jpg        other
1     2.jpg  bright dune
2     3.jpg        other
3     4.jpg       crater
4     5.jpg        other
Class
other            3651
crater           1062
bright dune       597
slope streak      335
swiss cheese      223
dark dune         216
spider             66
impact ejecta      51
Name: count, dtype: int64


2. Preprocess the Images

In [3]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Directory paths (update with actual paths)
train_dir = 'train_dataset'  # Update with the actual path
test_dir = 'test_dataset'    # Update with the actual path

# Parameters
image_size = (227, 227)  # Standard size for all images

# Function to preprocess images
def preprocess_images(directory, image_size):
    images = []
    filenames = []
    for filename in tqdm(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        image = cv2.imread(filepath)
        if image is not None:
            image = cv2.resize(image, image_size)
            image = image / 255.0  # Normalize pixel values
            images.append(image)
            filenames.append(filename)
    return np.array(images), filenames

# Preprocess train and test images
train_images, train_filenames = preprocess_images(train_dir, image_size)
test_images, test_filenames = preprocess_images(test_dir, image_size)

100%|██████████| 6201/6201 [01:05<00:00, 94.61it/s] 
100%|██████████| 2000/2000 [00:20<00:00, 97.93it/s] 


3. Handle Class Imbalance

In [4]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Data augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Prepare labels
train_labels = train_df['Class'].astype('category').cat.codes

4. Split the Data into Training and Validation Sets

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_images, val_images, train_labels, val_labels = train_test_split(
    train_images, train_labels, test_size=0.2, random_state=42)

# Create data generators for training and validation sets
train_generator = datagen.flow(train_images, train_labels, batch_size=32)
validation_generator = datagen.flow(val_images, val_labels, batch_size=32)

5. Build and Train the Model

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.applications import VGG16

# Load the pre-trained model
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Freeze the base model
base_model.trainable = False

# Build the model
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(512, activation='relu'),
    Dense(8, activation='softmax')  # 8 classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_generator, epochs=10, validation_data=validation_generator)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
model.save('model_1')



INFO:tensorflow:Assets written to: model_1\assets


INFO:tensorflow:Assets written to: model_1\assets


6. Evaluate the Model

In [8]:
# Predict the labels of the validation set
val_predictions = model.predict(val_images)
val_pred_labels = np.argmax(val_predictions, axis=1)

# Calculate accuracy
accuracy = np.sum(val_labels == val_pred_labels) / len(val_labels)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 59.55%


In [9]:
# Create a DataFrame to store filenames and their predicted classes
results_df = pd.DataFrame({
    'FileName': test_filenames,
    'PredictedClass': val_pred_labels
})

# Save the DataFrame to a CSV file
results_df.to_csv('test_predictions.csv', index=False)

print("Test predictions saved to 'test_predictions.csv'")


ValueError: All arrays must be of the same length