# Objective

1. Explore the Crater dataset and check the distributions
2. Identify anomalies which can be used to detect the craters.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import cv2

from PIL import Image

In [3]:
source = os.path.join('..', 'data')

train_imgs_path = os.path.join(source, 'train/images')
train_lbls_path = os.path.join(source, 'train/labels')

test_imgs_path = os.path.join(source, 'test/images')
test_lbls_path = os.path.join(source, 'test/labels')

val_imgs_path = os.path.join(source, 'valid/images')
val_lbls_path = os.path.join(source, 'valid/labels')

In [4]:
def load_labels(label_path):
    label_files = os.listdir(label_path)
    data = []
    classes = set()
    for file in label_files:
        with open(os.path.join(label_path, file), 'r') as f:
            lines = f.readlines()
            for line in lines:
                box_info = list(map(float, line.strip().split()))
                data.append([file, *box_info])
                class_id = box_info[0]
                classes.add(int(class_id))
    df = pd.DataFrame(data, columns=['file', 
                                     'class_id', 
                                     'center_x',
                                     'center_y',
                                     'width',
                                     'height'
                                    ])
    return df, sorted(classes)

train_labels, train_classes = load_labels(train_lbls_path)
val_labels, val_classes = load_labels(val_lbls_path)
test_labels, test_classes = load_labels(test_lbls_path)

In [5]:
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   file      681 non-null    object 
 1   class_id  681 non-null    float64
 2   center_x  681 non-null    float64
 3   center_y  681 non-null    float64
 4   width     681 non-null    float64
 5   height    681 non-null    float64
dtypes: float64(5), object(1)
memory usage: 32.1+ KB


In [6]:
train_labels.head()

Unnamed: 0,file,class_id,center_x,center_y,width,height
0,mars_crater--110-_jpg.rf.593f6a3d9aed98e7a0895...,0.0,0.325781,0.416406,0.230469,0.354687
1,mars_crater--43-_jpg.rf.27ea7a65603205e491bc43...,0.0,0.384375,0.480469,0.054688,0.083594
2,mars_crater--43-_jpg.rf.27ea7a65603205e491bc43...,0.0,0.410938,0.0875,0.039844,0.057031
3,mars_crater--24-_jpg.rf.4614bb6844fda70ea8a3b4...,0.0,0.517969,0.499219,0.891406,0.998437
4,mars_crater--118-_jpg.rf.0a8b3fb0e1332e576901e...,0.0,0.675781,0.082031,0.040625,0.052344


# Load a sample image

In [8]:
## Open a sample image
image_name = os.listdir(train_imgs_path)[0]
img = cv2.imread(os.path.join(train_imgs_path, image_name))
img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)

# Find the corresponding file
file, _ = os.path.splitext(os.listdir(train_imgs_path)[0])
file_name = file + ".txt"
rows = train_labels.loc[train_labels['file'] == file_name]

img_width, img_height, _ = img.shape
for i, row in rows.iterrows():
    x, y, h, w = row['center_x'], row['center_y'], row['width'], row['height']
    
    x_min = int((x - w / 2) * img_width)
    x_max = int((x + w / 2) * img_width)
    y_min = int((y - h / 2) * img_height)
    y_max = int((y + h / 2) * img_height)
    
    color = (0, 255, 0)
    thickness = 2
    img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness)

    label = f"Crater {i}"
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    text_color = (255, 255, 255)
    cv2.putText(img, label, (x_min, y_min - 10), font, font_scale, text_color, font_thickness)
    
fig = px.imshow(img, title="A sample image with craters")
fig.update_layout(coloraxis_showscale=False)
fig.show()

# Bounding box area distribution

In [9]:
def plot_bb_dist(labels, title):
    labels['area'] = labels['width'] * labels['height']
    fig = px.histogram(labels, x='area', nbins=40, title=title)
    fig.show()

def plot_width_and_height_dist(labels):
    fig = px.histogram(labels, x='width', nbins=40, title='Width Distribution of the above data')
    fig2 = px.histogram(labels, x='height', nbins=40, title='Height Distribution of the above data')
    fig.show()
    fig2.show()
    

plot_bb_dist(train_labels, 'Train Bounding Box Area Distribution')
plot_width_and_height_dist(train_labels)
plot_bb_dist(val_labels, 'Validation Bounding Box Area Distribution')
plot_width_and_height_dist(val_labels)
plot_bb_dist(test_labels, 'Test Bounding Box Area Distribution')
plot_width_and_height_dist(test_labels)

# Aspect ratio distribution (width/height)

In [10]:
def plot_aspect_rt(labels):
    labels['ratio'] = labels['width'] / labels['height'] 
    fig = px.histogram(labels, x='ratio', nbins=40, title='Aspect Ratio Distribution')
    fig.show()

plot_aspect_rt(train_labels)
plot_aspect_rt(val_labels)
plot_aspect_rt(test_labels)

In [12]:
def extract_pixels(row: pd.Series, image_dir: str):
    """
    Extract pixel intensities from the bounding box (crater region) 
    and the background region of an image.

    Returns:
        tuple: Two flattened numpy arrays - (crater_pixels, background_pixels).
    """
    file_name, _  = os.path.splitext(row["file"])
    file_path = os.path.join(image_dir, file_name + ".jpg")
    img = Image.open(file_path).convert("L")  # Convert to grayscale
    
    img_width, img_height = img.size
    
    x_min = int((row["center_x"] - row["width"] / 2) * img_width)
    x_max = int((row["center_x"] + row["width"] / 2) * img_width)
    y_min = int((row["center_y"] - row["height"] / 2) * img_height)
    y_max = int((row["center_y"] + row["height"] / 2) * img_height)
    
    # Ensure bounding box is within image dimensions
    x_min, x_max = max(0, x_min), min(img_width, x_max)
    y_min, y_max = max(0, y_min), min(img_height, y_max)
    
    img_array = np.array(img)
    
    crater_pixels = img_array[y_min: y_max, x_min:x_max].flatten()
    
    mask = np.ones((img_height, img_width), dtype=bool)
    mask[y_min:y_max, x_min:x_max] = False
    
    background_pixels = img_array[mask]
    
    return crater_pixels, background_pixels

In [16]:
train_labels['pixels'] = train_labels.apply(lambda row: extract_pixels(row, train_imgs_path), axis=1)

train_labels["crater_pixels"] = train_labels["pixels"].apply(lambda x: x[0])
train_labels["background_pixels"] = train_labels["pixels"].apply(lambda x: x[1])

all_crater_pixels = np.concatenate(train_labels["crater_pixels"].values)
all_background_pixels = np.concatenate(train_labels["background_pixels"].values)

In [26]:
# Precompute bins and frequencies
crater_hist, bin_edges = np.histogram(all_crater_pixels, bins=50, range=(0, 255))
background_hist, _ = np.histogram(all_background_pixels, bins=50, range=(0, 255))

print("Crater histogram:", crater_hist)
print("Background histogram:", background_hist)

Crater histogram: [101655  65917  71305  56961  50377  54481  61623  69593  81068  94585
 128094 119639 132179 146335 161861 178745 199309 219566 241703 257797
 321322 280507 289898 301660 300136 291310 280768 268670 257904 243070
 276479 210906 192509 175251 157425 142731 129238 116873 104847  95955
 105443  80339  73843  64792  54285  43061  33779  28209  24814  39877]
Background histogram: [ 3522415   442939   432342   478438   556168   690235   854775  1101236
  1380980  1796195  2949983  3215541  3958137  4773535  5651613  6474871
  7395590  8539367  9662950 10955109 15049363 14045052 15639188 17171704
 17983661 17426655 15712248 13560081 11632766  9849378  9896538  6772709
  5629878  4660749  3874241  3170157  2626587  2190271  1768968  1436682
  1420595   975951   803459   659952   537615   436965   369839   334326
   344337   646572]


In [28]:
fig = go.Figure()
fig.add_trace(go.Bar(x=bin_edges[:-1], y=crater_hist, name="Crater Region", width=5))
fig.add_trace(go.Bar(x=bin_edges[:-1], y=background_hist, name="Background Region", width=5))

fig.update_layout(
    title="Pixel Intensity Distribution (Binned)",
    xaxis_title="Pixel Intensity",
    yaxis_title="Frequency",
    barmode="overlay"
)
fig.show()

## Texture analysis

1. HOG(Histogram of Oriented Gradients): Identifies edge directions and magnitudes
2. LBP(Local Binary Pattern): Encode the pattern