In [None]:
# DATASET INSPECTION

import pandas as pd              # For metadata inspection and manipulation
import matplotlib.pyplot as plt  # For visualizing data distribution and sample images
import seaborn as sns            # For visualizing class distribution
from PIL import Image            # For loading and inspecting images (or use OpenCV if you prefer)
import os                        # For navigating the file structure

# Read metadata from dataset
metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
metadata = pd.read_csv(metadata_path)

# Inspect the first few rows
print("##### Inspect the first few rows #####")
print(metadata.head())
print("\n")

# Check for missing values (count of nulls per column)
print("##### Missing values #####")
print(metadata.isnull().sum())
print("\n")

# Fill missing ages with the mean age, then re-print the null counts.
# The filled column is assigned back to the DataFrame rather than calling
# `fillna(..., inplace=True)` on `metadata['age']`: that chained in-place call
# acts on an intermediate Series, triggers a FutureWarning in pandas 2.x, and
# leaves `metadata` unchanged once Copy-on-Write is enabled.
print("##### Filling missing values #####")
metadata['age'] = metadata['age'].fillna(metadata['age'].mean())
print(metadata.isnull().sum())
print("\n")

# Integrity checks: duplicated metadata rows and images missing from disk.
# The two folders below hold the image files (part 1 and part 2).
image_dir_1 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_1'
image_dir_2 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_2'
print("##### Checking duplicates #####")

# Number of fully duplicated rows in the metadata table
print(f"Duplicates in metadata: {metadata.duplicated().sum()}")

# Every image_id in the metadata should have a matching "<image_id>.jpg"
# in one of the two image folders.
image_ids = set(metadata['image_id'])
all_image_files = set(os.listdir(image_dir_1)).union(os.listdir(image_dir_2))

# Collect the ids whose .jpg file is absent from both folders
missing_images = [
    image_id
    for image_id in image_ids
    if f"{image_id}.jpg" not in all_image_files
]
print(f"Missing images: {missing_images}")
print("\n")

# Summary statistics of the metadata table
print("##### statistics of the dataset #####")
summary_stats = metadata.describe()
print(summary_stats)
print("\n")

# Count plot of the 'dx' column, categories in seaborn's default order
print("##### Displayed distribution of diseases #####")
sns.countplot(x='dx', data=metadata)
plt.xticks(rotation=45)
plt.title("Distribution of Skin Diseases in HAM10000 Dataset")
plt.show()

# Same count plot, but with the bars arranged in the order returned by
# value_counts() on the 'dx' column
print("##### Visualize the distribution of lesion types (dx) #####")
dx_by_frequency = metadata['dx'].value_counts().index
sns.countplot(x='dx', data=metadata, order=dx_by_frequency)
plt.xticks(rotation=45)
plt.title('Distribution of Lesion Types')
plt.show()


# Image Inspection
print("##### Displayed sample images #####")

def display_sample_images(image_dir, num_samples=5):
    """Show the first ``num_samples`` files of ``image_dir`` in a single row.

    The files are taken in the order returned by ``os.listdir`` and each one
    is opened as an image and drawn in its own subplot with the axes hidden.

    Args:
        image_dir: Path of the directory containing the image files.
        num_samples: Number of subplot columns, and the maximum number of
            files that are displayed.
    """
    sample_files = os.listdir(image_dir)[:num_samples]
    plt.figure(figsize=(10, 5))
    for i, filename in enumerate(sample_files):
        image_path = os.path.join(image_dir, filename)
        # Open through a context manager so the underlying file handle is
        # released as soon as the image has been handed to matplotlib,
        # instead of lingering until garbage collection.
        with Image.open(image_path) as image:
            plt.subplot(1, num_samples, i + 1)
            plt.imshow(image)
            plt.axis('off')
    plt.show()
# Preview sample images from each of the two image folders in turn
for sample_dir in (image_dir_1, image_dir_2):
    display_sample_images(sample_dir)
print("\n")

