In [None]:
# DATASET INSPECTION
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os

# Locations of the HAM10000 metadata CSV and the two image folders.
metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
image_dir_1 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_1'
image_dir_2 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_2'

# Read the metadata table into a DataFrame.
metadata = pd.read_csv(metadata_path)

# Preview the first rows of the table.
print("##### Inspect the first few rows #####")
preview_rows = metadata.head()
print(preview_rows)
print("\n")

# Per-column dtypes and per-column counts of missing values, printed one
# after the other under their own headings.
data_types = metadata.dtypes
missing_values = metadata.isna().sum()
for heading, summary in (
    ("##### Data Types ##### \n", data_types),
    ("##### Missing values ##### \n", missing_values),
):
    print(heading, summary)
    print("\n")

# Bar chart of how many rows carry each value of the 'dx' column.
print("##### Displaying Distribution of Diseases (Lesion) #####")
ax = sns.countplot(x='dx', data=metadata)
ax.set_title("Distribution of Skin Diseases in HAM10000 Dataset")
plt.xticks(rotation=45)
plt.show()

# Heading for the sample-image display that follows.
print("##### Displaying sample images #####")
def display_sample_images(image_dir, num_samples=5):
    """Show up to ``num_samples`` images from ``image_dir`` in a single row.

    Args:
        image_dir: Directory to read image files from.
        num_samples: Maximum number of images to display (default 5).

    Files are taken in sorted filename order so the same images are shown on
    every run (``os.listdir`` returns entries in arbitrary order). Only files
    with a common image extension are considered, so stray non-image files in
    the folder do not make ``Image.open`` fail. If nothing is left to show, a
    message is printed and no figure is created.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
    sample_files = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith(image_extensions)
    )[:max(num_samples, 0)]

    if not sample_files:
        # Nothing to draw: skip creating an empty figure.
        print(f"No images to display in: {image_dir}")
        return

    plt.figure(figsize=(10, 5))
    for i, filename in enumerate(sample_files):
        image_path = os.path.join(image_dir, filename)
        # The context manager closes the underlying file once the image has
        # been handed to matplotlib, instead of leaving the handle open.
        with Image.open(image_path) as image:
            # Use the number of files actually found for the column count so
            # a short folder does not leave blank panels.
            plt.subplot(1, len(sample_files), i + 1)
            plt.imshow(image)
            plt.axis('off')
    plt.show()

# Show a row of sample images from each image folder in turn.
for image_dir in (image_dir_1, image_dir_2):
    display_sample_images(image_dir)
print("\n")

# Summary statistics for every column, regardless of dtype.
print("##### Statistics of the dataset #####")
summary_stats = metadata.describe(include='all')
print(summary_stats)
print("\n")


In [None]:
# DATASET MANIPULATION
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os

# Locations of the HAM10000 metadata CSV and the two image folders.
metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
image_dir_1 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_1'
image_dir_2 = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_images_part_2'

# Read the metadata table into a DataFrame.
metadata = pd.read_csv(metadata_path)

# Report how many values are missing in each column.
print("##### Missing values #####")
null_counts = metadata.isna().sum()
print(null_counts)
print("\n")

# Partition the column labels by dtype: numeric vs. everything else.
numeric_columns = metadata.select_dtypes(include='number').columns
non_numeric_columns = metadata.select_dtypes(exclude='number').columns

print("Numeric columns:", numeric_columns)
print("Non-numeric columns:", non_numeric_columns)
print("\n")

# Fill missing values for numeric columns with the column mean.
# The filled Series is assigned back to the frame instead of calling
# ``fillna(..., inplace=True)`` on ``metadata[column]``: that chained form
# operates on an intermediate object, emits a FutureWarning on pandas 2.x and
# leaves ``metadata`` unchanged once Copy-on-Write is active.
print("##### Filling missing values for numeric columns #####")
for column in numeric_columns:
    if metadata[column].isnull().any():
        metadata[column] = metadata[column].fillna(metadata[column].mean())
        print(f"Missing values in '{column}' column filled with mean.")
    else:
        print(f"No missing values in '{column}' column.")
print("\n")

# Fill missing values for non-numeric columns with the most frequent value.
print("##### Filling missing values for non-numeric columns #####")
for column in non_numeric_columns:
    if metadata[column].isnull().any():
        modes = metadata[column].mode()
        if modes.empty:
            # Every value in the column is missing, so there is no mode to
            # fill with; indexing ``mode()[0]`` here would raise.
            print(f"Column '{column}' has no non-missing values; left unchanged.")
            continue
        metadata[column] = metadata[column].fillna(modes.iloc[0])
        print(f"Missing values in '{column}' column filled with mode.")
    else:
        print(f"No missing values in '{column}' column.")
# Show the per-column missing counts again to confirm the result of the fill.
print(metadata.isnull().sum())
print("\n")

# Save the updated metadata.
# NOTE: metadata_path is the same file the metadata was loaded from, so this
# overwrites the source CSV in place.
metadata.to_csv(metadata_path, index=False)
print("Filled missing values saved to metadata.csv")
print("\n")

# Count fully duplicated rows, then drop them from the table.
print("##### Checking duplicates #####")
duplicate_row_count = metadata.duplicated().sum()
print(f"Duplicate rows in metadata: {duplicate_row_count}")
metadata = metadata.drop_duplicates()
print("Duplicate rows removed.")
print("\n")

# Report any image_id that appears more than once (second and later
# occurrences are the ones flagged by ``duplicated()``).
repeated_id_mask = metadata['image_id'].duplicated()
duplicate_image_ids = metadata.loc[repeated_id_mask, 'image_id']
if duplicate_image_ids.empty:
    print("No duplicate image IDs found.")
else:
    print(f"Duplicate image IDs found: {duplicate_image_ids.tolist()}")

# Write the de-duplicated table back to metadata_path.
metadata.to_csv(metadata_path, index=False)
print("Cleaned metadata saved to metadata.csv")
print("\n")

# Confirm that every image_id in the metadata has a matching "<image_id>.jpg"
# file in one of the two image folders.
print("##### Verifying Missing images #####")
image_ids = set(metadata['image_id'])
all_image_files = set(os.listdir(image_dir_1)).union(os.listdir(image_dir_2))

missing_images = []
for img_id in image_ids:
    if f"{img_id}.jpg" not in all_image_files:
        missing_images.append(img_id)

print(f"Missing images: {missing_images}" if missing_images else "All images are accounted for.")
print("\n")


In [None]:
# HANDLING UNKNOWN VALUES IN "SEX" AND "LOCALIZATION" COLUMNS
import pandas as pd
# Load dataset
metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
metadata = pd.read_csv(metadata_path)

# Replace the 'unknown' placeholder in the 'sex' and 'localization' columns
# with each column's most frequent known value.
for column in ('sex', 'localization'):
    is_unknown = metadata[column] == 'unknown'
    if not is_unknown.any():
        continue

    # Compute the mode over the known values only; if 'unknown' itself were
    # the most frequent entry, replacing it with the overall mode would be a
    # no-op.
    known_modes = metadata.loc[~is_unknown, column].mode()
    if known_modes.empty:
        print(f"'{column}' has no known values; 'unknown' left unchanged.")
        continue

    replacement = known_modes.iloc[0]
    # Assign through .loc on the frame itself. Calling
    # ``metadata[column].replace(..., inplace=True)`` acts on an intermediate
    # object and does not update ``metadata`` under Copy-on-Write.
    metadata.loc[is_unknown, column] = replacement
    print(f"'unknown' in '{column}' replaced with mode: {replacement}")

# Save the cleaned metadata (same file the data was loaded from, so the CSV
# is overwritten in place).
cleaned_metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
metadata.to_csv(cleaned_metadata_path, index=False)

print(f"Cleaned metadata saved to: {cleaned_metadata_path}")

In [None]:
# CHECKING UNKNOWN VALUES IN "SEX" AND "LOCALIZATION" COLUMNS
import pandas as pd
# Re-read the metadata CSV from disk.
metadata_path = r'D:\skin_disease_detection\backend\data\Ham10000\HAM10000_metadata.csv'
metadata = pd.read_csv(metadata_path)

# Report, for each column, whether the 'unknown' placeholder is still present.
for column in ('sex', 'localization'):
    verdict = "exists" if 'unknown' in metadata[column].values else "does not exist"
    print(f"'unknown' {verdict} in '{column}' column.")