In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
import kagglehub
import os

# ISIC Data

In [None]:
# import datasets
ISIC = kagglehub.dataset_download("nodoubttome/skin-cancer9-classesisic")
print(ISIC)


Using Colab cache for faster access to the 'skin-cancer9-classesisic' dataset.
/kaggle/input/skin-cancer9-classesisic


In [None]:
# Load dataset into pandas DataFrame
# This skin cancer dataset typically contains CSV files with metadata and image paths

# Look for CSV files
csv_files = []
for root, dirs, files in os.walk(ISIC):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")

# Load the main dataset CSV (usually contains metadata)
if csv_files:
    # Load the first CSV file found
    df = pd.read_csv(csv_files[0])
    print(f"\nLoaded dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())
else:
    print("No CSV files found. This might be an image-only dataset.")
    # In this case, you might need to create a DataFrame from image filenames
    image_files = []
    for root, dirs, files in os.walk(ISIC):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_files.append(os.path.join(root, file))

    if image_files:
        df = pd.DataFrame({'image_path': image_files[:100]})  # Sample first 100 images
        print(f"Created DataFrame from image files: {df.shape}")
        print(df.head())

Found CSV files:
No CSV files found. This might be an image-only dataset.
Created DataFrame from image files: (100, 1)
                                          image_path
0  /kaggle/input/skin-cancer9-classesisic/Skin ca...
1  /kaggle/input/skin-cancer9-classesisic/Skin ca...
2  /kaggle/input/skin-cancer9-classesisic/Skin ca...
3  /kaggle/input/skin-cancer9-classesisic/Skin ca...
4  /kaggle/input/skin-cancer9-classesisic/Skin ca...


In [21]:
# If you have a specific CSV file you want to load, you can do:
# df = pd.read_csv(os.path.join(ISIC, 'specific_file.csv'))

# For image classification datasets, you might also want to:
# 1. Extract class labels from folder names or file paths
# 2. Create image loading functions
# 3. Prepare data for machine learning models

# Example: If images are organized in class folders
def create_image_dataframe(dataset_path):
    """Create a DataFrame with image paths and labels from folder structure"""
    data = []

    for root, dirs, files in os.walk(dataset_path):
        # Skip the root directory
        if root == dataset_path:
            continue

        # Get class name from folder name
        class_name = os.path.basename(root)

        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                full_path = os.path.join(root, file)
                data.append({
                    'image_path': full_path,
                    'class': class_name,
                    'filename': file
                })

    return pd.DataFrame(data)

# Try to create DataFrame from folder structure
try:
    image_df = create_image_dataframe(ISIC)
    if not image_df.empty:
        pass
        # print(f"Created image DataFrame: {image_df.shape}")
        # print(f"Classes found: {image_df['class'].unique()}")
        # print("\nSample data:")
        # print(image_df.head())

        # # Show class distribution
        # print("\nClass distribution:")
        # print(image_df['class'].value_counts())
except Exception as e:
    print(f"Could not create image DataFrame: {e}")

In [None]:
X = image_df['image_path']
y = image_df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
"""
num_random_train_samples = 300  # Adjust this number as needed
X_train = X_train.sample(n=num_random_train_samples, random_state=42)
# Get the corresponding labels for the sampled images
y_train = y_train.loc[X_train.index]

num_random_test_samples = 100  # Adjust this number as needed
X_test = X_test.sample(n=num_random_test_samples, random_state=42)
# Get the corresponding labels for the sampled images
y_test = y_test.loc[X_test.index]
"""


Training set size: 1649
Testing set size: 708


'\nnum_random_train_samples = 300  # Adjust this number as needed\nX_train = X_train.sample(n=num_random_train_samples, random_state=42)\n# Get the corresponding labels for the sampled images\ny_train = y_train.loc[X_train.index]\n\nnum_random_test_samples = 100  # Adjust this number as needed\nX_test = X_test.sample(n=num_random_test_samples, random_state=42)\n# Get the corresponding labels for the sampled images\ny_test = y_test.loc[X_test.index]\n'

In [None]:
# Define the target size for resizing images
target_size = (128, 128)

def load_and_flatten_image(image_path, target_size):
    """Loads an image, resizes it, and flattens it into a 1D array."""
    try:
        img = Image.open(image_path).convert('RGB') # Ensure image is in RGB format
        img = img.resize(target_size)
        img_array = np.array(img)
        # Flatten the array: (height, width, channels) -> (height * width * channels,)
        flattened_array = img_array.flatten()
        return flattened_array
    except Exception as e:
        print(f"Error loading or processing image {image_path}: {e}")
        return None # Return None if there's an error

# Apply the function to your training and testing image paths
# This might take some time depending on the number of images
print("Processing training images...")
X_train_processed = np.array([load_and_flatten_image(path, target_size) for path in X_train if load_and_flatten_image(path, target_size) is not None])

print("Processing testing images...")
X_test_processed = np.array([load_and_flatten_image(path, target_size) for path in X_test if load_and_flatten_image(path, target_size) is not None])


Processing training images...
Processing testing images...


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('mlp', MLPClassifier(hidden_layer_sizes=(100,), activation='relu',
solver='adam', max_iter=20000))
])
#Fit
pipeline.fit(X_train_processed, y_train)
#Get predictions
y_pred = pipeline.predict(X_test_processed)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming y_test contains the true labels for the test set
# and y_pred contains the predictions from your model on the test set

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Generate a classification report for more detailed metrics (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.4308

Classification Report:
                            precision    recall  f1-score   support

         actinic keratosis       0.03      0.03      0.03        39
      basal cell carcinoma       0.48      0.61      0.54       118
            dermatofibroma       0.31      0.33      0.32        33
                  melanoma       0.48      0.41      0.44       136
                     nevus       0.50      0.37      0.42       112
pigmented benign keratosis       0.54      0.57      0.55       144
      seborrheic keratosis       0.10      0.12      0.11        24
   squamous cell carcinoma       0.37      0.32      0.35        59
           vascular lesion       0.35      0.47      0.40        43

                  accuracy                           0.43       708
                 macro avg       0.35      0.36      0.35       708
              weighted avg       0.43      0.43      0.43       708



# SIIM ISIC Data

In [18]:
SIIM_ISIC = kagglehub.dataset_download("rishidamarla/cancer-patients-data")

Using Colab cache for faster access to the 'cancer-patients-data' dataset.


In [19]:
# Explore the downloaded dataset structure
print("Dataset location:", SIIM_ISIC)
print("\nFiles and folders in the dataset:")
for root, dirs, files in os.walk(SIIM_ISIC):
    level = root.replace(SIIM_ISIC, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files in each directory
        print(f"{subindent}{file}")
    if len(files) > 5:
        print(f"{subindent}... and {len(files)-5} more files")

Dataset location: /kaggle/input/cancer-patients-data

Files and folders in the dataset:
cancer-patients-data/
  cancer patient data sets.xlsx


In [20]:
csv_files = []
for root, dirs, files in os.walk(SIIM_ISIC):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")

# Load the main dataset CSV (usually contains metadata)
if csv_files:
    # Load the first CSV file found
    df = pd.read_csv(csv_files[0])
    print(f"\nLoaded dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())
else:
    print("No CSV files found. This might be an image-only dataset.")
    # In this case, you might need to create a DataFrame from image filenames
    image_files = []
    for root, dirs, files in os.walk(ISIC):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_files.append(os.path.join(root, file))

    if image_files:
        df = pd.DataFrame({'image_path': image_files[:100]})  # Sample first 100 images
        print(f"Created DataFrame from image files: {df.shape}")
        print(df.head())

Found CSV files:
No CSV files found. This might be an image-only dataset.
Created DataFrame from image files: (100, 1)
                                          image_path
0  /kaggle/input/skin-cancer9-classesisic/Skin ca...
1  /kaggle/input/skin-cancer9-classesisic/Skin ca...
2  /kaggle/input/skin-cancer9-classesisic/Skin ca...
3  /kaggle/input/skin-cancer9-classesisic/Skin ca...
4  /kaggle/input/skin-cancer9-classesisic/Skin ca...
