# Project Name -----> Skin Cancer Detection

In [3]:
# importing requred libararies 
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split

In [4]:
# Load the labels
skin_data = pd.read_csv('skin_cancer_labels.csv') 
skin_data  =  skin_data.copy()


In [5]:
skin_data.head()

Unnamed: 0,isic_id,attribution,copyright_license,age_approx,anatom_site_general,anatom_site_special,clin_size_long_diam_mm,concomitant_biopsy,dermoscopic_type,diagnosis_1,...,diagnosis_confirm_type,family_hx_mm,image_type,lesion_id,melanocytic,patient_id,personal_hx_mm,pixels_x,pixels_y,sex
0,ISIC_0000000,Anonymous,CC-0,55.0,anterior torso,,,False,,Benign,...,,,dermoscopic,,True,,,1022,767,female
1,ISIC_0000001,Anonymous,CC-0,30.0,anterior torso,,,False,,Benign,...,,,dermoscopic,,True,,,1022,767,female
2,ISIC_0000002,Anonymous,CC-0,60.0,upper extremity,,,True,,Malignant,...,histopathology,,dermoscopic,,True,,,1022,767,female
3,ISIC_0000003,Anonymous,CC-0,30.0,upper extremity,,,False,,Benign,...,,,dermoscopic,,True,,,1022,767,male
4,ISIC_0000004,Anonymous,CC-0,80.0,posterior torso,,,True,,Malignant,...,histopathology,,dermoscopic,,True,,,1022,767,male


In [6]:
skin_data.isnull().sum()

isic_id                      0
attribution                  0
copyright_license            0
age_approx                 256
anatom_site_general        471
anatom_site_special       1961
clin_size_long_diam_mm    1538
concomitant_biopsy           0
dermoscopic_type          1190
diagnosis_1                  0
diagnosis_2                  0
diagnosis_3                  1
diagnosis_4               1354
diagnosis_5               1752
diagnosis_confirm_type     342
family_hx_mm              1548
image_type                   0
lesion_id                 1246
melanocytic                  0
patient_id                1863
personal_hx_mm            1539
pixels_x                     0
pixels_y                     0
sex                        229
dtype: int64

In [7]:
skin_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   isic_id                 2000 non-null   object 
 1   attribution             2000 non-null   object 
 2   copyright_license       2000 non-null   object 
 3   age_approx              1744 non-null   float64
 4   anatom_site_general     1529 non-null   object 
 5   anatom_site_special     39 non-null     object 
 6   clin_size_long_diam_mm  462 non-null    float64
 7   concomitant_biopsy      2000 non-null   bool   
 8   dermoscopic_type        810 non-null    object 
 9   diagnosis_1             2000 non-null   object 
 10  diagnosis_2             2000 non-null   object 
 11  diagnosis_3             1999 non-null   object 
 12  diagnosis_4             646 non-null    object 
 13  diagnosis_5             248 non-null    object 
 14  diagnosis_confirm_type  1658 non-null   

In [8]:
skin_data.columns.to_list()

['isic_id',
 'attribution',
 'copyright_license',
 'age_approx',
 'anatom_site_general',
 'anatom_site_special',
 'clin_size_long_diam_mm',
 'concomitant_biopsy',
 'dermoscopic_type',
 'diagnosis_1',
 'diagnosis_2',
 'diagnosis_3',
 'diagnosis_4',
 'diagnosis_5',
 'diagnosis_confirm_type',
 'family_hx_mm',
 'image_type',
 'lesion_id',
 'melanocytic',
 'patient_id',
 'personal_hx_mm',
 'pixels_x',
 'pixels_y',
 'sex']

In [9]:
skin_data.describe()

Unnamed: 0,age_approx,clin_size_long_diam_mm,pixels_x,pixels_y
count,1744.0,462.0,2000.0,2000.0
mean,49.03383,6.030952,2734.293,1877.673
std,19.731372,5.512854,1682.314035,1081.419839
min,5.0,1.0,576.0,540.0
25%,35.0,4.0,1024.0,768.0
50%,50.0,5.0,3008.0,2000.0
75%,65.0,7.0,4288.0,2848.0
max,85.0,100.0,6748.0,4499.0


In [10]:
skin_data.sex

0       female
1       female
2       female
3         male
4         male
         ...  
1995      male
1996      male
1997    female
1998      male
1999      male
Name: sex, Length: 2000, dtype: object

In [11]:
# shape of data
skin_data.shape

(2000, 24)

In [12]:
# Checking  images folder
image_folder = 'skin_cancer_data'
image_files = os.listdir(image_folder)
print(f"Number of images: {len(image_files)}")
print("First 5 image names:")
for i in range(5):
    print(image_files[i])

Number of images: 2000
First 5 image names:
ISIC_0000000.jpg
ISIC_0000001.jpg
ISIC_0000002.jpg
ISIC_0000003.jpg
ISIC_0000004.jpg


In [13]:
# Check how many benign vs malignant cases you have
print("Diagnosis distribution:")
print(skin_data['diagnosis_1'].value_counts())
print("\nPercentages:")
print(skin_data['diagnosis_1'].value_counts(normalize=True) * 100)

Diagnosis distribution:
diagnosis_1
Benign           1625
Malignant         374
Indeterminate       1
Name: count, dtype: int64

Percentages:
diagnosis_1
Benign           81.25
Malignant        18.70
Indeterminate     0.05
Name: proportion, dtype: float64


In [14]:
image_folder = 'skin_cancer_data'
image_files = os.listdir(image_folder)
print(f"Number of images: {len(image_files)}")
print("Sample image names:")
print(image_files[:3])

Number of images: 2000
Sample image names:
['ISIC_0000000.jpg', 'ISIC_0000001.jpg', 'ISIC_0000002.jpg']


In [15]:
skin_data.columns   

Index(['isic_id', 'attribution', 'copyright_license', 'age_approx',
       'anatom_site_general', 'anatom_site_special', 'clin_size_long_diam_mm',
       'concomitant_biopsy', 'dermoscopic_type', 'diagnosis_1', 'diagnosis_2',
       'diagnosis_3', 'diagnosis_4', 'diagnosis_5', 'diagnosis_confirm_type',
       'family_hx_mm', 'image_type', 'lesion_id', 'melanocytic', 'patient_id',
       'personal_hx_mm', 'pixels_x', 'pixels_y', 'sex'],
      dtype='object')

In [16]:
# removing unwanted columns 
skin_data.drop(['attribution','copyright_license','diagnosis_2','diagnosis_3','diagnosis_4',
                'diagnosis_5','pixels_x','pixels_y','patient_id','lesion_id','anatom_site_general',
                'anatom_site_special', 'concomitant_biopsy', 'dermoscopic_type',
                'diagnosis_confirm_type','family_hx_mm','image_type','personal_hx_mm','melanocytic'], axis=1 , inplace=True)

In [17]:
skin_data.head() #'isic_id', 'diagnosis_1', 'clin_size_long_diam_mm', 'age_approx'

Unnamed: 0,isic_id,age_approx,clin_size_long_diam_mm,diagnosis_1,sex
0,ISIC_0000000,55.0,,Benign,female
1,ISIC_0000001,30.0,,Benign,female
2,ISIC_0000002,60.0,,Malignant,female
3,ISIC_0000003,30.0,,Benign,male
4,ISIC_0000004,80.0,,Malignant,male


In [18]:
skin_data.isnull().sum()

isic_id                      0
age_approx                 256
clin_size_long_diam_mm    1538
diagnosis_1                  0
sex                        229
dtype: int64

In [None]:
# Removig the 1 indeterminate case another method is drop 

skin_data_clean = skin_data[skin_data['diagnosis_1'] != 'Indeterminate']

# balancing the dataset 
malignant = skin_data_clean[skin_data_clean['diagnosis_1']=='Malignant']
bengin = skin_data_clean[skin_data_clean['diagnosis_1']=='Benign'].sample(374 , random_state=42) # 374 cases
skin_balanced = pd.concat([malignant ,bengin])



In [20]:
skin_balanced.value_counts()

isic_id       age_approx  clin_size_long_diam_mm  diagnosis_1  sex   
ISIC_0009868  70.0        4.0                     Malignant    female    1
ISIC_0009871  45.0        5.0                     Benign       male      1
ISIC_0009873  40.0        10.0                    Benign       female    1
ISIC_0009882  60.0        5.0                     Malignant    female    1
ISIC_0009896  30.0        9.0                     Benign       male      1
                                                                        ..
ISIC_0011348  75.0        6.0                     Malignant    male      1
ISIC_0011349  65.0        12.0                    Malignant    male      1
ISIC_0011366  65.0        8.0                     Malignant    male      1
ISIC_0011387  65.0        8.0                     Malignant    female    1
ISIC_0011393  70.0        2.0                     Benign       female    1
Name: count, Length: 173, dtype: int64