In [1]:
#import libraries
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision.transforms import transforms

In [2]:
#import label dataset
dataset = pd.read_csv('train-metadata.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,isic_id,patient_id,target
0,0,ISIC_0000000,dummy_0,0
1,1,ISIC_0000001,dummy_1,0
2,2,ISIC_0000002,dummy_2,1
3,3,ISIC_0000003,dummy_3,0
4,4,ISIC_0000004,dummy_4,1


In [3]:
dataset.info() #we need isic_id and target columns for the training purpose

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25330 entries, 0 to 25329
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25330 non-null  int64 
 1   isic_id     25330 non-null  object
 2   patient_id  25330 non-null  object
 3   target      25330 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 791.7+ KB


In [4]:
dataset['target'].value_counts() #to check the number of samples in each class

target
0    20808
1     4522
Name: count, dtype: int64

**Decsion**

There are 25330 images in the image data set. But we can see class imbalance in the data set. I suppose to do downsampling techniue to reduce the images having label 0. If not, the model will be  
- Poor Generalization  
The model learns patterns mainly from the majority class and struggles to recognize minority classes.

- High Accuracy but Poor Performance  
If 90% of the data is one class, the model can predict that class all the time and still get 90% accuracy, but it fails in real scenarios.

- Misleading Loss & Metrics  
Standard loss functions like CrossEntropyLoss assume balanced data, leading to incorrect optimization.  
Metrics like accuracy become unreliable.




In [5]:
#create a balanced dataset
df_class_1 = dataset[dataset['target'] == 1].sort_values('isic_id').head(100)
df_class_0 = dataset[dataset['target'] == 0].sort_values('isic_id').head(500)
balanced_df = pd.concat([df_class_1, df_class_0])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True).sort_values('isic_id')
balanced_df.head()

Unnamed: 0.1,Unnamed: 0,isic_id,patient_id,target
484,0,ISIC_0000000,dummy_0,0
52,1,ISIC_0000001,dummy_1,0
149,2,ISIC_0000002,dummy_2,1
599,3,ISIC_0000003,dummy_3,0
543,4,ISIC_0000004,dummy_4,1


In [6]:
#create blanced image directory
import os
import shutil
source_folder = r'D:\Self Study\3 Cancer Image Classification\image'  
destination_folder = 'train_images'
os.makedirs(destination_folder, exist_ok=True)
for isic_id in balanced_df['isic_id']:
    source_file = os.path.join(source_folder, f"{isic_id}.jpg")  
    destination_file = os.path.join(destination_folder, f"{isic_id}.jpg")
    if os.path.exists(source_file):  
        shutil.copy(source_file, destination_file)
    else:
        print(f"Image {isic_id}.jpg not found in source folder.")

print("Images have been successfully copied to the 'train_images' folder.")


Images have been successfully copied to the 'train_images' folder.


In [7]:
#save dataframe to csv
balanced_df.to_csv('balanced_dataset.csv',index=False)