In [1]:
!pip install medmnist
!pip install torch

Collecting medmnist
  Downloading medmnist-3.0.2-py3-none-any.whl.metadata (14 kB)
Collecting fire (from medmnist)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->medmnist)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->medmnist)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->medmnist)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->medmnist)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolv

In [2]:
!pip install torchmetrics



In [3]:
import medmnist
from medmnist import INFO
import torch
from torchvision import transforms
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC           
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import joblib
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [4]:
# Set a fixed random seed for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

data_flag = 'pathmnist'
info = INFO[data_flag]
DataClass = getattr(medmnist, info['python_class'])

data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[.5], std=[.5])
])

train_data = DataClass(split='train', transform=data_transform, download=True)
val_data = DataClass(split='val', transform=data_transform, download=True)
test_data = DataClass(split='test', transform=data_transform, download=True)

Downloading https://zenodo.org/records/10519652/files/pathmnist.npz?download=1 to /root/.medmnist/pathmnist.npz


100%|██████████| 206M/206M [00:03<00:00, 67.5MB/s]


Using downloaded and verified file: /root/.medmnist/pathmnist.npz
Using downloaded and verified file: /root/.medmnist/pathmnist.npz


In [5]:
# check data properties
img = train_data[0][0]
label = train_data[0][1]

print(f"Image:\n {img}")
print(f"Label:\n {label}")

print(f"Image shape: {img.shape}")
print(f"Label: {label}")

Image:
 tensor([[[0.7255, 0.7176, 0.7255,  ..., 0.7255, 0.7176, 0.7333],
         [0.7098, 0.7255, 0.7176,  ..., 0.5451, 0.5059, 0.4902],
         [0.7255, 0.7255, 0.7176,  ..., 0.6314, 0.6235, 0.6392],
         ...,
         [0.7098, 0.7020, 0.7333,  ..., 0.7333, 0.7255, 0.7333],
         [0.6706, 0.7020, 0.7333,  ..., 0.7333, 0.7333, 0.7333],
         [0.6863, 0.7255, 0.7333,  ..., 0.7255, 0.7333, 0.7412]],

        [[0.6314, 0.6235, 0.6235,  ..., 0.6314, 0.6235, 0.6314],
         [0.6157, 0.6235, 0.6157,  ..., 0.3882, 0.3490, 0.3176],
         [0.6314, 0.6235, 0.6078,  ..., 0.4980, 0.5059, 0.5216],
         ...,
         [0.6078, 0.5765, 0.6314,  ..., 0.6314, 0.6314, 0.6392],
         [0.5059, 0.5686, 0.6314,  ..., 0.6314, 0.6392, 0.6314],
         [0.5294, 0.6235, 0.6314,  ..., 0.6314, 0.6314, 0.6392]],

        [[0.7804, 0.7804, 0.7804,  ..., 0.7804, 0.7804, 0.7804],
         [0.7725, 0.7725, 0.7725,  ..., 0.5843, 0.5451, 0.5294],
         [0.7725, 0.7725, 0.7647,  ..., 0.6706, 0.

In [6]:
# Number of image channels
n_channels = info['n_channels']
print(f"number of channels: {n_channels}")

# Number of classes
n_classes = len(info['label'])
print(f"number of classes: {n_classes}")

# Get the class names from the dataset
class_names = info['label']
print(f"class names: {class_names}")

number of channels: 3
number of classes: 9
class names: {'0': 'adipose', '1': 'background', '2': 'debris', '3': 'lymphocytes', '4': 'mucus', '5': 'smooth muscle', '6': 'normal colon mucosa', '7': 'cancer-associated stroma', '8': 'colorectal adenocarcinoma epithelium'}


In [8]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from skimage.exposure import equalize_hist
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix

# Preprocess: Flatten and normalize the images
def preprocess_images(data):
    x_data = []
    for img, label in data:
        img = img.numpy().flatten()  # Flatten and convert to numpy array
        x_data.append(img)
    return np.array(x_data)

# Preprocess the training and test data
x_train = preprocess_images(train_data)
y_train = np.array([label for _, label in train_data])
x_test = preprocess_images(test_data)
y_test = np.array([label for _, label in test_data])

# Ensure y_train and y_test are 1D arrays
y_train = y_train.ravel()  # Flatten the target to 1D array
y_test = y_test.ravel()    # Flatten the target to 1D array

x_train = x_train / 255.0
x_test = x_test / 255.0

# Feature Engineering: Add additional statistical features (mean, variance)
means_train = np.mean(x_train, axis=1)
variances_train = np.var(x_train, axis=1)
x_train_with_stats = np.column_stack((x_train, means_train, variances_train))

means_test = np.mean(x_test, axis=1)
variances_test = np.var(x_test, axis=1)
x_test_with_stats = np.column_stack((x_test, means_test, variances_test))

# Feature Selection: Apply PCA to reduce dimensionality
pca = PCA(n_components=100)  # Reduce to 100 components (tune this value as needed)
x_train_pca = pca.fit_transform(x_train_with_stats)
x_test_pca = pca.transform(x_test_with_stats)

# Feature Selection: Select top 50 features using SelectKBest with ANOVA F-test (f_classif)
selector = SelectKBest(f_classif, k=50)
x_train_selected = selector.fit_transform(x_train_pca, y_train)
x_test_selected = selector.transform(x_test_pca)

In [None]:
from sklearn.svm import SVC            
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

svm_pipe = Pipeline([
    ('scale', StandardScaler()),       
    ('svm',   SVC(kernel='rbf',          
                  class_weight='balanced',   
                  probability=False,          
                  random_state=42))
])

# Hyper-parameter grid for a quick search         
param_grid = {'svm__C':  [0.1, 1, 10],
              'svm__gamma': ['scale', 0.01, 0.001]}    # only for RBF

cv = StratifiedKFold(5, shuffle=True, random_state=42)
grid = GridSearchCV(svm_pipe,
                    param_grid,
                    cv=cv,
                    scoring='accuracy',
                    n_jobs=-1,
                    verbose=1)

grid.fit(x_train_selected, y_train)

print(f"Best CV accuracy: {grid.best_score_:.3f} "
      f"with params: {grid.best_params_}")

y_pred = grid.predict(x_test_selected)

print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=4))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

joblib.dump(grid.best_estimator_, 'svm.pkl')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best CV accuracy: 0.667 with params: {'svm__C': 10, 'svm__gamma': 0.01}

Classification report:
              precision    recall  f1-score   support

           0     0.8273    0.8879    0.8565      1338
           1     0.7572    0.9976    0.8609       847
           2     0.2896    0.7758    0.4218       339
           3     0.8679    0.6009    0.7102       634
           4     0.7627    0.5807    0.6594      1035
           5     0.6196    0.3851    0.4750       592
           6     0.5712    0.4494    0.5030       741
           7     0.4869    0.4418    0.4633       421
           8     0.6534    0.6148    0.6335      1233

    accuracy                         0.6662      7180
   macro avg     0.6484    0.6371    0.6204      7180
weighted avg     0.6945    0.6662    0.6659      7180

Confusion matrix:
[[1188    8   32    0   53   37   18    0    2]
 [   1  845    0    1    0    0    0    0    0]
 [   6    0  263    3    

['best_decision_tree_model.pkl']