In [None]:
# Prints a fixed string; nothing later in the notebook reads this output.
# NOTE(review): looks like a leftover kernel smoke test — confirm and delete.
print("hi")

hi


In [None]:
import torch
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


In [None]:
# --- Cell 1: Kaggle Dataset Download & Extraction ---
!pip install -q kaggle

from google.colab import files
import os, zipfile

# Upload your kaggle.json (do this only once)
# The credential is installed from the bytes returned by files.upload() rather
# than by moving a file called "kaggle.json": the uploaded file may be saved
# under a different name (e.g. "kaggle (1).json"), in which case the previous
# `mv kaggle.json ...` found nothing and the download below failed to authenticate.
kaggle_dir = '/root/.kaggle'
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
if not os.path.exists(kaggle_json):
    print("Please upload your kaggle.json file.")
    uploaded = files.upload()  # mapping: uploaded file name -> file content (bytes)
    if not uploaded:
        raise RuntimeError("No file was uploaded; kaggle.json is required to download the dataset.")
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    os.makedirs(kaggle_dir, exist_ok=True)
    with open(kaggle_json, 'wb') as fh:
        fh.write(uploaded_bytes)
    # Owner read/write only, same permissions the original `chmod 600` applied.
    os.chmod(kaggle_json, 0o600)
    # The upload also leaves a copy in the working directory; remove it so the
    # API token does not linger there (the original `mv` had the same effect).
    if os.path.exists(uploaded_name):
        os.remove(uploaded_name)

# Download dataset
# Uses the kaggle CLI (and the kaggle.json credentials installed earlier in
# this cell) to fetch the dataset archive into /content/.
!kaggle datasets download -d revatisn/cancer-benign-and-malignant -p /content/

# Unzip dataset
zip_path = '/content/cancer-benign-and-malignant.zip'
extract_dir = '/content/Data'
# Extraction is skipped whenever extract_dir already exists, so re-running the
# cell does not unpack the archive a second time.
# NOTE(review): a partially extracted directory also counts as "exists";
# delete extract_dir to force a clean re-extraction.
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

# Printed unconditionally, i.e. also when the extraction above was skipped.
print("✅ Dataset extracted to:", extract_dir)
# List directories up to two levels deep to show the extracted layout.
!find "$extract_dir" -maxdepth 2 -type d


Please upload your kaggle.json file.


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/revatisn/cancer-benign-and-malignant
License(s): MIT
Downloading cancer-benign-and-malignant.zip to /content
100% 5.21G/5.23G [03:08<00:00, 211MB/s]
100% 5.23G/5.23G [03:09<00:00, 29.7MB/s]
✅ Dataset extracted to: /content/Data
/content/Data
/content/Data/Data
/content/Data/Data/Malignant
/content/Data/Data/Benign


In [None]:
# --- Cell 2: Robust Dataset Path Detection ---
import os

# Try all possible root names
# Candidate locations for the extracted dataset, checked in order.
# Every entry needs its own trailing comma: without one, Python joins adjacent
# string literals into a single (non-existent) path.
possible_roots = [
    "/content/Data",
    "/content/data",
    "/content/cancer-benign-and-malignant",
    "/content/cancer-benign-and-malignant/Data",
]

def _has_class_dirs(path):
    """Return True if `path` directly contains both class folders.

    A match requires at least one sub-directory whose name contains "benign"
    and at least one whose name contains "malignant" (case-insensitive).
    Plain files are ignored, so stray files with such names cannot make a
    directory look like the dataset root.
    """
    dir_names = [
        entry.lower()
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]
    return (
        any("benign" in name for name in dir_names)
        and any("malignant" in name for name in dir_names)
    )


def _find_data_root(candidates):
    """Return the first directory that holds the class folders, or None.

    Each candidate is checked itself and then one level down, because the
    archive may unpack into an extra wrapper folder. Candidates that do not
    exist or are not directories are skipped.
    """
    for root in candidates:
        if not os.path.isdir(root):
            continue
        if _has_class_dirs(root):
            return root
        # sometimes the dataset has one more subfolder inside
        for entry in sorted(os.listdir(root)):
            inner = os.path.join(root, entry)
            if os.path.isdir(inner) and _has_class_dirs(inner):
                return inner
    return None


DATA_ROOT = _find_data_root(possible_roots)

if DATA_ROOT is None:
    raise RuntimeError("Could not locate dataset. Please verify extraction path.")

# Report which directory was selected as the dataset root.
print("✅ Using dataset root:", DATA_ROOT)
# List directories up to two levels below the root (class folders and their
# immediate sub-folders) as a visual check of the layout.
!find "$DATA_ROOT" -maxdepth 2 -type d


✅ Using dataset root: /content/Data/Data
/content/Data/Data
/content/Data/Data/Malignant
/content/Data/Data/Malignant/all_pro
/content/Data/Data/Malignant/colon_aca
/content/Data/Data/Malignant/breast_malignant
/content/Data/Data/Malignant/lung_aca
/content/Data/Data/Malignant/all_early
/content/Data/Data/Malignant/lung_scc
/content/Data/Data/Malignant/all_pre
/content/Data/Data/Malignant/oral_scc
/content/Data/Data/Benign
/content/Data/Data/Benign/lung_bnt
/content/Data/Data/Benign/oral_normal
/content/Data/Data/Benign/breast_benign
/content/Data/Data/Benign/colon_bnt
/content/Data/Data/Benign/all_benign


In [None]:
# --- Cell 3: Gather all image paths recursively ---
import glob, random, math

def gather_filepaths_by_class(root, classes, extensions=(".png", ".jpg", ".jpeg", ".bmp")):
    """Recursively collect image files for each class folder under `root`.

    Args:
        root: Directory that contains one sub-directory per class.
        classes: Class folder names to scan, in the order they should appear
            in the result.
        extensions: File suffixes treated as images. Matching is
            case-insensitive, so ".JPG" or ".PNG" files are included too
            (a case-sensitive glob silently drops them on Linux).

    Returns:
        A tuple of three parallel lists ``(filepaths, labels, subfolders)``:
        the file path, the class name it was found under, and the first
        directory level below the class folder ('' when the file sits
        directly in the class folder). Paths are sorted within each class.
        A class folder that does not exist contributes no entries.
    """
    wanted = tuple(ext.lower() for ext in extensions)
    filepaths, labels, subfolders = [], [], []
    for cls in classes:
        cls_root = os.path.join(root, cls)
        # One recursive walk per class; the suffix filter replaces the
        # per-extension glob patterns.
        candidates = glob.glob(os.path.join(cls_root, "**", "*"), recursive=True)
        cls_files = sorted(
            fp for fp in candidates
            # isfile() keeps out directories that happen to end in an image suffix.
            if fp.lower().endswith(wanted) and os.path.isfile(fp)
        )
        for fp in cls_files:
            rel = os.path.relpath(fp, cls_root)
            top_sub = rel.split(os.sep)[0] if os.sep in rel else ''
            filepaths.append(fp)
            labels.append(cls)
            subfolders.append(top_sub)
    return filepaths, labels, subfolders

# Class folder names, in the order they are scanned.
classes = ["Benign", "Malignant"]
# Three parallel lists: image path, class name, first sub-folder below the class.
gathered = gather_filepaths_by_class(DATA_ROOT, classes)
filepaths, labels, subfolders = gathered
print("✅ Total images found:", len(filepaths))


✅ Total images found: 65002


In [None]:
# --- Cell 4: Train/Validation/Test Split ---
# Fixed seed so the split, and therefore every metric reported below, is the
# same on every run.
SPLIT_SEED = 42
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15
assert math.isclose(train_ratio + val_ratio + test_ratio, 1.0), "split ratios must sum to 1"

if len(filepaths) == 0:
    raise RuntimeError("No images found; cannot create train/validation/test splits.")

# Shuffle paths, labels and sub-folders together so they stay aligned.
# A private Random instance leaves the global random state untouched, and the
# results go into new names instead of overwriting `filepaths`/`labels`, so
# re-running this cell always reproduces the same split.
combined = list(zip(filepaths, labels, subfolders))
random.Random(SPLIT_SEED).shuffle(combined)
shuffled_fps, shuffled_labels, shuffled_subfolders = zip(*combined)

n_total = len(shuffled_fps)
n_train = int(train_ratio * n_total)
n_val = int(val_ratio * n_total)

# The test split takes whatever remains after train and validation, so
# rounding never drops a sample.
train_fps, val_fps, test_fps = (
    shuffled_fps[:n_train],
    shuffled_fps[n_train:n_train + n_val],
    shuffled_fps[n_train + n_val:]
)
train_labels, val_labels, test_labels = (
    shuffled_labels[:n_train],
    shuffled_labels[n_train:n_train + n_val],
    shuffled_labels[n_train + n_val:]
)

print(f"Training: {len(train_fps)} | Validation: {len(val_fps)} | Test: {len(test_fps)}")

Training: 45501 | Validation: 9750 | Test: 9751


In [None]:
# --- Cell 5: Dataset and DataLoader ---
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from torchvision import transforms

class CancerDataset(Dataset):
    """Image dataset yielding ``(image_tensor, label)`` pairs.

    Labels are encoded as 0 for "benign" (case-insensitive) and 1 for any
    other class name.

    Args:
        filepaths: Sequence of image file paths.
        labels: Sequence of class names, parallel to `filepaths`.
        transform: Optional callable applied to the RGB PIL image. When
            omitted, images are resized to 224x224 and converted to a tensor,
            which is the pipeline this class always used. Passing a transform
            lets the training split use augmentation.

    Raises:
        ValueError: If `filepaths` and `labels` differ in length.
    """

    def __init__(self, filepaths, labels, transform=None):
        filepaths = list(filepaths)
        labels = list(labels)
        if len(filepaths) != len(labels):
            raise ValueError(
                f"filepaths and labels must have the same length "
                f"(got {len(filepaths)} and {len(labels)})"
            )
        self.filepaths = filepaths
        self.labels = [0 if l.lower()=="benign" else 1 for l in labels]
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((224,224)),
                transforms.ToTensor(),
            ])
        self.transform = transform

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        # Force 3 channels so grayscale / RGBA files give same-shaped tensors.
        img = Image.open(self.filepaths[idx]).convert("RGB")
        return self.transform(img), torch.tensor(self.labels[idx])

batch_size = 16

# One dataset per split, built from the path/label tuples of the split cell.
train_dataset = CancerDataset(train_fps, train_labels)
val_dataset = CancerDataset(val_fps, val_labels)
test_dataset = CancerDataset(test_fps, test_labels)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("✅ Dataloaders ready!")


✅ Dataloaders ready!


In [None]:
# --- Cell 6: MobileViT Model Setup ---
!pip install -q timm

import timm
import torch.nn as nn

# Re-derive the device here so this cell also works when run on its own.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Use small MobileViT variant
# Pretrained weights with the classifier head sized for the two classes.
model = timm.create_model("mobilevit_s", pretrained=True, num_classes=2).to(device)

# Cross-entropy over the two logits, optimized with Adam.
learning_rate = 1e-4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print("✅ MobileViT model ready on:", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

✅ MobileViT model ready on: cuda


In [None]:
# --- Cell 7: Training Loop ---
from tqdm import tqdm

epochs = 20
best_val_acc = 0
# CPU copy of the weights from the epoch with the highest validation accuracy.
# Restored after the loop so evaluation uses the best model, not just the last one.
best_state = None

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    running_loss, correct, total = 0, 0, 0

    # Loop variable is `targets`, not `labels`, so the notebook-level `labels`
    # list built by the data-gathering cell is not overwritten with a tensor.
    for imgs, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        imgs, targets = imgs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # loss.item() is the batch mean; weight it by batch size so the epoch
        # average stays correct when the last batch is smaller.
        running_loss += loss.item() * imgs.size(0)
        _, preds = torch.max(outputs, 1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)

    # Guard against an empty loader instead of dividing by zero.
    train_loss = running_loss / total if total else 0.0
    train_acc = correct / total if total else 0.0

    # ---- validation pass (no gradients, eval-mode layers) ----
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs, targets = imgs.to(device), targets.to(device)
            outputs = model(imgs)
            _, preds = torch.max(outputs, 1)
            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)
    val_acc = val_correct / val_total if val_total else 0.0

    # Keep a snapshot of the best weights seen so far.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    print(
        f"Epoch [{epoch+1}/{epochs}] | Train Loss: {train_loss:.4f} | "
        f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}"
    )

# Leave `model` holding the best-validation weights for the cells that follow.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best weights (Val Acc: {best_val_acc:.4f})")


Epoch 1/20: 100%|██████████| 2844/2844 [10:42<00:00,  4.43it/s]


Epoch [1/20] | Train Acc: 0.9636 | Val Acc: 0.9810


Epoch 2/20: 100%|██████████| 2844/2844 [10:02<00:00,  4.72it/s]


Epoch [2/20] | Train Acc: 0.9880 | Val Acc: 0.9936


Epoch 3/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.83it/s]


Epoch [3/20] | Train Acc: 0.9925 | Val Acc: 0.9947


Epoch 4/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.84it/s]


Epoch [4/20] | Train Acc: 0.9948 | Val Acc: 0.9908


Epoch 5/20: 100%|██████████| 2844/2844 [09:47<00:00,  4.84it/s]


Epoch [5/20] | Train Acc: 0.9953 | Val Acc: 0.9953


Epoch 6/20: 100%|██████████| 2844/2844 [09:46<00:00,  4.85it/s]


Epoch [6/20] | Train Acc: 0.9960 | Val Acc: 0.9947


Epoch 7/20: 100%|██████████| 2844/2844 [09:47<00:00,  4.84it/s]


Epoch [7/20] | Train Acc: 0.9966 | Val Acc: 0.9954


Epoch 8/20: 100%|██████████| 2844/2844 [09:50<00:00,  4.82it/s]


Epoch [8/20] | Train Acc: 0.9971 | Val Acc: 0.9954


Epoch 9/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.83it/s]


Epoch [9/20] | Train Acc: 0.9971 | Val Acc: 0.9939


Epoch 10/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.83it/s]


Epoch [10/20] | Train Acc: 0.9979 | Val Acc: 0.9962


Epoch 11/20: 100%|██████████| 2844/2844 [09:47<00:00,  4.84it/s]


Epoch [11/20] | Train Acc: 0.9976 | Val Acc: 0.9935


Epoch 12/20: 100%|██████████| 2844/2844 [09:47<00:00,  4.84it/s]


Epoch [12/20] | Train Acc: 0.9982 | Val Acc: 0.9977


Epoch 13/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.83it/s]


Epoch [13/20] | Train Acc: 0.9979 | Val Acc: 0.9958


Epoch 14/20: 100%|██████████| 2844/2844 [09:48<00:00,  4.83it/s]


Epoch [14/20] | Train Acc: 0.9985 | Val Acc: 0.9979


Epoch 15/20: 100%|██████████| 2844/2844 [09:45<00:00,  4.85it/s]


Epoch [15/20] | Train Acc: 0.9983 | Val Acc: 0.9957


Epoch 16/20: 100%|██████████| 2844/2844 [09:46<00:00,  4.85it/s]


Epoch [16/20] | Train Acc: 0.9987 | Val Acc: 0.9962


Epoch 17/20: 100%|██████████| 2844/2844 [09:46<00:00,  4.85it/s]


Epoch [17/20] | Train Acc: 0.9984 | Val Acc: 0.9947


Epoch 18/20: 100%|██████████| 2844/2844 [09:45<00:00,  4.86it/s]


Epoch [18/20] | Train Acc: 0.9988 | Val Acc: 0.9956


Epoch 19/20: 100%|██████████| 2844/2844 [09:47<00:00,  4.84it/s]


Epoch [19/20] | Train Acc: 0.9988 | Val Acc: 0.9963


Epoch 20/20: 100%|██████████| 2844/2844 [09:46<00:00,  4.85it/s]


Epoch [20/20] | Train Acc: 0.9990 | Val Acc: 0.9969


In [None]:
# --- Cell 8: Final Evaluation ---
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Evaluate on the held-out test split with the model in eval mode and
# gradient tracking disabled.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        outputs = model(imgs)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.max(dim=1).indices
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute every metric first, then print them together.
class_names = ["Benign", "Malignant"]
test_accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=class_names)
matrix = confusion_matrix(all_labels, all_preds)

print("✅ Evaluation complete.\n")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", matrix)