# Colorectal Histology Classification - Training on Kaggle

**Task**: 8-class tissue type classification from histopathology images

**Before running, make sure you have:**
1. GPU enabled: Settings -> Accelerator -> GPU P100 or T4
2. Added dataset: + Add Input -> Search `colorectal-histology-mnist` by Kevin Mader
3. Added your code: + Add Input -> Upload `project_for_colab.zip`

In [None]:
# Cell 1: Check GPU and find datasets
import torch
import os

print('=' * 50)
print('GPU CHECK')
print('=' * 50)
!nvidia-smi -L 2>/dev/null || echo 'No GPU found'
print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

print('\n' + '=' * 50)
print('DATASET SEARCH')
print('=' * 50)

# Show what kaggle has in /kaggle/input
print('\nAll directories in /kaggle/input:')
!find /kaggle/input -maxdepth 5 -type d | head -40

# Search for project (zip OR extracted folder) and colorectal data
project_zip_path = None
project_folder_path = None
colorectal_data_path = None

for root, dirs, files in os.walk('/kaggle/input'):
    # Find project zip file
    for f in files:
        if f.endswith('.zip') and 'project' in f.lower():
            project_zip_path = os.path.join(root, f)
    # Find extracted project folder (Kaggle auto-extracts zips)
    if 'train.py' in files and 'config.py' in files:
        project_folder_path = root
    # Find colorectal data (look for the numbered class folders)
    if '01_TUMOR' in dirs and '08_EMPTY' in dirs:
        colorectal_data_path = root

# Prefer extracted folder over zip
project_path = project_folder_path or project_zip_path
project_type = 'folder' if project_folder_path else ('zip' if project_zip_path else None)

print(f'\n{"=" * 50}')
print(f'Project:          {project_path or "NOT FOUND"} ({project_type or "N/A"})')
print(f'Colorectal data:  {colorectal_data_path or "NOT FOUND"}')
print(f'{"=" * 50}')

if not project_path:
    print('\n>>> ADD YOUR PROJECT: + Add Input -> Upload project_for_colab.zip')
if not colorectal_data_path:
    print('\n>>> ADD DATASET: + Add Input -> Search "colorectal-histology-mnist"')
if project_path and colorectal_data_path:
    print('\n[OK] Everything found! Continue to Cell 2.')

In [None]:
# Cell 2: Extract/copy project and write hyperparameters
import os, json, shutil

project_dir = '/kaggle/working/medical-image-classification'

if os.path.exists(project_dir):
    shutil.rmtree(project_dir)

if project_path is None:
    raise Exception('No project found! Go back to Cell 1 and add your project dataset.')

if project_type == 'zip':
    os.makedirs(project_dir, exist_ok=True)
    os.system(f'unzip -q "{project_path}" -d "{project_dir}"')
    print(f'[OK] Extracted zip: {project_path}')
else:
    # Kaggle auto-extracted the zip - copy the folder
    shutil.copytree(project_path, project_dir)
    print(f'[OK] Copied folder: {project_path}')

%cd {project_dir}

# Write colorectal hyperparameters
params = {
    'best_hyperparameters': {
        'learning_rate': 0.001,
        'batch_size': 32,
        'epochs': 70
    },
    'optimization_summary': {
        'method': 'transferred_from_brain_tumor',
        'note': '8-class classification needs more epochs than binary',
        'dataset': 'colorectal',
        'classes': 8
    },
    'dataset_info': {
        'name': 'Colorectal Cancer Histopathology',
        'classes': ['TUMOR', 'STROMA', 'COMPLEX', 'LYMPHO', 'DEBRIS', 'MUCOSA', 'ADIPOSE', 'EMPTY'],
        'num_classes': 8
    }
}
os.makedirs('results/phase1', exist_ok=True)
with open('results/phase1/best_hyperparameters.json', 'w') as f:
    json.dump(params, f, indent=2)

print('[OK] Hyperparameters written!')
print('\nProject files:')
!ls

In [None]:
# Cell 3: Install dependencies
!pip install -q albumentations scikit-image 2>/dev/null
import torch, torchvision, sklearn, PIL
print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')
print('[OK] Dependencies ready!')

In [None]:
# Cell 4: Split dataset into train/val/test and rename folders
#
# Copies every image from the numbered source folders under
# `colorectal_data_path` (set by Cell 1) into
# data/colorectal/{train,val,test}/<CLASS>/, using a seeded per-class shuffle.
import os, shutil, random
from pathlib import Path

if colorectal_data_path is None:
    raise Exception('No colorectal dataset found! Go back to Cell 1 and add it.')

print(f'Dataset source: {colorectal_data_path}')

# Mapping from Kather folder names to our class names
FOLDER_TO_CLASS = {
    '01_TUMOR': 'TUMOR',
    '02_STROMA': 'STROMA',
    '03_COMPLEX': 'COMPLEX',
    '04_LYMPHO': 'LYMPHO',
    '05_DEBRIS': 'DEBRIS',
    '06_MUCOSA': 'MUCOSA',
    '07_ADIPOSE': 'ADIPOSE',
    '08_EMPTY': 'EMPTY'
}

# Split ratios. The test split receives whatever remains after the train and
# val counts are rounded down, so TEST_RATIO is informational (used in the
# header line only).
TRAIN_RATIO = 0.80
VAL_RATIO = 0.10
TEST_RATIO = 0.10

IMAGE_PATTERNS = ('*.tif', '*.jpg', '*.png')

random.seed(42)  # Reproducible splits (together with the sort below)

data_dir = Path('data/colorectal')

# Clean up any previous setup
if data_dir.exists():
    shutil.rmtree(data_dir)

# Create split directories
for split in ['train', 'val', 'test']:
    for cls in FOLDER_TO_CLASS.values():
        (data_dir / split / cls).mkdir(parents=True, exist_ok=True)

print(f'\nSplitting dataset ({TRAIN_RATIO:.0%} train / {VAL_RATIO:.0%} val / {TEST_RATIO:.0%} test):')
print(f'{"="*60}')

total_train = total_val = total_test = 0

for folder_name, class_name in FOLDER_TO_CLASS.items():
    src_dir = Path(colorectal_data_path) / folder_name
    if not src_dir.exists():
        print(f'  [WARN] {folder_name} not found, skipping')
        continue

    # Collect all image files. Directory listing order is not guaranteed, so
    # sort before shuffling: otherwise the seeded shuffle would start from a
    # different permutation on different filesystems and the split would not
    # be reproducible.
    images = sorted(
        path for pattern in IMAGE_PATTERNS for path in src_dir.glob(pattern)
    )
    if not images:
        print(f'  [WARN] {folder_name} contains no .tif/.jpg/.png images, skipping')
        continue
    random.shuffle(images)

    n = len(images)
    n_train = int(n * TRAIN_RATIO)
    n_val = int(n * VAL_RATIO)
    n_test = n - n_train - n_val  # remainder, so no image is dropped

    train_imgs = images[:n_train]
    val_imgs = images[n_train:n_train + n_val]
    test_imgs = images[n_train + n_val:]

    # Copy files (symlinks don't work well on Kaggle for nested paths)
    for split, split_imgs in (('train', train_imgs), ('val', val_imgs), ('test', test_imgs)):
        dest_dir = data_dir / split / class_name
        for img in split_imgs:
            shutil.copy2(str(img), str(dest_dir / img.name))

    total_train += n_train
    total_val += n_val
    total_test += n_test

    print(f'  {class_name:8s}: {n:4d} total -> {n_train:4d} train / {n_val:3d} val / {n_test:3d} test')

print(f'{"="*60}')
print(f'  TOTAL:   {total_train + total_val + total_test:4d} total -> {total_train:4d} train / {total_val:3d} val / {total_test:3d} test')
print(f'\n[OK] Dataset split and ready!')

In [None]:
# Cell 5: Verify config and dataset loader
!python -c "from config import get_config; c = get_config('colorectal'); print('Config:', c['dataset']['name'], '| Classes:', c['dataset']['num_classes'])"
!python -c "from src.datasets.colorectal import ColorectalDataset; print('Dataset loader: OK')"
print('\nHyperparameters:')
!cat results/phase1/best_hyperparameters.json
print('\n[OK] Ready to train!')

In [None]:
# Cell 6: TRAIN!
print('=' * 60)
print('  TRAINING: Colorectal Histology | ResNet-18 | 70 epochs')
print('  8 classes: TUMOR, STROMA, COMPLEX, LYMPHO,')
print('             DEBRIS, MUCOSA, ADIPOSE, EMPTY')
print('=' * 60 + '\n')

!python train.py --dataset colorectal --use_optimized --device cuda

print('\n' + '=' * 60)
print('  TRAINING COMPLETE!')
print('=' * 60)

In [None]:
# Cell 7: Results
import json, os

!ls -lh models/checkpoints/

if os.path.exists('results/phase1/training_history.json'):
    with open('results/phase1/training_history.json') as f:
        h = json.load(f)
    print(f'\nTrain Acc:  {h["train_acc"][-1]:.2f}%')
    print(f'Val Acc:    {h["val_acc"][-1]:.2f}%')
    print(f'Epochs:     {len(h["train_acc"])}')
    print(f'Best Val:   {max(h["val_acc"]):.2f}%')
else:
    print('\nNo training history found.')

In [None]:
# Cell 8: Save for download
import shutil

for f in ['models/checkpoints/best_model.pth', 'results/phase1/training_history.json', 'results/training_results_colorectal.json']:
    if os.path.exists(f):
        shutil.copy2(f, '/kaggle/working/')
        print(f'[OK] {os.path.basename(f)}')

print('\nFiles ready:')
!ls -lh /kaggle/working/*.pth /kaggle/working/*.json 2>/dev/null
print('\nClick "Save Version" -> "Save & Run All" -> then download from Output tab')