# Melanoma Dermoscopic Prognosis - Colab Training Notebook

This notebook sets up the environment, downloads the dataset and the pretrained checkpoint from Google Drive, and trains the model by running `src/main.py`.

**Note:** The model uses image-only input (dermoscopic images). Clinical features are not required.


## Step 1: Clone Repository


In [None]:
import os
import sys
from pathlib import Path

# Location of the project on GitHub and the folder name that `git clone` creates
repo_url = "https://github.com/Salahuddin-quadri/Melanoma-Dermoscopic-Prognosis.git"
repo_name = "Melanoma-Dermoscopic-Prognosis"

# Three cases, so the cell can be re-run safely:
#   1. the working directory is already the repo root -> nothing to do
#   2. a checkout exists next to us                     -> just enter it
#   3. no checkout yet                                  -> clone, then enter it
if not Path("src/main.py").exists():
    if Path(repo_name).exists():
        print(f"Repository {repo_name} already exists. Changing to it.")
        os.chdir(repo_name)
        print(f"Current directory: {os.getcwd()}")
    else:
        get_ipython().system(f'git clone {repo_url}')
        os.chdir(repo_name)
        print(f"Cloned and changed to: {os.getcwd()}")
else:
    print("Already in repository directory. Skipping clone.")
    print(f"Current directory: {os.getcwd()}")

# Fail fast if the expected entry point is missing after the steps above
assert os.path.exists("src/main.py"), "src/main.py not found! Check repository structure."
print("✓ Repository setup complete!")


## Step 2: Install Dependencies


In [None]:
# Install PyTorch with CUDA support
!pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118

# Install other dependencies
!pip install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.4.2 scipy==1.11.4
!pip install matplotlib==3.8.4 seaborn==0.13.2 opencv-python==4.10.0.84 Pillow==10.4.0
!pip install tqdm==4.66.4 ipywidgets==8.1.3 imbalanced-learn==0.12.3
!pip install gdown

print("✓ Dependencies installed!")


## Step 3: Mount Google Drive and Download Data

**Important:** The data files are large (~1GB). We mount Google Drive so it is available in this session, then download the dataset archive and the pretrained model from the shared Drive links with `gdown`.


In [None]:
# Attach the user's Google Drive to the Colab filesystem
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Confirmation messages shown once the mount call has returned
for message in (
    "✓ Google Drive mounted successfully!",
    "Your Drive is now accessible at /content/drive/MyDrive",
):
    print(message)


### Download Data Folder from Google Drive

The data folder contains:
- `images/` - dermoscopic images
- `merged_dataset.csv` - metadata CSV
- `meta_data.csv` - additional metadata


In [None]:
# Download the file from Google Drive
file_id = "1LCbiPcQXJperjrOhG4Xb947rkCiJPDRh"
output_filename = "downloaded_file.zip"

!gdown {file_id} -O {output_filename}

# Unzip only the 'data/' folder into the current directory
import zipfile
import os

with zipfile.ZipFile(output_filename, 'r') as zip_ref:
    members = [m for m in zip_ref.namelist() if m.startswith("data/")]
    zip_ref.extractall(members=members)

print("✓ 'data/' folder extracted to current directory.")
print("\nExtracted contents of 'data/':")
!ls -la data/

### Download DINO v3 Pretrained Model from Google Drive

The dino_v3 folder contains the pretrained checkpoint needed for training.


In [None]:
# Shared Google Drive folder holding the dino_v3 pretrained model
dino_folder_id = "1wlozH7SchoPkAgsDpn-DyBN-F2Ea8hmP"

# Local destination; created up front so the download has a target directory
dino_dest = "dino_v3"
Path(dino_dest).mkdir(parents=True, exist_ok=True)

print(f"Downloading dino_v3 folder (ID: {dino_folder_id})...")
print("This may take a while as the folder is large...")

# gdown fetches the whole shared folder into the destination directory
dino_folder_url = f"https://drive.google.com/drive/folders/{dino_folder_id}"
get_ipython().system(f'gdown --folder {dino_folder_url} -O {dino_dest} --remaining-ok')

print("\n✓ DINO v3 model download complete!")


### Verify Downloaded Files


In [None]:
# Report what the download cells produced: total sizes, required files, checkpoints.
BYTES_PER_MB = 1024 * 1024


def _files_under(root):
    """Return every regular file found recursively below `root`."""
    return [entry for entry in root.rglob('*') if entry.is_file()]


def _total_mb(files):
    """Sum the byte sizes of `files` and convert the total to megabytes."""
    return sum(entry.stat().st_size for entry in files) / BYTES_PER_MB


def _file_mb(path):
    """Return the size of a single path in megabytes."""
    return path.stat().st_size / BYTES_PER_MB


# ---- data/ -----------------------------------------------------------------
data_dir = Path("data")
print("=" * 60)
print("DATA FOLDER VERIFICATION")
print("=" * 60)

if not data_dir.exists():
    print("✗ Data directory not found!")
else:
    total_size_mb = _total_mb(_files_under(data_dir))
    print(f"Total size: {total_size_mb:.2f} MB")
    print(f"\nContents:")

    # Metadata CSVs expected at the top level of data/
    required_files = ["merged_dataset.csv", "meta_data.csv"]
    for file in required_files:
        file_path = data_dir / file
        if not file_path.exists():
            print(f"  ✗ Missing: {file}")
            continue
        size_mb = _file_mb(file_path)
        print(f"  ✓ {file} ({size_mb:.2f} MB)")

    # Image folder: count files and add up their sizes
    images_dir = data_dir / "images"
    if not images_dir.exists():
        print(f"  ✗ Missing: images/ folder")
    else:
        image_files = _files_under(images_dir)
        num_images = len(image_files)
        images_size_mb = _total_mb(image_files)
        print(f"  ✓ images/ folder ({num_images} files, {images_size_mb:.2f} MB)")

# ---- dino_v3/ --------------------------------------------------------------
print("\n" + "=" * 60)
print("DINO_V3 FOLDER VERIFICATION")
print("=" * 60)

dino_dir = Path("dino_v3")
if not dino_dir.exists():
    print("✗ dino_v3 directory not found!")
else:
    total_size_mb = _total_mb(_files_under(dino_dir))
    print(f"Total size: {total_size_mb:.2f} MB")
    print(f"\nContents:")

    # Prefer the expected checkpoint location; otherwise report any .pt file found
    checkpoint_path = dino_dir / "outputs_dino" / "checkpoints" / "best.pt"
    if checkpoint_path.exists():
        checkpoints = [checkpoint_path]
    else:
        checkpoints = list(dino_dir.rglob("*.pt"))

    if not checkpoints:
        print(f"  ✗ No checkpoint files (.pt) found")
    for ckpt in checkpoints:
        size_mb = _file_mb(ckpt)
        print(f"  ✓ {ckpt.relative_to(dino_dir)} ({size_mb:.2f} MB)")

    # Full listing, in sorted order
    print(f"\nAll files in dino_v3:")
    for item in sorted(_files_under(dino_dir)):
        size_mb = _file_mb(item)
        print(f"  {item.relative_to(dino_dir)} ({size_mb:.2f} MB)")


## Step 4: Verify Setup


In [None]:
# Summarise the runtime: interpreter, PyTorch/CUDA status, and repository layout.
import sys
import torch

# Interpreter and framework versions
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")

# GPU details are only queried when CUDA is actually usable
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {gpu_properties.total_memory / 1024**3:.2f} GB")

# Confirm the working directory is the repository root
print("\nRepository structure:")
print(f"  Current directory: {os.getcwd()}")
print(f"  src/main.py exists: {os.path.exists('src/main.py')}")
print(f"  requirements.txt exists: {os.path.exists('requirements.txt')}")

print("\n✓ Setup verification complete!")


## Step 5: Training Configuration

Configure your training parameters here. Adjust as needed.

**Model Architecture:** The model takes only dermoscopic images as input (no clinical features). 
- DINO model: Uses Vision Transformer (ViT) backbone with domain-specific pretraining
- ResNet model: Uses ResNet50 backbone with ImageNet pretraining


In [None]:
# Training configuration
# Note: Model takes only images as input (no clinical features)
# Each key is later turned into a `--key value` command-line argument.
config = {
    "metadata_path": "data/meta_data.csv",
    "image_dir": "data/images",
    "mode": "train",
    "model_type": "dino",  # "dino" or "resnet"
    "epochs": 200,
    "batch_size": 32,
    "image_size": [224, 224],
    "output_dir": "outputs",
    "task": "classification",  # "classification" or "regression" (used when multitask=False)
    "multitask": True,  # Set to True for dual-head (classification + regression)
    "loss_alpha": 0.5,  # Weight for classification loss in multitask (0-1)
    "cls_loss_type": "weighted_bce",  # "bce", "weighted_bce", or "focal"
    "focal_gamma": 2.0,  # For focal loss (used when cls_loss_type="focal")
    "freeze_backbone_layers": 7,  # Number of ViT layers to freeze (0 = all trainable)
    "val_size": 0.15,
    "test_size": 0.15,
    "device": "auto",  # "cuda", "cpu", or "auto"
}

# Preferred DINO checkpoint locations, checked in order
dino_checkpoint_candidates = [
    "dino_v3/outputs_dino/checkpoints/best.pt",
    "dino_v3/checkpoints/best.pt",
]


def find_dino_checkpoint(candidates, search_root="dino_v3", filename="best.pt"):
    """Locate the DINO checkpoint file.

    Args:
        candidates: Paths to try first, in order of preference.
        search_root: Directory searched recursively when no candidate exists.
        filename: File name looked for during the recursive search.

    Returns:
        The first existing candidate; otherwise the shallowest `filename` found
        under `search_root` (ties broken alphabetically); otherwise None.
    """
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate

    # The folder download may nest the files one level deeper than expected,
    # so fall back to a recursive search instead of giving up.
    root = Path(search_root)
    if root.is_dir():
        matches = [match for match in root.rglob(filename) if match.is_file()]
        if matches:
            matches.sort(key=lambda match: (len(match.parts), str(match)))
            return str(matches[0])
    return None


dino_checkpoint = find_dino_checkpoint(dino_checkpoint_candidates)

if dino_checkpoint:
    config["dino_checkpoint"] = dino_checkpoint
    print(f"✓ Found DINO checkpoint: {dino_checkpoint}")
else:
    # Leave the key unset so no --dino_checkpoint argument is passed to training
    print("⚠ No DINO checkpoint found. Will use ImageNet pretrained weights.")

print("\nTraining configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")


## Step 6: Run Training


In [None]:
# Build command arguments
import shlex


def build_cli_args(config):
    """Convert a config mapping into a flat list of command-line tokens.

    Args:
        config: Mapping of option name to value.

    Returns:
        A list of strings. `None` and empty-string values are skipped, a True
        boolean becomes a bare `--key` flag (False is omitted), a list or tuple
        becomes `--key` followed by one token per element, and any other value
        becomes `--key` followed by `str(value)`.
    """
    cli_args = []
    for key, value in config.items():
        if value is None or value == "":
            continue
        flag = f"--{key}"
        # bool must be tested before the generic branch so True/False are not stringified
        if isinstance(value, bool):
            if value:
                cli_args.append(flag)
        elif isinstance(value, (list, tuple)):
            cli_args.append(flag)
            cli_args.extend(str(v) for v in value)
        else:
            cli_args.extend([flag, str(value)])
    return cli_args


args_list = build_cli_args(config)

# Quote each token so values containing spaces or shell metacharacters
# (for example a Google Drive path) reach the program as a single argument.
args_str = shlex.join(args_list)

print("=" * 60)
print("TRAINING COMMAND")
print("=" * 60)
print(f"python -m src.main {args_str}")
print("=" * 60)
print("\nStarting training...\n")


In [None]:
# Run training: hand the assembled arguments to the project's entry module
training_command = f'python -m src.main {args_str}'
get_ipython().system(training_command)


In [None]:
# List output files
# Note: Training creates subdirectories train1, train2, etc. (YOLO-style organization)
import re


def _run_sort_key(run_dir):
    """Order run directories by trailing number, so train10 sorts after train9.

    Directories without a trailing number sort first; the name breaks ties.
    """
    match = re.search(r"(\d+)$", run_dir.name)
    return (int(match.group(1)) if match else -1, run_dir.name)


def _print_files(root, base):
    """Print every file under `root` with its size in MB, path shown relative to `base`."""
    for item in sorted(root.rglob("*")):
        if item.is_file():
            size = item.stat().st_size / (1024 * 1024)  # Size in MB
            print(f"  {item.relative_to(base)} ({size:.2f} MB)")


output_dir = Path(config["output_dir"])
if output_dir.exists():
    print(f"Output directory: {output_dir}")
    print("\nContents:")

    # Find training run directories (train1, train2, etc.).
    # A plain lexicographic sort would rank "train10" before "train2" and pick
    # the wrong run as the latest, so sort on the numeric suffix instead.
    train_dirs = sorted(
        (d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith("train")),
        key=_run_sort_key,
    )

    if train_dirs:
        latest_train_dir = train_dirs[-1]
        print(f"\nLatest training run: {latest_train_dir.name}")

        # List files in latest training run
        _print_files(latest_train_dir, output_dir)

        # Check for checkpoints in latest training run
        checkpoint_dir = latest_train_dir / "checkpoints"
        if checkpoint_dir.exists():
            print(f"\nCheckpoints in {latest_train_dir.name}:")
            for ckpt in sorted(checkpoint_dir.glob("*.pt")):
                size = ckpt.stat().st_size / (1024 * 1024)
                print(f"  ✓ {ckpt.name} ({size:.2f} MB)")
        else:
            print(f"\n⚠ No checkpoints directory found in {latest_train_dir.name}")
    else:
        # Fallback: list all files directly
        _print_files(output_dir, output_dir)
else:
    print(f"⚠ Output directory {output_dir} not found.")


## Optional: Save Results to Google Drive

After training, you can save the outputs to your Google Drive for later use.
