# Melanoma Dermoscopic Prognosis - Colab Training Notebook

This notebook sets up the environment, clones the project repository, downloads the data from Google Drive, and trains the model by running `src/main.py` (invoked as `python -m src.main`).


## Step 1: Install Dependencies


In [None]:
# Install required packages
!pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
!pip install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.4.2 scipy==1.11.4
!pip install matplotlib==3.8.4 seaborn==0.13.2 opencv-python==4.10.0.84 Pillow==10.4.0
!pip install tqdm==4.66.4 ipywidgets==8.1.3 imbalanced-learn==0.12.3
!pip install gdown  # For downloading from Google Drive


## Step 2: Clone Repository


In [None]:
import os
import sys
from pathlib import Path

# Clone the project repository (unless it is already present) and make it the
# working directory, so that later cells can use repo-relative paths such as
# "src/main.py" and "data".
#
# Replace the placeholder URL below with your actual GitHub repo URL.
# To look it up, check the git remote in your local clone:
#   git remote get-url origin
repo_url = "https://github.com/yourusername/Melanoma-Dermoscopic-Prognosis.git"  # UPDATE THIS
repo_name = "Melanoma-Dermoscopic-Prognosis"

if os.path.exists("src/main.py"):
    # The cell was re-run after a successful setup: we are already inside the repo.
    print("Already in repository directory. Skipping clone.")
    print(f"Current directory: {os.getcwd()}")
elif os.path.exists(repo_name):
    # A previous clone exists next to the notebook: just enter it.
    print(f"Repository {repo_name} already exists. Changing to it.")
    os.chdir(repo_name)
    print(f"Current directory: {os.getcwd()}")
else:
    # Fresh runtime: clone the repository, then enter it.
    get_ipython().system(f'git clone {repo_url}')
    # The shell escape does not raise when git fails (e.g. the placeholder URL
    # was never updated), so confirm the checkout exists before os.chdir()
    # turns the problem into an unhelpful FileNotFoundError.
    if not os.path.isdir(repo_name):
        raise RuntimeError(
            f"git clone of {repo_url!r} did not create the directory {repo_name!r}. "
            "Set repo_url to your repository URL (and check that you have access), "
            "then re-run this cell."
        )
    os.chdir(repo_name)
    print(f"Cloned and changed to: {os.getcwd()}")

# Verify we're in the right place: every later step expects src/main.py here.
assert os.path.exists("src/main.py"), "src/main.py not found! Check repository structure."
print("✓ Repository setup complete!")


## Step 3: Download Data from Google Drive

**Note:** If the Google Drive folder is not publicly accessible, you may need to:
1. Make the folder shareable (anyone with link can view), OR
2. Use the Google Drive mounting method below (requires authentication)


In [None]:
# Download the dataset from a shared Google Drive folder into ./data.

# Google Drive folder ID
drive_folder_id = "1P9YYZJbTQsadjvwXRvKTMUdm_TXAMR9t"

# Create data directory if it doesn't exist
os.makedirs("data", exist_ok=True)

# Method 1 (active): gdown. Requires the folder to be publicly accessible
# ("anyone with the link can view"). Comment this line out if you use Method 2.
# NOTE(review): gdown limits how many files it fetches from a single folder, and
# --remaining-ok tells it to carry on instead of failing when that limit is hit,
# so a large images folder may be only partially downloaded. Confirm against the
# gdown documentation and check the image count printed by the verification cell.
get_ipython().system(f'gdown --folder https://drive.google.com/drive/folders/{drive_folder_id} -O data --remaining-ok')

# Method 2: Mount Google Drive (if you have access to the folder)
# Uncomment the following lines if Method 1 doesn't work:
# from google.colab import drive
# drive.mount('/content/drive')
# # Then copy files from your Drive folder to the data directory
# import shutil
# # Adjust the path below to match your Drive folder structure
# # shutil.copytree('/content/drive/MyDrive/your_folder_path', 'data', dirs_exist_ok=True)

# The shell escape above does not raise when gdown fails, so only report success
# if something is actually present in the data directory.
if os.listdir("data"):
    print("\nData download complete!")
else:
    print("\n✗ No files found in data/ - the download appears to have failed.")
    print("  Make the Drive folder publicly viewable, or use Method 2 (mount Google Drive).")


In [None]:
# Inventory the data directory: list every file found, then report whether the
# expected CSV files and the images folder are present. Nothing is raised here;
# problems are only printed.
import os
from pathlib import Path

data_dir = Path("data")
required_files = ["merged_dataset.csv", "meta_data.csv"]
images_dir = data_dir / "images"

# Recursive listing of files (directories are skipped), relative to data/.
print("Data directory contents:")
for found in (entry for entry in data_dir.rglob("*") if entry.is_file()):
    print(f"  {found.relative_to(data_dir)}")

# One status line per required file.
for required_name in required_files:
    is_present = (data_dir / required_name).exists()
    print(f"✓ Found: {required_name}" if is_present else f"✗ Missing: {required_name}")

# The images folder is reported with the number of direct children it contains.
if not images_dir.exists():
    print("✗ Missing images folder")
else:
    num_images = sum(1 for _ in images_dir.glob("*"))
    print(f"✓ Found images folder with {num_images} items")


## Step 4: Verify Setup


In [None]:
# Environment sanity check: print interpreter, PyTorch and GPU information,
# then confirm the working directory still looks like the repository root.
import sys
print(f"Python version: {sys.version}")

# PyTorch build and GPU visibility
import torch
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Name of the first visible CUDA device (index 0).
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Repository layout as seen from the current working directory
print("\nRepository structure:")
print(f"  Current directory: {os.getcwd()}")
for expected_path in ("src/main.py", "requirements.txt"):
    print(f"  {expected_path} exists: {os.path.exists(expected_path)}")


## Step 5: Training Configuration

Configure your training parameters here. Adjust as needed.


In [None]:
# Settings for the training run. The "Run Training" step turns each entry into
# a `--<key> <value>` command-line argument for `python -m src.main`.
config = dict(
    # Data locations (relative to the repository root)
    metadata_path="data/merged_dataset.csv",
    image_dir="data/images",
    # Run mode and model
    mode="train",
    model_type="dino",  # or "resnet"
    epochs=30,
    batch_size=16,
    image_size=[384, 384],
    output_dir="outputs",
    # Task and loss
    task="classification",  # or "regression"
    multitask=False,  # Set to True for dual-head (classification + regression)
    loss_alpha=0.5,  # Weight for classification loss in multitask
    cls_loss_type="weighted_bce",  # "bce", "weighted_bce", or "focal"
    focal_gamma=2.0,  # For focal loss
    # Architecture details
    fusion_type="cross_attention",  # "cross_attention" or "concat"
    freeze_backbone_layers=7,  # Number of ViT layers to freeze
    # Data split and device
    val_size=0.15,
    test_size=0.15,
    device="auto",  # "cuda", "cpu", or "auto"
)

# Optional: DINO checkpoint path (if you have a pretrained checkpoint)
# config["dino_checkpoint"] = "dino_v3/outputs_dino/checkpoints/best.pt"

# Echo the effective settings, one per line.
print("Training configuration:")
for option_name, option_value in config.items():
    print(f"  {option_name}: {option_value}")


## Step 6: Run Training


In [None]:
# Translate `config` into command-line arguments for `python -m src.main`.
import shlex

args_list = []
for key, value in config.items():
    # Unset options (None or empty string) are left out entirely.
    if value is None or value == "":
        continue
    if isinstance(value, bool):
        # Booleans become bare flags: emitted when True, omitted when False.
        if value:
            args_list.append(f"--{key}")
    elif isinstance(value, (list, tuple)):
        # Sequences become one flag followed by each element as its own argument.
        args_list.append(f"--{key}")
        args_list.extend(str(v) for v in value)
    else:
        args_list.extend([f"--{key}", str(value)])

# The string is later interpolated into a shell command, so quote every
# argument: values containing spaces or shell metacharacters (e.g. a path such
# as "/content/drive/My Drive/...") would otherwise be split or misinterpreted.
# For values without such characters the result is identical to a plain
# space-separated join.
args_str = shlex.join(args_list)

print("Running training with command:")
print(f"python -m src.main {args_str}")
print("\n" + "="*60)
print("Starting training...")
print("="*60 + "\n")


In [None]:
# Launch training through the shell, running the repository's `src.main`
# module with the argument string (`args_str`) built in the previous cell.
# Output from the command is shown in this cell's output.
training_command = f'python -m src.main {args_str}'
get_ipython().system(training_command)


## Step 7: Check Training Results


In [None]:
# Summarize what is on disk under the configured output directory: every file
# with its size in MB, followed by a separate listing of the *.pt files found
# directly inside the "checkpoints" subfolder.
output_dir = Path(config["output_dir"])
if not output_dir.exists():
    print(f"Output directory {output_dir} not found.")
else:
    print(f"Output directory: {output_dir}")
    print("\nContents:")
    # Recursive walk; directories themselves are skipped.
    for produced in (entry for entry in output_dir.rglob("*") if entry.is_file()):
        size_mb = produced.stat().st_size / (1024 * 1024)  # bytes -> MB
        print(f"  {produced.relative_to(output_dir)} ({size_mb:.2f} MB)")

    # Checkpoint listing is skipped silently when the folder does not exist.
    checkpoint_dir = output_dir / "checkpoints"
    if checkpoint_dir.exists():
        print("\nCheckpoints:")
        for checkpoint_file in checkpoint_dir.glob("*.pt"):
            size_mb = checkpoint_file.stat().st_size / (1024 * 1024)
            print(f"  {checkpoint_file.name} ({size_mb:.2f} MB)")


## Optional: Download Results to Local Machine

After training, you can download the checkpoints and logs to your local machine.


In [None]:
# Create a zip file of outputs (optional)
# !zip -r outputs.zip {config['output_dir']}
# print("Outputs zipped to outputs.zip. You can download it from the Colab file browser.")
