In [1]:
# ==============================
# STEP 0: INSTALL & IMPORTS
# ==============================


import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import nibabel as nib
from pathlib import Path

from nilearn import plotting, image
from nilearn.maskers import NiftiMasker

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv3D, MaxPooling3D, Dense, Dropout, 
    Flatten, BatchNormalization, GlobalAveragePooling3D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.20.0
GPU Available: []


# 3D CNN for fMRI-Based ASD Detection

**Problem Statement:** Developing Predictive Models for Early ASD Detection in Young Children Based on fMRI Scans

## Dataset: ABIDE-II (Autism Brain Imaging Data Exchange II)

**ABIDE-II** is the most comprehensive autism neuroimaging dataset for young children:
- **Size**: 1,000+ participants across 19 sites
- **Target Population**: 372 young children (ages 5-10 years) for early detection
  - **168 ASD** and **204 Control** subjects
- **Data Type**: Resting-state fMRI (rs-fMRI) scans
- **Preprocessing**: DPARSF (Data Processing Assistant for Resting-State fMRI)
- **Format**: NIfTI files (.nii.gz)

## üì• DATA DOWNLOAD REQUIRED

**‚ö†Ô∏è Important:** ABIDE-II preprocessed data must be downloaded manually from NITRC.

### Steps to Download:
1. Visit: https://fcon_1000.projects.nitrc.org/indi/abide/
2. Register for a free NITRC account
3. Download **ABIDE-II Preprocessed Data (DPARSF pipeline)**
4. Select **filt_noglobal strategy** (recommended for ASD studies)
5. Extract files to: `./abide2_fmri_data/`

### Required File Structure:
```
./abide2_fmri_data/
  ‚îú‚îÄ‚îÄ ABIDEII-EMC_1_0029864/
  ‚îÇ   ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029864_func_preproc.nii.gz
  ‚îú‚îÄ‚îÄ ABIDEII-EMC_1_0029866/
  ‚îÇ   ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029866_func_preproc.nii.gz
  ‚îî‚îÄ‚îÄ ... (372 subjects total)
```

### DPARSF Preprocessing Pipeline:
1. **Slice timing correction** - accounts for different acquisition times
2. **Realignment** - corrects head motion (6-parameter rigid body)
3. **Co-registration** - aligns T1 structural to functional images
4. **Segmentation** - separates GM, WM, CSF
5. **Normalization** - DARTEL registration to MNI space
6. **Motion artifact removal** - Friston 24-parameter model
7. **Nuisance regression** - removes WM, CSF signals
8. **Temporal filtering** - 0.01-0.1 Hz bandpass
9. **Spatial smoothing** - 6mm FWHM Gaussian kernel

### Why DPARSF + filt_noglobal?
- **DPARSF**: Most comprehensive preprocessing for ASD research
- **filt_noglobal**: Avoids GSR artifacts that may distort ASD group differences

## Requirements:
- nibabel: Read NIfTI files
- scipy: Image resampling
- tensorflow: Deep learning framework

## Step 1: Data Download Instructions

### ‚ö†Ô∏è Important Note: ABIDE-II Data Availability

**ABIDE-II preprocessed fMRI data is NOT available on public AWS S3.** Only ABIDE-I is accessible via S3.

### Option 1: Download ABIDE-II Manually (Recommended for Production)

**NITRC (NeuroImaging Tools & Resources Collaboratory):**
1. Visit: https://fcon_1000.projects.nitrc.org/indi/abide/
2. Register for free account
3. Download ABIDE-II preprocessed data (DPARSF pipeline)
4. Extract to `./abide2_fmri_data/` directory

**OpenNeuro:**
- ABIDE-II collection: https://openneuro.org/
- Raw data available, requires preprocessing

**File Structure After Download:**
```
./abide2_fmri_data/
  ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029864/
      ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029864_func_preproc.nii.gz
  ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029866/
      ‚îî‚îÄ‚îÄ ABIDEII-EMC_1_0029866_func_preproc.nii.gz
  ...
```

### Option 2: Use ABIDE-I via AWS S3 (Available for Testing)

**ABIDE-I data** IS available on public AWS S3 (no credentials required).

**URL Template:**
```
https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/[pipeline]/[strategy]/[derivative]/[FILE_ID]_[derivative].[ext]
```

**Parameters:**
- `[pipeline]`: **cpac** | dparsf | ccs | niak
- `[strategy]`: **filt_noglobal** (recommended) | filt_global | nofilt_global | nofilt_noglobal
- `[derivative]`: **func_preproc** (preprocessed 4D fMRI)
- `[FILE_ID]`: Site + Subject (e.g., `KKI_0050822`, `NYU_0050952`)

**Example URL for ABIDE-I:**
```
https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/cpac/filt_noglobal/func_preproc/NYU_0050952_func_preproc.nii.gz
```

**File Sizes:**
- `func_preproc` (4D fMRI): **30-200 MB per subject**

### Why DPARSF + filt_noglobal?
- **DPARSF**: Most comprehensive preprocessing for ASD research
- **filt_noglobal**: Avoids GSR artifacts that may distort ASD group differences

In [2]:
# ==============================
# STEP 1: DOWNLOAD ABIDE-II DATA
# ==============================

import urllib.request
from pathlib import Path

# Set data directory for ABIDE-II
data_dir = "./abide2_fmri_data"
os.makedirs(data_dir, exist_ok=True)

def download_abide_fmri_aws(file_id, output_dir="./abide2_fmri_data", pipeline="dparsf", strategy="filt_noglobal"):
    """
    Download a single preprocessed fMRI file from AWS S3 (ABIDE-II)
    Requires internet access but no credentials for public data
    
    Args:
        file_id: FILE_ID from ABIDE-II (e.g., 'ABIDEII-BNI_1_0029006')
        output_dir: Local directory to save files
        pipeline: dparsf (recommended for ABIDE-II)
        strategy: filt_noglobal (recommended for ASD studies)
    
    Returns:
        Path to downloaded file or None if failed
    """
    # Remove "ABIDEII-" prefix for S3 URL (S3 uses format: EMC_1_0029864)
    s3_file_id = file_id.replace('ABIDEII-', '') if file_id.startswith('ABIDEII-') else file_id
    
    # FCP-INDI S3 bucket (public, no credentials needed)
    base_url = "https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs"
    url = f"{base_url}/{pipeline}/{strategy}/func_preproc/{s3_file_id}_func_preproc.nii.gz"
    
    output_path = Path(output_dir) / file_id
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / f"{file_id}_func_preproc.nii.gz"
    
    if output_file.exists():
        return str(output_file)
    
    try:
        print(f"Downloading {file_id}...", end=' ')
        urllib.request.urlretrieve(url, output_file, reporthook=lambda b,c,s: None)
        file_size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f"‚úì ({file_size_mb:.1f} MB)")
        return str(output_file)
    except urllib.error.HTTPError as e:
        if output_file.exists():
            output_file.unlink()
        return None
    except Exception as e:
        if output_file.exists():
            output_file.unlink()
        return None


print(f"Data directory: {data_dir}")
print(f"Pipeline: DPARSF (Data Processing Assistant for Resting-State fMRI)")
print(f"Strategy: filt_noglobal (filtered, no GSR - recommended for ASD)")
print(f"Data source: AWS S3 - fcp-indi.s3.amazonaws.com (public, no credentials)")
print("‚úì Ready to download ABIDE-II data. Run next cell to load phenotypic file.")

Data directory: ./abide2_fmri_data
Pipeline: DPARSF (Data Processing Assistant for Resting-State fMRI)
Strategy: filt_noglobal (filtered, no GSR - recommended for ASD)
Data source: AWS S3 - fcp-indi.s3.amazonaws.com (public, no credentials)
‚úì Ready to download ABIDE-II data. Run next cell to load phenotypic file.


In [3]:
# ==============================
# STEP 2: LOAD ABIDE-II fMRI DATA FOR YOUNG CHILDREN
# ==============================

pheno_file = "ABIDEII_Composite_Phenotypic.csv"

try:
    df = pd.read_csv(pheno_file, encoding='latin1')
    df.columns = df.columns.str.strip()
    print(f"‚úì Loaded ABIDE-II phenotypic data: {df.shape[0]} total subjects")
except FileNotFoundError:
    print(f"‚ùå Error: {pheno_file} not found!")
    print("   Download from: https://fcon_1000.projects.nitrc.org/indi/abide/")
    raise

# Create labels: 1 = ASD, 0 = Control (DX_GROUP: 1=ASD, 2=Control)
df['label'] = df['DX_GROUP'].map({1: 1, 2: 0})
df = df.dropna(subset=['label'])

# Create FILE_ID from SITE_ID and SUB_ID (format: SITE_SUB_ID)
# Example: ABIDEII-BNI_1 + 29006 -> ABIDEII-BNI_1_0029006
df['FILE_ID'] = df['SITE_ID'].astype(str) + '_' + df['SUB_ID'].astype(str).str.zfill(7)

# ===== FILTER FOR YOUNG CHILDREN (5-10 years) =====
# Early ASD Detection in School-Age Children
# Note: ABIDE-II minimum age is 5.1 years (no infants/toddlers available)
MIN_AGE = 5
MAX_AGE = 10
df_filtered = df[(df['AGE_AT_SCAN'] >= MIN_AGE) & (df['AGE_AT_SCAN'] <= MAX_AGE)].copy()

print(f"\n{'='*60}")
print(f"ABIDE-II fMRI Dataset - YOUNG CHILDREN (Age {MIN_AGE}-{MAX_AGE} years)")
print(f"{'='*60}")
print(f"Total subjects in ABIDE-II: {len(df)}")
print(f"Young children (5-10 years): {len(df_filtered)}")

if len(df_filtered) > 0:
    print(f"\nClass Distribution (Young Children):")
    print(f"  ASD: {int((df_filtered['label']==1).sum())}")
    print(f"  Control: {int((df_filtered['label']==0).sum())}")
    print(f"  Age range: {df_filtered['AGE_AT_SCAN'].min():.1f} - {df_filtered['AGE_AT_SCAN'].max():.1f} years")
    
    print(f"\nExample fMRI subjects from ABIDE-II (Young Children):")
    print(df_filtered[['SITE_ID', 'SUB_ID', 'FILE_ID', 'AGE_AT_SCAN', 'SEX', 'DX_GROUP', 'label']].head(8).to_string())
    
    # Use filtered dataset
    df = df_filtered
else:
    print(f"\n‚ö†Ô∏è  WARNING: No young children found in ABIDE-II in age range {MIN_AGE}-{MAX_AGE} years")
    print(f"Available age range in dataset: {df['AGE_AT_SCAN'].min():.1f} - {df['AGE_AT_SCAN'].max():.1f} years")
    print(f"Using all subjects instead (age filtering disabled)")

‚úì Loaded ABIDE-II phenotypic data: 1114 total subjects

ABIDE-II fMRI Dataset - YOUNG CHILDREN (Age 5-10 years)
Total subjects in ABIDE-II: 1114
Young children (5-10 years): 372

Class Distribution (Young Children):
  ASD: 168
  Control: 204
  Age range: 5.1 - 10.0 years

Example fMRI subjects from ABIDE-II (Young Children):
          SITE_ID  SUB_ID                FILE_ID  AGE_AT_SCAN  SEX  DX_GROUP  label
58  ABIDEII-EMC_1   29864  ABIDEII-EMC_1_0029864     9.013005    1         1      1
60  ABIDEII-EMC_1   29866  ABIDEII-EMC_1_0029866     8.720055    1         1      1
61  ABIDEII-EMC_1   29867  ABIDEII-EMC_1_0029867     8.517454    2         1      1
63  ABIDEII-EMC_1   29869  ABIDEII-EMC_1_0029869     8.793977    1         1      1
64  ABIDEII-EMC_1   29870  ABIDEII-EMC_1_0029870     8.618754    1         1      1
65  ABIDEII-EMC_1   29871  ABIDEII-EMC_1_0029871     7.455168    2         1      1
66  ABIDEII-EMC_1   29872  ABIDEII-EMC_1_0029872     9.147159    1         1      1

In [4]:
# ==============================
# STEP 3: fMRI PREPROCESSING FOR YOUNG CHILDREN
# ==============================
# Optimized for resting-state fMRI (rs-fMRI) data from ABIDE-II
# Processing: 4D -> 3D (temporal mean) -> Resample -> Normalize

def load_and_preprocess_fmri(file_path, target_shape=(64, 64, 64)):
    """
    Load and preprocess resting-state fMRI (rs-fMRI) scan
    Assumes input is DPARSF preprocessed (filt_noglobal strategy)
    
    Args:
        file_path: Path to func_preproc.nii.gz file (4D fMRI volume)
        target_shape: Target dimensions (x, y, z) after resampling
    
    Returns:
        Preprocessed 3D numpy array (single 3D volume) or None if failed
    """
    try:
        from scipy.ndimage import zoom
        
        # Load NIfTI file
        img = nib.load(file_path)
        data = img.get_fdata()
        
        # Verify 4D fMRI (spatial + temporal dimensions)
        if len(data.shape) != 4:
            print(f"  Warning: Expected 4D fMRI, got shape {data.shape}")
            return None
        
        # Take temporal mean to convert 4D -> 3D
        # Reduces temporal noise while preserving spatial structure
        data_3d = np.mean(data, axis=-1)
        
        # Resample to target shape for model input
        if data_3d.shape != target_shape:
            zoom_factors = [t/s for t, s in zip(target_shape, data_3d.shape)]
            data_resampled = zoom(data_3d, zoom_factors, order=1)
        else:
            data_resampled = data_3d
        
        # Normalize: Z-score normalization
        # (mean=0, std=1) for stable neural network training
        data_norm = (data_resampled - data_resampled.mean()) / (data_resampled.std() + 1e-8)
        
        return data_norm
    
    except Exception as e:
        return None


def load_fmri_dataset(df, data_dir, target_shape=(64, 64, 64), max_samples=None):
    """
    Load rs-fMRI dataset (func_preproc) from ABIDE-II
    
    Args:
        df: Phenotypic dataframe with FILE_ID and labels
        data_dir: Directory containing downloaded fMRI files
        target_shape: Resampling target
        max_samples: Maximum number of subjects to load
    
    Returns:
        X (fMRI scans), y (labels), file_ids (subject identifiers)
    """
    X, y, file_ids = [], [], []
    df_subset = df.head(max_samples) if max_samples else df
    
    for idx, row in df_subset.iterrows():
        file_id = row['FILE_ID']
        label = int(row['label'])
        age = row['AGE_AT_SCAN']
        
        # Try multiple file path patterns for func_preproc
        file_path = Path(data_dir) / file_id / f"{file_id}_func_preproc.nii.gz"
        if not file_path.exists():
            file_path = Path(data_dir) / f"{file_id}_func_preproc.nii.gz"
        
        if file_path.exists():
            data = load_and_preprocess_fmri(str(file_path), target_shape)
            if data is not None:
                X.append(data)
                y.append(label)
                file_ids.append(file_id)
    
    if len(X) > 0:
        # Add channel dimension for 3D CNN input: (batch, depth, height, width, channels)
        X = np.array(X)[..., np.newaxis]
        y = np.array(y)
    else:
        X = np.array([]); y = np.array([])
    
    return X, y, file_ids


print("‚úì fMRI preprocessing functions defined (optimized for young children ages 5-10)")

‚úì fMRI preprocessing functions defined (optimized for young children ages 5-10)


In [None]:
# ==============================
# STEP 4: CHECK & LOAD ABIDE-II fMRI DATA (YOUNG CHILDREN)
# ==============================
# Checks for locally available functional preprocessed fMRI (func_preproc) for subjects aged 5-10 years
# ‚ö†Ô∏è  FILES MUST BE MANUALLY DOWNLOADED FROM NITRC (see Cell 2 instructions)

TARGET_SHAPE = (64, 64, 64)  # Standardized 3D fMRI volume size
MAX_SAMPLES = 20  # Check first 20 subjects (or adjust as needed)
DATA_DIR = "./abide2_fmri_data"
PIPELINE = "dparsf"
STRATEGY = "filt_noglobal"

# Ensure phenotypic data is loaded and filtered for young children
if 'df' not in locals() or 'df_filtered' not in locals():
    print("ERROR: Phenotypic data not loaded. Run Cell 5 first.")
else:
    print("=" * 70)
    print("üì• DATA AVAILABILITY CHECK")
    print("=" * 70)
    print(f"‚ö†Ô∏è  ABIDE-II preprocessed data requires manual download from NITRC")
    print(f"   Instructions: See Cell 2 (Problem Statement section)\n")
    
    print(f"Checking for {PIPELINE}/{STRATEGY} functional fMRI files...")
    print(f"Expected location: {DATA_DIR}/[FILE_ID]/[FILE_ID]_func_preproc.nii.gz\n")
    print("-" * 70)
    
    # Check for existing local files
    existing_files = []
    missing_files = []
    
    for idx, (_, row) in enumerate(df_filtered.head(MAX_SAMPLES).iterrows()):
        file_id = row['FILE_ID']
        age = row['AGE_AT_SCAN']
        dx = "ASD" if row['label'] == 1 else "Control"
        
        # Check for file in expected location
        file_path = Path(DATA_DIR) / file_id / f"{file_id}_func_preproc.nii.gz"
        
        if file_path.exists():
            existing_files.append((file_id, age, dx, str(file_path)))
            print(f"[{idx+1:2d}] ‚úì {file_id} (Age: {age:.1f}, {dx})")
        else:
            missing_files.append((file_id, age, dx))
            print(f"[{idx+1:2d}] ‚úó {file_id} (Age: {age:.1f}, {dx}) - NOT FOUND")
    
    print("-" * 70)
    print(f"\nüìä SUMMARY:")
    print(f"   Found: {len(existing_files)} / {MAX_SAMPLES} requested files")
    print(f"   Missing: {len(missing_files)} / {MAX_SAMPLES} requested files")
    
    # Load existing fMRI files into memory
    if len(existing_files) > 0:
        print(f"\nüîÑ Loading {len(existing_files)} preprocessed fMRI scans into memory...")
        X, y, file_ids = load_fmri_dataset(df_filtered.head(MAX_SAMPLES), DATA_DIR, 
                                           target_shape=TARGET_SHAPE, max_samples=MAX_SAMPLES)
        
        if len(X) > 0:
            print(f"‚úÖ Dataset loaded successfully!")
            print(f"   Shape: X={X.shape}, y={y.shape}")
            print(f"   ASD samples: {int((y == 1).sum())}")
            print(f"   Control samples: {int((y == 0).sum())}")
            print(f"\n‚úì Ready for training! Proceed to Cell 8.")
        else:
            print(f"‚ùå No fMRI files could be loaded. Check file paths and NIfTI format.")
    else:
        print("\n‚ùå NO FILES FOUND")
        print("=" * 70)
        print("üì• REQUIRED ACTION:")
        print("=" * 70)
        print("1. Register at: https://fcon_1000.projects.nitrc.org/indi/abide/")
        print("2. Download DPARSF filt_noglobal preprocessed files for subjects listed above")
        print("3. Extract files to ./abide2_fmri_data/ following this structure:")
        print("   ./abide2_fmri_data/")
        print("   ‚îú‚îÄ‚îÄ SITE_ID_SUBID/")
        print("   ‚îÇ   ‚îî‚îÄ‚îÄ SITE_ID_SUBID_func_preproc.nii.gz")
        print("4. Re-run this cell to verify files are detected")
        print("=" * 70)

Loading dparsf/filt_noglobal functional fMRI data for 372 young children...
------------------------------------------------------------
Downloading ABIDEII-EMC_1_0029864... [1] ‚úó ABIDEII-EMC_1_0029864 (Age: 9.0, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029866... [2] ‚úó ABIDEII-EMC_1_0029866 (Age: 8.7, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029867... [3] ‚úó ABIDEII-EMC_1_0029867 (Age: 8.5, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029869... [4] ‚úó ABIDEII-EMC_1_0029869 (Age: 8.8, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029870... [5] ‚úó ABIDEII-EMC_1_0029870 (Age: 8.6, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029871... [6] ‚úó ABIDEII-EMC_1_0029871 (Age: 7.5, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029872... [7] ‚úó ABIDEII-EMC_1_0029872 (Age: 9.1, ASD) - File not found on AWS S3
Downloading ABIDEII-EMC_1_0029873... [8] ‚úó ABIDEII-EMC_1_0029873 (Age: 6.8, ASD) - File not

In [6]:
# ==============================
# STEP 5: VISUALIZE fMRI SAMPLES (YOUNG CHILDREN)
# ==============================
# Display 3D fMRI slices for representative ASD and Control subjects

def visualize_fmri_slice(data_3d, title, cmap='gray'):
    """
    Visualize 3D fMRI volume with 3 orthogonal slices
    
    Args:
        data_3d: 3D numpy array (preprocessed fMRI)
        title: Plot title
        cmap: Colormap
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    
    # Get middle slice indices
    d, h, w = data_3d.shape
    z_mid, y_mid, x_mid = d // 2, h // 2, w // 2
    
    # Axial slice (horizontal)
    axes[0].imshow(data_3d[z_mid, :, :], cmap=cmap)
    axes[0].set_title(f"Axial (z={z_mid})")
    axes[0].axis('off')
    
    # Coronal slice (front-to-back)
    axes[1].imshow(data_3d[:, y_mid, :], cmap=cmap)
    axes[1].set_title(f"Coronal (y={y_mid})")
    axes[1].axis('off')
    
    # Sagittal slice (left-right)
    axes[2].imshow(data_3d[:, :, x_mid], cmap=cmap)
    axes[2].set_title(f"Sagittal (x={x_mid})")
    axes[2].axis('off')
    
    fig.suptitle(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Check if fMRI data is loaded
if 'X' in locals() and len(X) > 0:
    print("Visualizing sample fMRI scans from young children dataset:")
    print("=" * 60)
    
    # Find first ASD and Control samples
    asd_idx = np.where(y == 1)[0]
    control_idx = np.where(y == 0)[0]
    
    if len(asd_idx) > 0:
        asd_sample = X[asd_idx[0], :, :, :, 0]  # Remove channel dimension
        visualize_fmri_slice(asd_sample, f"ASD Sample (ID: {file_ids[asd_idx[0]]})")
    else:
        print("‚ö† No ASD samples available for visualization")
    
    if len(control_idx) > 0:
        control_sample = X[control_idx[0], :, :, :, 0]  # Remove channel dimension
        visualize_fmri_slice(control_sample, f"Control Sample (ID: {file_ids[control_idx[0]]})")
    else:
        print("‚ö† No Control samples available for visualization")
else:
    print("‚Ñπ fMRI data not yet loaded. Run Cell 4 first to download and load data.")

‚Ñπ fMRI data not yet loaded. Run Cell 4 first to download and load data.


In [7]:
# ==============================
# STEP 6: TRAIN / TEST SPLIT
# ==============================

if len(X) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    print(f"Train - ASD: {np.sum(y_train)}, Control: {len(y_train)-np.sum(y_train)}")
    print(f"Test - ASD: {np.sum(y_test)}, Control: {len(y_test)-np.sum(y_test)}")
else:
    print("‚ùå No data available for train/test split")

NameError: name 'X' is not defined

In [None]:
# ==============================
# STEP 7: BUILD 3D CNN MODEL
# ==============================

if len(X) > 0 and 'X_train' in locals():
    def build_3d_cnn(input_shape):
        """Build 3D CNN for fMRI classification"""
        model = Sequential([
            Conv3D(32, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
            BatchNormalization(),
            Conv3D(32, kernel_size=3, activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling3D(pool_size=2),
            Dropout(0.25),
            
            Conv3D(64, kernel_size=3, activation='relu', padding='same'),
            BatchNormalization(),
            Conv3D(64, kernel_size=3, activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling3D(pool_size=2),
            Dropout(0.25),
            
            Conv3D(128, kernel_size=3, activation='relu', padding='same'),
            BatchNormalization(),
            MaxPooling3D(pool_size=2),
            Dropout(0.25),
            
            GlobalAveragePooling3D(),
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(128, activation='relu'),
            BatchNormalization(),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])
        return model
    
    model = build_3d_cnn(input_shape=X_train.shape[1:])
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', 
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    print("‚úì 3D CNN model built and compiled")
    model.summary()
else:
    print("‚ùå Cannot build model. Run cells 4-6 first.")

In [None]:
# ==============================
# STEP 8: SETUP CALLBACKS
# ==============================

if 'model' in locals():
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
        ModelCheckpoint('best_3d_cnn_asd.keras', monitor='val_accuracy', save_best_only=True, verbose=1),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1)
    ]
    print("‚úì Callbacks configured")
else:
    print("‚ùå Model not found. Run cell 7 first.")

In [None]:
# ==============================
# STEP 9: TRAIN MODEL
# ==============================

if 'model' in locals() and 'callbacks' in locals():
    EPOCHS = 100
    BATCH_SIZE = 4
    
    print(f"Training 3D CNN on {len(X_train)} subjects...")
    print(f"Epochs: {EPOCHS}, Batch size: {BATCH_SIZE}\n")
    
    history = model.fit(X_train, y_train, validation_split=0.2, epochs=EPOCHS, 
                        batch_size=BATCH_SIZE, callbacks=callbacks, verbose=1)
    print("\n‚úì Training complete!")
else:
    print("‚ùå Model or callbacks not found. Run cells 7-8 first.")

In [None]:
# ==============================
# STEP 10: PLOT TRAINING HISTORY
# ==============================

if 'history' in locals():
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    
    axes[0].plot(history.history['accuracy'], label='Train')
    axes[0].plot(history.history['val_accuracy'], label='Validation')
    axes[0].set_title('Accuracy')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Accuracy')
    axes[0].legend()
    axes[0].grid(True)
    
    axes[1].plot(history.history['loss'], label='Train')
    axes[1].plot(history.history['val_loss'], label='Validation')
    axes[1].set_title('Loss')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Loss')
    axes[1].legend()
    axes[1].grid(True)
    
    axes[2].plot(history.history['auc'], label='Train')
    axes[2].plot(history.history['val_auc'], label='Validation')
    axes[2].set_title('AUC')
    axes[2].set_xlabel('Epoch')
    axes[2].set_ylabel('AUC')
    axes[2].legend()
    axes[2].grid(True)
    
    plt.tight_layout()
    plt.show()
else:
    print("‚ùå Training history not found. Run cell 9 first.")

In [None]:
# ==============================
# STEP 11: EVALUATION
# ==============================

if 'model' in locals() and 'X_test' in locals():
    y_pred_prob = model.predict(X_test).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    acc = accuracy_score(y_test, y_pred) * 100
    roc = roc_auc_score(y_test, y_pred_prob)
    
    print("="*50)
    print("3D CNN fMRI RESULTS (ABIDE-II)")
    print("="*50)
    print(f"Accuracy: {acc:.2f}%")
    print(f"ROC-AUC: {roc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Control', 'ASD']))
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Control', 'ASD'], yticklabels=['Control', 'ASD'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
else:
    print("‚ùå Model or test set not found. Run cells 6-9 first.")

In [None]:
# ==============================
# STEP 12: SAVE MODEL
# ==============================

if 'model' in locals():
    model.save("final_3d_cnn_asd_abide2.keras")
    print("‚úì Model saved: final_3d_cnn_asd_abide2.keras")
    
    if 'history' in locals():
        pd.DataFrame(history.history).to_csv("training_history_abide2.csv", index=False)
        print("‚úì History saved: training_history_abide2.csv")
else:
    print("‚ùå Model not found")

## Next Steps:

1. **Download full ABIDE-II dataset** (500+ subjects)
2. **Data augmentation**: Rotation, flipping for better generalization
3. **Transfer learning**: Use pretrained 3D ResNet or Med3D
4. **Ensemble**: Combine multiple models
5. **Explainability**: Use GradCAM to visualize important brain regions

## Important Notes:

- **Memory**: 3D CNNs require significant GPU memory (8GB+ recommended)
- **Training time**: Expect hours on CPU, minutes on GPU
- **Data quality**: Preprocessing is critical for fMRI analysis
- **Class imbalance**: Consider weighted loss or oversampling

In [8]:
# ==============================
# HELPER: Generate download checklist + scaffold directories
# ==============================
# Produces `abideII_5-10_download_checklist.csv` and creates per-subject folders in ./abide2_fmri_data/

from pathlib import Path
import pandas as pd

if 'df_filtered' not in locals():
    print("ERROR: Phenotypic data not loaded. Run Cell 5 first.")
else:
    checklist = df_filtered[['FILE_ID','SITE_ID','AGE_AT_SCAN','label']].copy()
    checklist['Diagnosis'] = checklist['label'].map({1:'ASD', 0:'Control'})
    checklist = checklist[['FILE_ID','SITE_ID','AGE_AT_SCAN','Diagnosis']]

    out_csv = Path('abideII_5-10_download_checklist.csv')
    checklist.to_csv(out_csv, index=False)
    print(f"‚úì Checklist written: {out_csv.resolve()}")
    print(f"  Total subjects: {len(checklist)} | ASD: {int((df_filtered['label']==1).sum())} | Control: {int((df_filtered['label']==0).sum())}")

    base = Path('./abide2_fmri_data')
    base.mkdir(exist_ok=True)
    created = 0
    for fid in df_filtered['FILE_ID'].tolist():
        d = base / fid
        if not d.exists():
            d.mkdir(parents=True, exist_ok=True)
            created += 1
    print(f"‚úì Directory scaffold complete. Created {created} folders under {base}")
    print("Place files using pattern: ./abide2_fmri_data/[FILE_ID]/[FILE_ID]_func_preproc.nii.gz")

‚úì Checklist written: C:\Users\eredd\Desktop\FYP\abideII_5-10_download_checklist.csv
  Total subjects: 372 | ASD: 168 | Control: 204
‚úì Directory scaffold complete. Created 352 folders under abide2_fmri_data
Place files using pattern: ./abide2_fmri_data/[FILE_ID]/[FILE_ID]_func_preproc.nii.gz


In [11]:
# ==============================
# OPTION 2: Use ABIDE-I (AWS S3) for automatic downloads
# ==============================
# Loads ABIDE-I phenotypic data and downloads preprocessed functional fMRI
# Tries cpac then dparsf with filt_noglobal from fcp-indi S3 (public)

import urllib.request
from pathlib import Path
import pandas as pd
import numpy as np

ABIDE1_PHENO = "Phenotypic_V1_0b.csv"
ABIDE1_DATA_DIR = "./abide1_fmri_data"
PIPELINES = ["cpac", "dparsf"]  # S3 uses lowercase pipeline names
STRATEGY = "filt_noglobal"
ABIDE1_MAX_SAMPLES = 10
BASE_URL = "https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs"

# Downloader that tries multiple pipelines
def download_abide1_fmri_s3(file_id, output_dir=ABIDE1_DATA_DIR):
    out_dir = Path(output_dir) / file_id
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{file_id}_func_preproc.nii.gz"

    if out_file.exists():
        return str(out_file)

    for pipeline in PIPELINES:
        url = f"{BASE_URL}/{pipeline}/{STRATEGY}/func_preproc/{file_id}_func_preproc.nii.gz"
        try:
            urllib.request.urlretrieve(url, out_file)
            return str(out_file)
        except Exception:
            if out_file.exists():
                out_file.unlink()
            continue
    return None

# Load ABIDE-I phenotypic
try:
    df1 = pd.read_csv(ABIDE1_PHENO)
    df1.columns = df1.columns.str.strip()
    print(f"‚úì Loaded ABIDE-I phenotypic data: {df1.shape[0]} subjects")
except FileNotFoundError:
    print(f"‚ùå Missing {ABIDE1_PHENO}. Please place it in the workspace.")

# Labels and FILE_ID
if 'df1' in locals():
    df1['label'] = df1['DX_GROUP'].map({1:1, 2:0})
    df1 = df1.dropna(subset=['label'])
    df1['FILE_ID'] = df1['SITE_ID'].astype(str) + '_' + df1['SUB_ID'].astype(str).str.zfill(7)

    MIN_AGE, MAX_AGE = 5, 10
    df1f = df1[(df1['AGE_AT_SCAN'] >= MIN_AGE) & (df1['AGE_AT_SCAN'] <= MAX_AGE)].copy()
    print(f"Young children (ABIDE-I, {MIN_AGE}-{MAX_AGE}y): {len(df1f)}")
    if len(df1f) == 0:
        print("‚ö†Ô∏è No 5-10 y subjects in ABIDE-I. Using first N subjects for testing downloads.")
        df1f = df1.head(ABIDE1_MAX_SAMPLES).copy()

    print(f"\nAttempting downloads from S3: pipelines={PIPELINES}, strategy={STRATEGY}")
    ok, fail = 0, 0
    for i, (_, row) in enumerate(df1f.head(ABIDE1_MAX_SAMPLES).iterrows(), start=1):
        fid = row['FILE_ID']
        age = row['AGE_AT_SCAN']
        dx = 'ASD' if row['label']==1 else 'Control'
        path = download_abide1_fmri_s3(fid)
        if path:
            ok += 1
            print(f"[{i:02d}] ‚úì {fid} (Age {age:.1f}, {dx})")
        else:
            fail += 1
            print(f"[{i:02d}] ‚úó {fid} (Age {age:.1f}, {dx}) - not found on S3 (cpac/dparsf)")

    print(f"\nSummary: {ok} succeeded, {fail} failed")

    # Load successfully downloaded files (if any)
    if ok > 0:
        X1, y1, fids1 = load_fmri_dataset(df1f.head(ABIDE1_MAX_SAMPLES), ABIDE1_DATA_DIR, target_shape=(64,64,64), max_samples=ABIDE1_MAX_SAMPLES)
        if len(X1) > 0:
            print(f"\n‚úì ABIDE-I dataset loaded: X={X1.shape}, y={y1.shape}")
            print(f"   ASD: {int((y1==1).sum())} | Control: {int((y1==0).sum())}")
        else:
            print("‚ùå Downloaded files could not be loaded. Check file paths.")

‚úì Loaded ABIDE-I phenotypic data: 1112 subjects
Young children (ABIDE-I, 5-10y): 150

Attempting downloads from S3: pipelines=['cpac', 'dparsf'], strategy=filt_noglobal
[01] ‚úì KKI_0050776 (Age 9.3, Control)
[02] ‚úì KKI_0050777 (Age 8.4, Control)
[03] ‚úì KKI_0050778 (Age 9.7, Control)
[04] ‚úì KKI_0050779 (Age 9.4, Control)
[05] ‚úì KKI_0050780 (Age 9.8, Control)
[06] ‚úì KKI_0050781 (Age 9.3, Control)
[07] ‚úì KKI_0050784 (Age 8.1, Control)
[08] ‚úì KKI_0050786 (Age 8.8, Control)
[09] ‚úì KKI_0050789 (Age 9.3, Control)
[10] ‚úì KKI_0050790 (Age 8.8, Control)

Summary: 10 succeeded, 0 failed

‚úì ABIDE-I dataset loaded: X=(10, 64, 64, 64, 1), y=(10,)
   ASD: 0 | Control: 10


In [1]:
# ==============================
# REMOTE ACCESS VERIFIER: ABIDE-I (S3) and ABIDE-II (NITRC/S3)
# ==============================
# Checks URL accessibility without downloading. Fast sampling mode.
# - ABIDE-I: Public S3 (pipelines: cpac, dparsf)
# - ABIDE-II: Expected not on S3 (NITRC requires login)

import urllib.request, urllib.error
import pandas as pd
from pathlib import Path

ABIDE1_PHENO = "Phenotypic_V1_0b.csv"
ABIDE2_PHENO = "ABIDEII_Composite_Phenotypic.csv"
S3_BASE = "https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs"
PIPELINES = ["cpac", "dparsf"]
STRATEGIES = ["filt_noglobal"]
DERIVATIVE = "func_preproc"
SAMPLE_SIZE = 30  # Check only first 30 subjects per dataset for speed

REPORT_CSV = Path("remote_access_report.csv")

def url_head_exists(url: str, timeout: int = 5):
    try:
        req = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return True, resp.status
    except urllib.error.HTTPError as e:
        return False, e.code
    except Exception:
        return False, None

# Load phenotypes
df1, df2 = None, None
try:
    df1 = pd.read_csv(ABIDE1_PHENO)
    df1.columns = df1.columns.str.strip()
    df1['label'] = df1['DX_GROUP'].map({1:1, 2:0})
    df1 = df1.dropna(subset=['label'])
    df1['FILE_ID'] = df1['SITE_ID'].astype(str) + '_' + df1['SUB_ID'].astype(str).str.zfill(7)
    print(f"‚úì ABIDE-I loaded: {len(df1)} subjects (sampling {SAMPLE_SIZE})")
except Exception as e:
    print(f"‚ùå Could not load {ABIDE1_PHENO}: {e}")

try:
    df2 = pd.read_csv(ABIDE2_PHENO, encoding='latin1')
    df2.columns = df2.columns.str.strip()
    df2['label'] = df2['DX_GROUP'].map({1:1, 2:0})
    df2 = df2.dropna(subset=['label'])
    df2['FILE_ID'] = df2['SITE_ID'].astype(str) + '_' + df2['SUB_ID'].astype(str).str.zfill(7)
    print(f"‚úì ABIDE-II loaded: {len(df2)} subjects (sampling {SAMPLE_SIZE})")
except Exception as e:
    print(f"‚ùå Could not load {ABIDE2_PHENO}: {e}")

rows = []

# Verify ABIDE-I (public S3) - SAMPLE ONLY
if df1 is not None:
    print(f"\nVerifying ABIDE-I S3 access ({SAMPLE_SIZE} samples)...")
    for i, (_, r) in enumerate(df1.head(SAMPLE_SIZE).iterrows(), start=1):
        fid = r['FILE_ID']
        age = r.get('AGE_AT_SCAN', None)
        diagnosis = 'ASD' if int(r['label']) == 1 else 'Control'
        status = 'not_found'
        chosen_pipeline = None
        url_used = None
        
        for pipe in PIPELINES:
            for strat in STRATEGIES:
                url = f"{S3_BASE}/{pipe}/{strat}/{DERIVATIVE}/{fid}_{DERIVATIVE}.nii.gz"
                ok, code = url_head_exists(url)
                if ok:
                    status = 'accessible'
                    chosen_pipeline = pipe
                    url_used = url
                    break
            if status == 'accessible':
                break
        
        rows.append({
            'source': 'ABIDE-I',
            'FILE_ID': fid,
            'age': age,
            'diagnosis': diagnosis,
            'status': status,
            'pipeline': chosen_pipeline,
            'strategy': strat if chosen_pipeline else None,
            'url': url_used
        })
        
        if i % 10 == 0:
            print(f"  [{i}/{SAMPLE_SIZE}] {fid}: {status}")
    
    accessible = sum(1 for r in rows if r['source']=='ABIDE-I' and r['status']=='accessible')
    print(f"‚úì ABIDE-I: {accessible}/{SAMPLE_SIZE} accessible via S3")

# Verify ABIDE-II (expected not on S3) - SAMPLE ONLY
if df2 is not None:
    print(f"\nVerifying ABIDE-II S3 check ({SAMPLE_SIZE} samples)...")
    for i, (_, r) in enumerate(df2.head(SAMPLE_SIZE).iterrows(), start=1):
        fid = r['FILE_ID']
        age = r.get('AGE_AT_SCAN', None)
        diagnosis = 'ASD' if int(r['label']) == 1 else 'Control'
        fid_s3 = fid.replace('ABIDEII-', '') if fid.startswith('ABIDEII-') else fid
        url = f"{S3_BASE}/dparsf/filt_noglobal/{DERIVATIVE}/{fid_s3}_{DERIVATIVE}.nii.gz"
        ok, code = url_head_exists(url)
        status = 'accessible' if ok else ('forbidden' if code == 403 else 'not_found')
        
        rows.append({
            'source': 'ABIDE-II',
            'FILE_ID': fid,
            'age': age,
            'diagnosis': diagnosis,
            'status': status,
            'pipeline': 'dparsf',
            'strategy': 'filt_noglobal',
            'url': url
        })
        
        if i % 10 == 0:
            print(f"  [{i}/{SAMPLE_SIZE}] {fid}: {status}")
    
    accessible = sum(1 for r in rows if r['source']=='ABIDE-II' and r['status']=='accessible')
    print(f"‚úì ABIDE-II: {accessible}/{SAMPLE_SIZE} accessible via S3 (expected 0)")

report = pd.DataFrame(rows)
report.to_csv(REPORT_CSV, index=False)

print(f"\nüìÑ Report saved: {REPORT_CSV.resolve()}")
print(f"\nSummary:")
print(report.groupby(['source','status']).size().to_frame('count'))
print(f"\nFirst 10 entries:")
print(report.head(10)[['source','FILE_ID','diagnosis','status','pipeline']].to_string(index=False))

‚úì ABIDE-I loaded: 1112 subjects (sampling 30)
‚úì ABIDE-II loaded: 1114 subjects (sampling 30)

Verifying ABIDE-I S3 access (30 samples)...
  [10/30] CALTECH_0051465: not_found
  [20/30] CALTECH_0051475: not_found
  [30/30] CALTECH_0051485: not_found
‚úì ABIDE-I: 0/30 accessible via S3

Verifying ABIDE-II S3 check (30 samples)...
  [10/30] ABIDEII-BNI_1_0029025: not_found
  [20/30] ABIDEII-BNI_1_0029042: not_found
  [30/30] ABIDEII-BNI_1_0029011: not_found
‚úì ABIDE-II: 0/30 accessible via S3 (expected 0)

üìÑ Report saved: C:\Users\eredd\Desktop\FYP\remote_access_report.csv

Summary:
                    count
source   status          
ABIDE-I  not_found     30
ABIDE-II not_found     30

First 10 entries:
 source         FILE_ID diagnosis    status pipeline
ABIDE-I CALTECH_0051456       ASD not_found     None
ABIDE-I CALTECH_0051457       ASD not_found     None
ABIDE-I CALTECH_0051458       ASD not_found     None
ABIDE-I CALTECH_0051459       ASD not_found     None
ABIDE-I CALTECH_0