# OASIS Cross-Sectional MRI Dataset to DataFrame Converter

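This notebook parses the per-session `.txt` and `.xml` files of the OASIS cross-sectional dataset into three pandas DataFrames (subjects, scans, and assessors), converts their numeric columns, and saves each table as a CSV file.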

In [None]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import re

## 1. Define Helper Functions

In [None]:
def parse_txt_file(txt_path):
    """Parse a session ``.txt`` file into a dict of raw metadata strings.

    Each known ``LABEL: value`` field is located with a regular expression.
    A field that is absent from the file, or present with an empty value,
    is omitted from the result, so callers should use ``dict.get`` (or rely
    on pandas filling the gap with NaN).

    Args:
        txt_path: Path of the text file to read.

    Returns:
        dict mapping field name (``'SESSION_ID'``, ``'AGE'``, ``'M/F'``,
        ``'HAND'``, ``'EDUC'``, ``'SES'``, ``'CDR'``, ``'MMSE'``, ``'eTIV'``,
        ``'ASF'``, ``'nWBV'``) to the matched text. Values are always
        strings; numeric conversion is left to the caller.
    """
    with open(txt_path, 'r') as f:
        content = f.read()

    # The separator between label and value is limited to spaces and tabs.
    # ``\s`` also matches a newline, which let an empty field run on into
    # the following line and capture that line's label as its value
    # (e.g. an empty "M/F:" produced 'HAND').
    patterns = {
        'SESSION_ID': r'SESSION ID:[ \t]+(.+)',
        'AGE': r'AGE:[ \t]+(\d+)',
        'M/F': r'M/F:[ \t]+(\w+)',
        'HAND': r'HAND:[ \t]+(\w+)',
        'EDUC': r'EDUC:[ \t]+(\d+)',
        'SES': r'SES:[ \t]+(\d+)',
        'CDR': r'CDR:[ \t]+([\d.]+)',
        'MMSE': r'MMSE:[ \t]+(\d+)',
        'eTIV': r'eTIV:[ \t]+([\d.]+)',
        'ASF': r'ASF:[ \t]+([\d.]+)',
        'nWBV': r'nWBV:[ \t]+([\d.]+)',
    }

    data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, content)
        if match:
            # ``(.+)`` runs to the end of the line, so drop trailing padding.
            data[key] = match.group(1).strip()

    return data

In [None]:
def _child_text(parent, tag, ns):
    """Return the text of the first child of *parent* matching *tag*, or None if there is none."""
    node = parent.find(tag, ns)
    return node.text if node is not None else None


def parse_xml_file(xml_path):
    """Parse a session ``.xml`` file into scan and assessor records.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        dict with the keys:

        * ``'session_id'`` - the root element's ``ID`` attribute.
        * ``'subject_id'`` - text of the ``xnat:subject_ID`` child, or None.
        * ``'scans'`` - one dict per ``xnat:scan`` with ``scan_id``, ``type``
          and ``quality``; when the scan has an ``xnat:parameters`` element
          also ``orientation``, ``tr``, ``te``, ``ti``, ``flip`` (None when
          the child is missing) and, if ``xnat:voxelRes`` is present,
          ``voxel_x`` / ``voxel_y`` / ``voxel_z``.
        * ``'reconstructions'`` - always an empty list; this function does
          not populate it.
        * ``'assessors'`` - one dict per ``xnat:assessor`` with
          ``assessor_id`` and ``type``, plus ``scaling_factor`` / ``eICV``
          or ``brain_percent`` depending on the assessor type.

        All extracted values are raw strings.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Prefix -> URI map used to resolve the "xnat:" / "oasis:" tag prefixes.
    ns = {'xnat': 'http://nrg.wustl.edu/xnat',
          'oasis': 'http://nmr.mgh.harvard.edu/oasis'}
    # ElementTree exposes the xsi:type attribute under its expanded name.
    xsi_type_attr = '{http://www.w3.org/2001/XMLSchema-instance}type'

    root = ET.parse(xml_path).getroot()

    data = {
        'session_id': root.get('ID'),
        'subject_id': _child_text(root, 'xnat:subject_ID', ns),
        'scans': [],
        'reconstructions': [],
        'assessors': []
    }

    # Parse scans
    scans = root.find('xnat:scans', ns)
    if scans is not None:
        for scan in scans.findall('xnat:scan', ns):
            scan_data = {
                'scan_id': scan.get('ID'),
                'type': scan.get('type'),
                'quality': _child_text(scan, 'xnat:quality', ns)
            }

            # Acquisition parameters are only recorded when the scan has them.
            params = scan.find('xnat:parameters', ns)
            if params is not None:
                voxel_res = params.find('xnat:voxelRes', ns)
                if voxel_res is not None:
                    scan_data['voxel_x'] = voxel_res.get('x')
                    scan_data['voxel_y'] = voxel_res.get('y')
                    scan_data['voxel_z'] = voxel_res.get('z')

                for field in ('orientation', 'tr', 'te', 'ti', 'flip'):
                    scan_data[field] = _child_text(params, f'xnat:{field}', ns)

            data['scans'].append(scan_data)

    # Parse assessors
    assessors = root.find('xnat:assessors', ns)
    if assessors is not None:
        for assessor in assessors.findall('xnat:assessor', ns):
            assessor_type = assessor.get(xsi_type_attr)
            assessor_data = {
                'assessor_id': assessor.get('ID'),
                'type': assessor_type
            }

            # An assessor without xsi:type yields None; substitute '' for the
            # substring tests so they do not raise TypeError.
            type_name = assessor_type or ''

            # Atlas Scaling Factor
            if 'atlasScalingFactorData' in type_name:
                scaling_factor = assessor.find('oasis:scalingFactor', ns)
                eicv = assessor.find('oasis:eICV', ns)
                if scaling_factor is not None:
                    assessor_data['scaling_factor'] = scaling_factor.text
                if eicv is not None:
                    assessor_data['eICV'] = eicv.text

            # Segmentation data
            if 'segmentationFastData' in type_name:
                brain_percent = assessor.get('brainPercent')
                if brain_percent:
                    assessor_data['brain_percent'] = brain_percent

            data['assessors'].append(assessor_data)

    return data

## 2. Process All Data Files

In [None]:
def process_oasis_dataset(data_dir):
    """Walk the dataset tree and build the subjects, scans and assessors tables.

    The expected layout is ``data_dir/<name containing "disc">/disc*/OAS1_*/``
    where each ``OAS1_*`` directory holds ``<dirname>.txt`` and
    ``<dirname>.xml``. A subject directory missing either file is reported
    and skipped.

    Args:
        data_dir: Root directory of the dataset (``str`` or ``Path``).

    Returns:
        Tuple ``(df_subjects, df_scans, df_assessors)`` of pandas DataFrames:

        * ``df_subjects`` - one row per subject directory: the fields from
          ``parse_txt_file`` plus a ``disc`` column with the name of the
          top-level disc directory.
        * ``df_scans`` - one row per scan from ``parse_xml_file``, with a
          ``subject_id`` column set to the subject directory name.
        * ``df_assessors`` - one row per assessor, keyed the same way.

        All three are empty when no subject is found.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    subjects_data = []
    scans_data = []
    assessors_data = []

    data_path = Path(data_dir)

    # iterdir() yields entries in arbitrary order; every listing is sorted so
    # that row order (and therefore any saved CSV) is reproducible.
    disc_dirs = sorted(
        d for d in data_path.iterdir()
        if d.is_dir() and 'disc' in d.name.lower()
    )

    for disc_dir in disc_dirs:
        print(f"Processing {disc_dir.name}...")

        # Navigate to the actual disc folder (e.g., disc1, disc2)
        disc_folders = sorted(
            d for d in disc_dir.iterdir()
            if d.is_dir() and d.name.startswith('disc')
        )

        for disc_folder in disc_folders:
            # Find all subject directories (OAS1_XXXX_MR1)
            subject_dirs = sorted(
                d for d in disc_folder.iterdir()
                if d.is_dir() and d.name.startswith('OAS1_')
            )

            for subject_dir in subject_dirs:
                subject_id = subject_dir.name
                txt_file = subject_dir / f"{subject_id}.txt"
                xml_file = subject_dir / f"{subject_id}.xml"

                # Both files are required; say so instead of dropping the
                # subject without a trace.
                if not (txt_file.exists() and xml_file.exists()):
                    print(f"  Skipping {subject_id}: missing .txt or .xml file")
                    continue

                txt_data = parse_txt_file(txt_file)
                xml_data = parse_xml_file(xml_file)

                # Subject row: text-file metadata tagged with its disc.
                subjects_data.append({**txt_data, 'disc': disc_dir.name})

                # Scan and assessor rows carry the directory name as the key.
                scans_data.extend(
                    {'subject_id': subject_id, **scan}
                    for scan in xml_data['scans']
                )
                assessors_data.extend(
                    {'subject_id': subject_id, **assessor}
                    for assessor in xml_data['assessors']
                )

    # Create DataFrames
    df_subjects = pd.DataFrame(subjects_data)
    df_scans = pd.DataFrame(scans_data)
    df_assessors = pd.DataFrame(assessors_data)

    return df_subjects, df_scans, df_assessors

## 3. Run Processing

In [None]:
# Root folder containing the downloaded disc directories
data_dir = './data'

# Build all three tables in a single pass over the dataset
df_subjects, df_scans, df_assessors = process_oasis_dataset(data_dir)

# Report how many rows each table ended up with
print(
    f"\nProcessing complete!",
    f"Subjects: {len(df_subjects)} records",
    f"Scans: {len(df_scans)} records",
    f"Assessors: {len(df_assessors)} records",
    sep="\n",
)

## 4. Explore the DataFrames

In [None]:
# Preview of the subjects table: first rows, dimensions, column names
print(
    "\n=== SUBJECTS DATAFRAME ===",
    df_subjects.head(),
    f"\nShape: {df_subjects.shape}",
    f"\nColumns: {df_subjects.columns.tolist()}",
    sep="\n",
)

In [None]:
# Preview of the scans table: first rows, dimensions, column names
print(
    "\n=== SCANS DATAFRAME ===",
    df_scans.head(),
    f"\nShape: {df_scans.shape}",
    f"\nColumns: {df_scans.columns.tolist()}",
    sep="\n",
)

In [None]:
# Preview of the assessors table: first rows, dimensions, column names
print(
    "\n=== ASSESSORS DATAFRAME ===",
    df_assessors.head(),
    f"\nShape: {df_assessors.shape}",
    f"\nColumns: {df_assessors.columns.tolist()}",
    sep="\n",
)

## 5. Data Type Conversion and Cleaning

In [None]:
# Columns that hold numbers but were extracted as text, listed per table
numeric_cols = ['AGE', 'EDUC', 'SES', 'CDR', 'MMSE', 'eTIV', 'ASF', 'nWBV']
scan_numeric_cols = ['voxel_x', 'voxel_y', 'voxel_z', 'tr', 'te', 'ti', 'flip']
assessor_numeric_cols = ['scaling_factor', 'eICV', 'brain_percent']

# Apply the same conversion to every table; values that cannot be parsed
# become NaN (errors='coerce'), and columns a table lacks are skipped.
for frame, wanted in (
    (df_subjects, numeric_cols),
    (df_scans, scan_numeric_cols),
    (df_assessors, assessor_numeric_cols),
):
    for col in wanted:
        if col not in frame.columns:
            continue
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

print("Data types converted successfully!")

## 6. Basic Statistics

In [None]:
# Summary of who is in the subjects table
print("\n=== SUBJECT DEMOGRAPHICS ===")

# Each header is printed before its figures are computed
print(f"\nAge statistics:")
age_summary = df_subjects['AGE'].describe()
print(age_summary)

print(f"\nGender distribution:")
sex_counts = df_subjects['M/F'].value_counts()
print(sex_counts)

# Counts are ordered by CDR value rather than by frequency
print(f"\nCDR (Clinical Dementia Rating) distribution:")
cdr_counts = df_subjects['CDR'].value_counts()
print(cdr_counts.sort_index())

## 7. Save DataFrames to CSV

In [None]:
# Write each table to its own CSV in the working directory; the row index
# is left out of the files.
df_subjects.to_csv('oasis_subjects.csv', index=False)
df_scans.to_csv('oasis_scans.csv', index=False)
df_assessors.to_csv('oasis_assessors.csv', index=False)

# List the files that were just written
print(
    "\nDataFrames saved to CSV files:",
    "- oasis_subjects.csv",
    "- oasis_scans.csv",
    "- oasis_assessors.csv",
    sep="\n",
)