# Data Exploration - NIH ChestX-ray14 Dataset

This notebook explores the NIH ChestX-ray14 dataset to understand:
- Dataset structure and size
- Class distribution and imbalance
- Image characteristics and quality
- Data preprocessing requirements

**Authors:** Sneh Gupta and Arpit Bhardwaj  
**Course:** CSET211 - Statistical Machine Learning

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import warnings
from pathlib import Path
from collections import Counter

# Add src to path
# Appends the relative directory '../src' to the module search path so that
# modules located there can be imported from this notebook.
sys.path.append('../src')

# Set style
# Apply matplotlib's 'seaborn-v0_8' style sheet to the figures drawn below.
plt.style.use('seaborn-v0_8')
# Silence all warnings from here on (presumably to keep the notebook output
# uncluttered); note that this also hides deprecation warnings.
warnings.filterwarnings('ignore')

# Dataset locations. All paths are relative, so they resolve against the
# current working directory.
DATA_DIR = '../data/raw'
IMAGES_DIR, LABELS_FILE = (
    os.path.join(DATA_DIR, leaf)
    for leaf in ('images', 'Data_Entry_2017_v2020.csv')
)

# Report where the data is expected and whether each piece is present,
# one line per item.
print(
    f"Data directory: {DATA_DIR}",
    f"Images directory exists: {os.path.exists(IMAGES_DIR)}",
    f"Labels file exists: {os.path.exists(LABELS_FILE)}",
    sep="\n",
)

## 1. Dataset Overview

In [None]:
# Load the labels CSV file and print a quick overview of its contents.
# `df` is only created when the labels file exists.
if os.path.exists(LABELS_FILE):
    df = pd.read_csv(LABELS_FILE)
    print(f"Dataset shape: {df.shape}")
    print(f"Number of unique patients: {df['Patient ID'].nunique()}")
    print(f"Number of unique images: {df['Image Index'].nunique()}")

    # Display basic info.
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    print("\nDataset Info:")
    df.info()

    # Display first few rows.
    # A bare `df.head()` nested inside an `if` block is not the cell's final
    # top-level expression, so Jupyter would not render it; print explicitly.
    print("\nFirst 5 rows:")
    print(df.head())
else:
    print("Labels file not found. Please download the NIH ChestX-ray14 dataset.")

In [None]:
# Analyze columns: for every column print its dtype, number of distinct
# values and number of missing values. For 'Finding Labels' additionally
# report the distinct individual labels.
if 'df' in locals():
    print("Column Analysis:")
    print("=================")

    for col in df.columns:
        column = df[col]
        print(f"\n{col}:")
        print(f"  - Type: {column.dtype}")
        print(f"  - Unique values: {column.nunique()}")
        print(f"  - Missing values: {column.isnull().sum()}")

        if col == 'Finding Labels':
            # An entry may hold several labels joined by '|'; split each
            # entry to collect the individual labels. Missing entries are
            # skipped.
            unique_findings = {
                label
                for finding in column.dropna()
                for label in finding.split('|')
            }
            print(f"  - Unique findings: {len(unique_findings)}")
            # Sort before sampling: iteration order of a set of strings can
            # differ between interpreter runs, which would make the printed
            # sample irreproducible.
            print(f"  - Sample findings: {sorted(unique_findings)[:5]}")

## 2. Class Distribution Analysis

In [None]:
# Analyze finding labels distribution: count how often each individual label
# occurs, print the most common ones, and plot (top) the pathology counts and
# (bottom) the counts of the cancer-related labels.
if 'df' in locals():
    # Count all findings. An entry may carry several labels joined by '|';
    # missing entries are skipped.
    finding_counts = Counter(
        label
        for finding in df['Finding Labels'].dropna()
        for label in finding.split('|')
    )

    # Create DataFrame for visualization, most common label first. Columns
    # are named explicitly so the frame has them even when no label exists.
    findings_df = pd.DataFrame(
        finding_counts.most_common(), columns=['Finding', 'Count']
    )

    print(f"Total findings: {len(finding_counts)}")
    print("\nMost common findings:")
    print(findings_df.head(10))

    # The dataset marks images without any pathology as 'No Finding'. That
    # label is not a pathology, so it is excluded from the pathology chart;
    # otherwise it would take one of the 14 slots and push out the rarest
    # real pathology.
    pathologies_df = findings_df[findings_df['Finding'] != 'No Finding']

    # Plot distribution
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 1, 1)
    if not pathologies_df.empty:
        sns.barplot(data=pathologies_df.head(14), x='Count', y='Finding')
    plt.title('Distribution of All 14 Pathologies in NIH ChestX-ray14')
    plt.xlabel('Number of Images')

    # Focus on cancer-related findings; labels absent from the data are
    # left out rather than shown as zero.
    cancer_related = ['Mass', 'Nodule']
    cancer_counts = {
        finding: finding_counts[finding]
        for finding in cancer_related
        if finding in finding_counts
    }

    plt.subplot(2, 1, 2)
    if cancer_counts:
        plt.bar(
            list(cancer_counts.keys()),
            list(cancer_counts.values()),
            color=['red', 'orange'],
        )
        plt.title('Cancer-Related Findings (Mass, Nodule)')
        plt.ylabel('Number of Images')

        # Add value labels slightly above each bar
        for finding, count in cancer_counts.items():
            plt.text(finding, count + 50, str(count), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

In [None]:
# Create binary classification labels (Cancer vs No Cancer)

# Finding labels treated as cancer-related for the binary target.
cancer_labels = ['Mass', 'Nodule']


def has_cancer(finding_labels):
    """Return 1 if any cancer-related label is present, otherwise 0.

    Args:
        finding_labels: A string of finding labels separated by '|', or a
            missing value (NaN / None).

    Returns:
        1 when at least one entry of ``cancer_labels`` occurs as a whole
        label in ``finding_labels``; 0 otherwise, including for missing
        values.
    """
    if pd.isna(finding_labels):
        return 0
    # Compare whole labels rather than substrings of the raw string, so a
    # label that merely contains e.g. "Mass" is not counted as a match.
    present = {label.strip() for label in finding_labels.split('|')}
    return int(not present.isdisjoint(cancer_labels))


if 'df' in locals():
    # Create binary labels
    df['Cancer'] = df['Finding Labels'].apply(has_cancer)

    # Show class distribution
    print("Binary Classification Distribution:")
    print(df['Cancer'].value_counts())
    print("\nPercentage distribution:")
    print(df['Cancer'].value_counts(normalize=True) * 100)

    # value_counts() orders by frequency and omits absent classes. Pin the
    # order to [0, 1] (filling absent classes with 0) so the fixed labels and
    # colours below always line up with the right class.
    class_counts = df['Cancer'].value_counts().reindex([0, 1], fill_value=0)

    # Visualize
    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    class_counts.plot(kind='bar', color=['lightblue', 'salmon'])
    plt.title('Binary Cancer Classification')
    plt.xlabel('Class (0=No Cancer, 1=Cancer)')
    plt.ylabel('Number of Images')
    plt.xticks(rotation=0)

    plt.subplot(1, 2, 2)
    # A pie chart needs a non-zero total; skip drawing it for empty data.
    if class_counts.sum() > 0:
        plt.pie(class_counts.values,
                labels=['No Cancer', 'Cancer'],
                autopct='%1.1f%%',
                colors=['lightblue', 'salmon'])
    plt.title('Class Distribution Percentage')

    plt.tight_layout()
    plt.show()

    # Calculate class imbalance ratio
    cancer_count = int(class_counts.loc[1])
    normal_count = int(class_counts.loc[0])
    if cancer_count:
        imbalance_ratio = normal_count / cancer_count
        print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1 (Normal:Cancer)")
    else:
        # No positive samples: the ratio is unbounded, so report that
        # explicitly instead of dividing by zero.
        imbalance_ratio = float('inf')
        print("\nClass imbalance ratio: undefined (no Cancer samples)")