In [None]:
# Standard libraries
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Custom modules
from config import Config
from src.data.data_loader import BrainTumorDataLoader
from utils.visualization import DataVisualization
from utils.helpers import set_random_seeds, setup_logging, validate_dataset_structure

# Fix the random seed from the project configuration so reruns are repeatable.
set_random_seeds(Config.RANDOM_SEED)

# Keep a handle on whatever the project's logging setup returns.
logger = setup_logging()

# Notebook banner: title line followed by a 60-character rule.
print("🧠 Brain Tumor Detection - Phase 1: Data Exploration", "=" * 60, sep="\n")

: 

In [None]:
# Helper that is only needed by this cell.
from utils.helpers import get_system_info

# Gather the environment details first, then list them one per line.
system_info = get_system_info()

print("📊 System Information:")
for setting, detail in system_info.items():
    print(f"   {setting}: {detail}")

# Abort the notebook early when validate_dataset_structure reports a falsy result
# for the configured dataset path.
print("\n📁 Dataset Validation:")
dataset_valid = validate_dataset_structure(Config.DATASET_PATH)

if not dataset_valid:
    raise Exception("Dataset structure validation failed!")

In [None]:
# Build the loader and ask it for the per-split, per-class overview.
data_loader = BrainTumorDataLoader()

print("📈 Dataset Overview Analysis...")
overview = data_loader.get_dataset_overview()

print("\n🎯 DATASET STATISTICS:")
print("=" * 40)

# Both splits are reported the same way: one line per class, then the split total.
split_totals = {}
for split_key, heading in (("training", "\n📚 TRAINING SET:"), ("testing", "\n🧪 TESTING SET:")):
    print(heading)
    running_total = 0
    for label in Config.CLASS_NAMES:
        n_images = overview[split_key][label]['count']
        running_total += n_images
        print(f"   {label:12}: {n_images:4d} images")
    print(f"   {'TOTAL':12}: {running_total:4d} images")
    split_totals[split_key] = running_total

# Later cells read these two names directly.
total_train = split_totals["training"]
total_test = split_totals["testing"]

print(f"\n📊 OVERALL DATASET SIZE: {total_train + total_test} images")

In [None]:
# Initialize visualization
viz = DataVisualization()

print("📊 Class Distribution Analysis...")

# Draw the per-class count chart and save it alongside the other figures.
viz.plot_class_distribution(overview, save_path=Config.FIGURES_PATH / "class_distribution.png")


def _imbalance_ratio(counts):
    """Return the largest class count divided by the smallest.

    Args:
        counts: Sequence of per-class image counts for one split.

    Returns:
        ``max(counts) / min(counts)`` as a float. When that ratio is undefined
        (no classes at all, or at least one class with zero images) the result
        is ``float('inf')`` rather than an exception, so the ``> 2.0``
        imbalance checks still flag the split.
    """
    if not counts or min(counts) <= 0:
        return float("inf")
    return max(counts) / min(counts)


# Per-class image counts, ordered like Config.CLASS_NAMES.
train_counts = [overview['training'][class_name]['count'] for class_name in Config.CLASS_NAMES]
test_counts = [overview['testing'][class_name]['count'] for class_name in Config.CLASS_NAMES]

# Class imbalance analysis (safe against empty classes, see _imbalance_ratio).
train_imbalance_ratio = _imbalance_ratio(train_counts)
test_imbalance_ratio = _imbalance_ratio(test_counts)

print(f"\n⚖️  CLASS BALANCE ANALYSIS:")
print(f"   Training Imbalance Ratio: {train_imbalance_ratio:.2f}")
print(f"   Testing Imbalance Ratio:  {test_imbalance_ratio:.2f}")

# An infinite ratio alone does not say which class is missing, so name them.
for split_name, split_counts in (("training", train_counts), ("testing", test_counts)):
    empty_classes = [str(name) for name, n in zip(Config.CLASS_NAMES, split_counts) if n == 0]
    if empty_classes:
        print(f"   ⚠️  No {split_name} images found for: {', '.join(empty_classes)}")

if train_imbalance_ratio > 2.0:
    print("   ⚠️  WARNING: Significant class imbalance detected!")
    print("   💡 Consider: Class weighting, SMOTE, or stratified sampling")
else:
    print("   ✅ Class distribution is reasonably balanced")

In [None]:
print("\n🖼️  Loading Sample Images...")

# Ask the loader for sample images (n_samples=5), then plot and save the grid.
sample_images = data_loader.load_sample_images(n_samples=5)

sample_grid_path = Config.FIGURES_PATH / "sample_images.png"
viz.plot_sample_images(sample_images, save_path=sample_grid_path)

In [None]:
print("\n🔍 Analyzing Image Properties...")

# Run the property analysis once per split, announcing each pass.
print("   Analyzing training set...")
train_analysis = data_loader.analyze_image_properties(split='training')

print("   Analyzing testing set...")
test_analysis = data_loader.analyze_image_properties(split='testing')

# Tag each table with its split (in place) and stack them into a single frame.
for split_label, split_frame in (('training', train_analysis), ('testing', test_analysis)):
    split_frame['split'] = split_label
combined_analysis = pd.concat([train_analysis, test_analysis], ignore_index=True)

print("\n📋 IMAGE PROPERTIES SUMMARY:")
print("=" * 50)

# Mean and standard deviation of each image dimension.
print("\n📐 DIMENSIONS:")
for axis_label, axis_column in (("Width ", 'width'), ("Height", 'height')):
    axis_values = combined_analysis[axis_column]
    print(f"   {axis_label} - Mean: {axis_values.mean():.1f}, Std: {axis_values.std():.1f}")

# Mean and range of the on-disk size column.
print("\n💾 FILE SIZES:")
file_sizes = combined_analysis['file_size_kb']
print(f"   Mean: {file_sizes.mean():.1f} KB")
print(f"   Min:  {file_sizes.min():.1f} KB")
print(f"   Max:  {file_sizes.max():.1f} KB")

# Dataset-wide averages of the two per-image intensity columns.
print("\n🎨 INTENSITY STATISTICS:")
for intensity_label, intensity_column in (("Mean Intensity", 'mean_intensity'),
                                          ("Std Intensity ", 'std_intensity')):
    print(f"   {intensity_label} - Mean: {combined_analysis[intensity_column].mean():.1f}")

In [None]:
# Statistical analysis by class
print("\n📊 DETAILED ANALYSIS BY CLASS:")
print("=" * 60)

# The same four summary statistics are computed for every numeric property.
summary_columns = ['width', 'height', 'file_size_kb', 'mean_intensity', 'std_intensity']
class_stats = (
    combined_analysis
    .groupby('class')
    .agg({column: ['mean', 'std', 'min', 'max'] for column in summary_columns})
    .round(2)
)

display(class_stats)

# Check for dimension consistency.
# Frequencies must be counted on the full table: counting on the de-duplicated
# frame would report 1 for every (width, height) pair.
dimension_counts = combined_analysis[['width', 'height']].value_counts()
unique_dimensions = combined_analysis[['width', 'height']].drop_duplicates()
print(f"\n📏 UNIQUE IMAGE DIMENSIONS: {len(unique_dimensions)}")

if len(unique_dimensions) > 1:
    print("⚠️  Multiple image dimensions detected:")
    # value_counts() is sorted by frequency, so the head shows the dominant sizes
    # without dumping one line per distinct size.
    max_rows_shown = 10
    print(dimension_counts.head(max_rows_shown))
    if len(dimension_counts) > max_rows_shown:
        print(f"   ... and {len(dimension_counts) - max_rows_shown} more sizes")
    print("💡 Recommendation: Standardize all images to single dimension")
else:
    print("✅ All images have consistent dimensions")

In [None]:
print("\n📈 Creating Comprehensive Visualizations...")

# Plot the combined per-image table and save the figure with the other outputs.
properties_figure_path = Config.FIGURES_PATH / "image_properties_analysis.png"
viz.plot_image_properties_analysis(combined_analysis, save_path=properties_figure_path)

In [None]:
print("\n🔍 DATA QUALITY ASSESSMENT:")
print("=" * 40)

# Thresholds used by the checks below, named so they are easy to tune.
IQR_FENCE_FACTOR = 1.5             # Tukey fence multiplier for file-size outliers
ASPECT_RATIO_BOUNDS = (0.8, 1.2)   # aspect ratios outside this band are flagged
DARK_INTENSITY_MAX = 20            # mean intensity below this is flagged as very dark
BRIGHT_INTENSITY_MIN = 200         # mean intensity above this is flagged as very bright

# Check for potential issues
quality_issues = []

# 1. Check for extremely small or large files.
# Fences are derived from the interquartile range instead of the 5th/95th
# percentiles: percentile cut-offs flag roughly 5% of files on each side by
# construction, so they would report "unusual" files for any dataset.
size_q1, size_q3 = combined_analysis['file_size_kb'].quantile([0.25, 0.75])
size_iqr = size_q3 - size_q1
size_threshold_low = size_q1 - IQR_FENCE_FACTOR * size_iqr
size_threshold_high = size_q3 + IQR_FENCE_FACTOR * size_iqr

small_files = combined_analysis[combined_analysis['file_size_kb'] < size_threshold_low]
large_files = combined_analysis[combined_analysis['file_size_kb'] > size_threshold_high]

if len(small_files) > 0:
    quality_issues.append(f"Found {len(small_files)} unusually small files")
if len(large_files) > 0:
    quality_issues.append(f"Found {len(large_files)} unusually large files")

# 2. Check for extreme aspect ratios.
# NOTE(review): relies on an 'aspect_ratio' column produced by the data loader —
# presumably width / height; confirm against analyze_image_properties.
aspect_low, aspect_high = ASPECT_RATIO_BOUNDS
extreme_aspect = combined_analysis[
    (combined_analysis['aspect_ratio'] < aspect_low) |
    (combined_analysis['aspect_ratio'] > aspect_high)
]

if len(extreme_aspect) > 0:
    quality_issues.append(f"Found {len(extreme_aspect)} images with extreme aspect ratios")

# 3. Check for very dark or bright images
dark_images = combined_analysis[combined_analysis['mean_intensity'] < DARK_INTENSITY_MAX]
bright_images = combined_analysis[combined_analysis['mean_intensity'] > BRIGHT_INTENSITY_MIN]

if len(dark_images) > 0:
    quality_issues.append(f"Found {len(dark_images)} very dark images")
if len(bright_images) > 0:
    quality_issues.append(f"Found {len(bright_images)} very bright images")

# Display quality assessment results
if quality_issues:
    print("⚠️  POTENTIAL QUALITY ISSUES DETECTED:")
    for issue in quality_issues:
        print(f"   • {issue}")
    print("\n💡 RECOMMENDATIONS:")
    print("   • Review flagged images manually")
    print("   • Consider preprocessing to normalize intensity")
    print("   • Apply consistent resizing/cropping")
else:
    print("✅ No major quality issues detected")

In [None]:
print("\n🎛️  Creating Interactive Dashboard...")

# Build the interactive figure from the combined per-image table and display it.
interactive_fig = viz.create_interactive_dashboard(combined_analysis)
interactive_fig.show()

# Keep an HTML copy next to the other figures; write_html is given a plain string path.
dashboard_html_path = Config.FIGURES_PATH / "interactive_dashboard.html"
interactive_fig.write_html(str(dashboard_html_path))

In [None]:
print("\n🎯 KEY INSIGHTS & RECOMMENDATIONS:")
print("=" * 50)

# Data-driven findings are collected first, in the order the checks run.
insights = []

# Training split is considered imbalanced above a 2:1 max/min class ratio.
if train_imbalance_ratio > 2.0:
    insights.append("⚖️  ADDRESS CLASS IMBALANCE: Use class weights or data augmentation")

# More than one distinct (width, height) pair means resizing is needed.
if len(unique_dimensions) > 1:
    insights.append("📏 STANDARDIZE DIMENSIONS: Resize all images to consistent size")

# Relative spread of file size: standard deviation divided by the mean.
file_sizes = combined_analysis['file_size_kb']
size_variation = file_sizes.std() / file_sizes.mean()
if size_variation > 0.5:
    insights.append("💾 HIGH FILE SIZE VARIATION: Consider compression standardization")

# Average, across classes, of the within-class standard deviation of mean intensity.
per_class_intensity_std = combined_analysis.groupby('class')['mean_intensity'].std()
intensity_variation = per_class_intensity_std.mean()
if intensity_variation > 30:
    insights.append("🎨 HIGH INTENSITY VARIATION: Apply histogram equalization")

# Recommendations that apply regardless of what the checks found.
insights += [
    "🔍 PREPROCESSING PIPELINE: Implement consistent preprocessing",
    "📊 DATA AUGMENTATION: Use rotation, flip, zoom to increase diversity",
    "🧠 FEATURE EXTRACTION: Consider K-means segmentation for region analysis",
    "⚡ MODEL STRATEGY: Ensemble approach with CNN + traditional ML",
]

# Numbered listing, starting at 1.
for position, insight in enumerate(insights, start=1):
    print(f"{position:2d}. {insight}")

In [None]:
import json
import os

print("\n💾 Exporting Analysis Results...")

# Most common image size, taken as the most frequent (width, height) PAIR.
# Taking the mode of each column separately can combine a width and a height
# that never occur together in any image.
dimension_pair_counts = combined_analysis[['width', 'height']].value_counts()
if dimension_pair_counts.empty:
    most_common_size = None  # no measurable images; exported as JSON null
else:
    top_width, top_height = dimension_pair_counts.idxmax()
    most_common_size = (int(top_width), int(top_height))

# Save analysis results
results_summary = {
    'dataset_overview': overview,
    'total_images': total_train + total_test,
    'class_imbalance_ratio': {
        'training': train_imbalance_ratio,
        'testing': test_imbalance_ratio
    },
    'image_dimensions': {
        'unique_dimensions': len(unique_dimensions),
        'most_common_size': most_common_size
    },
    'quality_issues': quality_issues,
    'recommendations': insights
}

# Make sure the report directory exists before writing into it.
os.makedirs(Config.REPORTS_PATH, exist_ok=True)
summary_path = Config.REPORTS_PATH / 'phase1_analysis_summary.json'
details_path = Config.REPORTS_PATH / 'detailed_image_analysis.csv'

# Save to JSON for later use; default=str stringifies any value json cannot
# encode natively.
with open(summary_path, 'w') as f:
    json.dump(results_summary, f, indent=2, default=str)

# Save detailed analysis DataFrame
combined_analysis.to_csv(details_path, index=False)

print("✅ Analysis results exported successfully!")
print(f"   📁 Summary: {summary_path}")
print(f"   📊 Detailed data: {details_path}")
print(f"   🖼️  Visualizations: {Config.FIGURES_PATH}")

print("\n🎉 PHASE 1 ANALYSIS COMPLETE!")
print("=" * 50)
print("📋 Next Steps:")
print("   1. Review generated visualizations and reports")
print("   2. Plan preprocessing strategy based on insights")
print("   3. Move to Phase 2: Preprocessing & Feature Engineering")