## Import Required Libraries

In [None]:
import sys
from pathlib import Path

# pandas and numpy are hard requirements for every cell below: if either
# import fails, report which one and stop with exit status 1.
try:
    import pandas as pd
    import numpy as np
except ImportError as e:
    print(f"Error: Required libraries not found: {e}")
    sys.exit(1)

# matplotlib is imported separately so its failure message can name it.
# The rcParams set here become the defaults (figure size, base font size)
# for figures created later that do not pass their own values.
try:
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (20, 12)
    matplotlib.rcParams['font.size'] = 10
except ImportError as e:
    print(f"Error: Matplotlib not found: {e}")
    sys.exit(1)

print("✓ All libraries imported successfully")

## Load and Explore the Data

In [None]:
def load_dataset(filepath: str) -> pd.DataFrame:
    """
    Load a CSV dataset with proper error handling.

    Args:
        filepath: Path to the CSV file

    Returns:
        DataFrame containing the loaded data

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular
            file (for example, a directory)
        pd.errors.ParserError: If file is not valid CSV; the original pandas
            error is attached as ``__cause__``
    """
    path = Path(filepath)

    # is_file() is False for missing paths AND for directories, so a
    # directory is rejected here with the documented exception instead of
    # failing later inside read_csv with a platform-dependent error.
    if not path.is_file():
        raise FileNotFoundError(f"Dataset not found: {filepath}")

    try:
        return pd.read_csv(path)
    except pd.errors.ParserError as e:
        # Chain explicitly so the traceback presents the pandas error as the
        # direct cause of this one rather than as an incidental context.
        raise pd.errors.ParserError(f"Failed to parse CSV file: {e}") from e

# Read both splits up front. A missing file gets its own message; any other
# failure is reported as unexpected. Either way the run stops with status 1.
try:
    train_df = load_dataset('../resources/Train_knight.csv')
    test_df = load_dataset('../resources/Test_knight.csv')
    print("✓ Datasets loaded successfully")
except FileNotFoundError as missing:
    print(f"✗ Error: {missing}")
    sys.exit(1)
except Exception as unexpected:
    print(f"✗ Unexpected error loading datasets: {unexpected}")
    sys.exit(1)

In [None]:
# Print a plain-text summary of the training split: dimensions, column
# names, dtypes, per-column null counts, label counts and a short preview.
print("=" * 60, "TRAINING DATASET OVERVIEW", "=" * 60, sep="\n")
print(f"\nShape: {train_df.shape}")
print(f"\nColumns ({train_df.shape[1]}):", train_df.columns.tolist(), sep="\n")
print("\nData Types:", train_df.dtypes, sep="\n")
print("\nMissing Values:", train_df.isnull().sum(), sep="\n")
# Heading and lookup stay separate so the heading is still printed first
# even if the 'knight' column turns out to be absent.
print("\nClass Distribution:")
print(train_df['knight'].value_counts())
print("\nFirst 5 rows:", train_df.head(), sep="\n")

In [None]:
# Shorter summary for the test split: dimensions, label counts and a preview.
print("\n" + "=" * 60, "TEST DATASET OVERVIEW", "=" * 60, sep="\n")
print(f"\nShape: {test_df.shape}")
# Heading and lookup stay separate so the heading is still printed first
# even if the 'knight' column turns out to be absent.
print("\nClass Distribution:")
print(test_df['knight'].value_counts())
print("\nFirst 5 rows:", test_df.head(), sep="\n")

## Data Validation

In [None]:
def validate_dataset(df: pd.DataFrame, dataset_name: str) -> bool:
    """
    Check that a dataset is usable for the Jedi/Sith analysis.

    The checks run in order: the frame must be non-empty, it must have a
    'knight' column, and that column may only hold 'Jedi' or 'Sith'.
    Missing values anywhere in the frame produce a warning but do not fail
    validation. The outcome of each check is printed.

    Args:
        df: DataFrame to validate
        dataset_name: Name of the dataset, used as a prefix in the messages

    Returns:
        True if valid, False otherwise
    """
    is_valid = False

    if df is None or df.empty:
        print(f"✗ {dataset_name}: Dataset is empty")
    elif 'knight' not in df.columns:
        print(f"✗ {dataset_name}: Missing 'knight' target column")
    else:
        # Missing values are reported but are not a reason to reject.
        if df.isnull().values.any():
            print(f"⚠ {dataset_name}: Contains missing values")

        # Any label outside the two known classes makes the set non-empty.
        unexpected_labels = set(df['knight'].unique()) - {'Jedi', 'Sith'}
        if unexpected_labels:
            print(f"✗ {dataset_name}: Unknown knight class found")
        else:
            print(f"✓ {dataset_name}: Validation passed")
            is_valid = True

    return is_valid

# Validate both splits before any plotting; each call prints its own verdict.
train_valid = validate_dataset(train_df, "Training Dataset")
test_valid = validate_dataset(test_df, "Test Dataset")

# Stop with exit status 1 if either split was rejected.
if not train_valid or not test_valid:
    print("✗ Dataset validation failed")
    sys.exit(1)

## Create Histograms

In [None]:
def create_histograms(
    df: pd.DataFrame,
    title: str = "Knight Skills Distribution",
    n_cols: int = 5,
    bins: int = 20,
) -> plt.Figure:
    """
    Create overlaid histograms for all features separated by knight class.

    Every column except 'knight' gets its own subplot showing the Jedi and
    Sith rows on top of each other. For numeric columns both classes are
    binned on the same edges (computed from the whole column) so the two
    overlaid histograms are directly comparable.

    Args:
        df: DataFrame containing the data; must have a 'knight' column
            holding 'Jedi' and 'Sith' rows
        title: Title for the figure
        n_cols: Number of subplot columns in the grid
        bins: Number of bins per histogram

    Returns:
        matplotlib Figure object

    Raises:
        ValueError: If DataFrame is empty or invalid (no 'knight' column,
            a missing class, no feature columns, or a non-positive
            ``n_cols``/``bins``)
    """
    if df is None or df.empty:
        raise ValueError("DataFrame is empty")

    # Fail with the documented exception type instead of a KeyError below.
    if 'knight' not in df.columns:
        raise ValueError("DataFrame has no 'knight' column")

    if n_cols < 1 or bins < 1:
        raise ValueError("n_cols and bins must be positive integers")

    jedi = df[df['knight'] == 'Jedi']
    sith = df[df['knight'] == 'Sith']

    if len(jedi) == 0 or len(sith) == 0:
        raise ValueError("One or more knight classes are missing from the data")

    features = [col for col in df.columns if col != 'knight']
    if not features:
        raise ValueError("DataFrame has no feature columns to plot")

    # Ceiling division: enough rows to hold every feature.
    n_rows = (len(features) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of Axes, so flatten() works
    # for any grid shape. 4 inches per column gives the usual 20-inch width
    # for the default five columns.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, n_rows * 3), squeeze=False
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, features):
        column = df[feature]
        if pd.api.types.is_numeric_dtype(column):
            # Shared edges: binning each class on its own range would give
            # bars of different widths that cannot be compared visually.
            bin_edges = np.histogram_bin_edges(
                column.dropna().to_numpy(dtype=float), bins=bins
            )
        else:
            # Non-numeric data has no numeric range to share; let matplotlib
            # choose the bins from the bin count.
            bin_edges = bins

        # Missing values carry no position on the axis, so drop them.
        ax.hist(jedi[feature].dropna(), bins=bin_edges, alpha=0.6,
                label='Jedi', color='#3498db')
        ax.hist(sith[feature].dropna(), bins=bin_edges, alpha=0.6,
                label='Sith', color='#e74c3c')

        ax.set_xlabel(feature, fontsize=9)
        ax.set_ylabel('Frequency', fontsize=9)
        ax.set_title(feature, fontweight='bold', fontsize=10)
        ax.legend(loc='upper right', fontsize=8)
        ax.grid(alpha=0.3, linestyle='--')

    # Remove the unused cells of the last grid row.
    for unused_ax in axes[len(features):]:
        fig.delaxes(unused_ax)

    fig.suptitle(title, fontsize=16, fontweight='bold', y=1.00)
    # Lay out this figure specifically, not whichever one is "current".
    fig.tight_layout()

    return fig

In [None]:
# Build and display the training-set figure. fig_train is kept so the figure
# can be written to disk later. Any failure stops the run with status 1.
try:
    fig_train = create_histograms(train_df, "Training Data: Knight Skills Distribution")
    plt.show()
    print("✓ Training histograms created successfully")
except ValueError as invalid_data:
    print(f"✗ Error creating training histograms: {invalid_data}")
    sys.exit(1)
except Exception as unexpected:
    print(f"✗ Unexpected error: {unexpected}")
    sys.exit(1)

In [None]:
# Build and display the test-set figure. fig_test is kept so the figure can
# be written to disk later. Any failure stops the run with status 1.
try:
    fig_test = create_histograms(test_df, "Test Data: Knight Skills Distribution")
    plt.show()
    print("✓ Test histograms created successfully")
except ValueError as invalid_data:
    print(f"✗ Error creating test histograms: {invalid_data}")
    sys.exit(1)
except Exception as unexpected:
    print(f"✗ Unexpected error: {unexpected}")
    sys.exit(1)

## Save Visualizations

In [None]:
def save_figure(fig: plt.Figure, filename: str, dpi: int = 100) -> bool:
    """
    Write a matplotlib figure to disk and report the outcome.

    The figure is saved with a tight bounding box. Failures are printed
    rather than raised, so the caller must check the return value.

    Args:
        fig: Figure object to save
        filename: Output filename
        dpi: Resolution in dots per inch

    Returns:
        True if successful, False otherwise
    """
    saved = False
    try:
        fig.savefig(filename, dpi=dpi, bbox_inches='tight')
        print(f"✓ Saved: {filename}")
        saved = True
    except PermissionError:
        # Reported separately so the user knows it is an access problem.
        print(f"✗ Permission denied: Cannot write to {filename}")
    except Exception as save_error:
        print(f"✗ Error saving {filename}: {save_error}")
    return saved

# save_figure reports failure through its return value instead of raising,
# so collect both results. A list (not a generator) is used so the second
# save is still attempted when the first one fails.
save_results = [
    save_figure(fig_train, 'Histogram_train.png'),
    save_figure(fig_test, 'Histogram_test.png'),
]

# Only claim success when every save actually succeeded.
if all(save_results):
    print("\n✓ All visualizations saved successfully")
else:
    print("\n✗ One or more visualizations could not be saved")

## Summary

✓ **Completed Exercise 00: Histogram**

### What We Accomplished:
1. Loaded and validated training and test datasets
2. Explored data structure, shape, and class distribution
3. Created professional histograms for all 30 features
4. Visualized the distribution of each feature separated by knight class (Jedi vs Sith)
5. Saved both visualizations as PNG files (at the default 100 dpi)

### Key Insights:
- The histograms show how each skill varies between Jedi and Sith knights
- Features with clear separation between classes may be more predictive
- Some features show similar distributions across both classes

### Files Generated:
- `Histogram_train.png` - Histogram visualization for training data
- `Histogram_test.png` - Histogram visualization for test data