# 01 - Data Exploration

Exploratory Data Analysis (EDA) for PRISM project data.

## Objectives
- Load and inspect project data
- Understand data distributions
- Identify data quality issues
- Discover patterns and correlations

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)  # None lifts the column cap so wide frames print in full

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Try the current name first, then the pre-3.6 name, so the cell runs on either.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Styling is cosmetic; continue with Matplotlib's defaults rather than abort.
    print("Warning: seaborn whitegrid style not available; using Matplotlib defaults")

print("Libraries loaded successfully!")

In [None]:
# Read the sample project records into a DataFrame
data_path = Path('../data/raw/sample_projects.csv')
df = pd.read_csv(data_path)

# Report the table dimensions (rows, columns), then preview the first rows
print(f"Loaded {df.shape[0]} projects with {df.shape[1]} columns")
df.head()

In [None]:
# Dataset overview: banner, then dimensions, per-column dtypes and null counts
print("=" * 50, "DATA OVERVIEW", "=" * 50, sep="\n")

# Each section starts with a blank line; sep="\n" puts every section on its own line
print(
    f"\nShape: {df.shape}",
    f"\nColumn types:\n{df.dtypes}",
    f"\nMissing values:\n{df.isna().sum()}",
    sep="\n",
)

In [None]:
# Numerical summary
# describe() with default arguments reports count, mean, std, min, quartiles
# and max; on a frame with mixed dtypes pandas limits this to numeric columns.
# As the cell's last expression, the resulting table is rendered as its output.
df.describe()

In [None]:
# Categorical distribution
# Columns expected to hold categorical values. Any that this dataset lacks are
# reported instead of being skipped silently, so schema gaps are visible.
categorical_cols = ['status', 'priority', 'risk_level', 'methodology']

for col in categorical_cols:
    if col not in df.columns:
        print(f"\n{col}: column not found in data, skipped")
        continue
    print(f"\n{col}:")
    # dropna=False keeps missing values as their own row, so gaps in a
    # category show up alongside the real levels instead of being dropped.
    print(df[col].value_counts(dropna=False))

In [None]:
# TODO: Add visualization of distributions
# TODO: Add correlation analysis
# TODO: Add outlier detection