In [None]:
# ======================================================
# 🚀 ExoHabitAI — FULL SCIENTIFIC DATA EXPLORATION
# Single-file version (Notebook + Script Compatible)
# ======================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let pandas render up to 120 columns before eliding the middle ones.
pd.set_option("display.max_columns", 120)

# ======================================================
# 📂 CONFIG
# ======================================================

# DATA_PATH is the CSV handed to pd.read_csv further down; REPORT_DIR is the
# folder that receives every plot and text report this script writes.
DATA_PATH = "data/raw/PS_2026.01.19_01.24.31.csv"
REPORT_DIR = "reports"

# Create the report folder up front so the later savefig()/open() calls do
# not fail on a missing directory; exist_ok keeps re-runs harmless.
os.makedirs(REPORT_DIR, exist_ok=True)

print("🚀 Starting ExoHabitAI Scientific EDA...")

# ======================================================
# ⭐ LOAD NASA DATASET (SAFE)
# ======================================================

# Lines starting with '#' are skipped as NASA metadata rows.
# on_bad_lines="warn" still drops rows the parser cannot handle, but reports
# each one through a warning so lost rows are visible instead of silent.
# NOTE(review): pandas documents `comment` as cutting off the remainder of a
# line, so a '#' inside a field value would truncate that row — confirm the
# data never contains '#' outside the metadata header.
df = pd.read_csv(
    DATA_PATH,
    comment="#",
    engine="python",
    on_bad_lines="warn",
)

print("\n✅ Dataset Loaded")
print("Shape:", df.shape)

# ======================================================
# ⭐ DATASET STRUCTURE
# ======================================================

# Gather the facts first, then print them in one place.
row_count, column_count = df.shape
leading_columns = list(df.columns[:20])
dtype_counts = df.dtypes.value_counts()

print("\n===== DATASET OVERVIEW =====")
print("Rows:", row_count)
print("Columns:", column_count)

print("\nFirst 20 Columns:")
print(leading_columns)

# Number of columns per dtype.
print("\nData Types Summary:")
print(dtype_counts)

# ======================================================
# ⭐ FEATURE TYPES
# ======================================================

# Column labels split by dtype family: numeric vs. object.
# NOTE(review): on pandas versions whose default text dtype is not `object`,
# text columns may not be counted as categorical here — confirm against the
# installed pandas.
numeric_cols = df.select_dtypes(include=["number"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns

numeric_count = len(numeric_cols)
categorical_count = len(categorical_cols)

print("\nNumeric Features:", numeric_count)
print("Categorical Features:", categorical_count)

# ======================================================
# ⭐ MISSING VALUE ANALYSIS
# ======================================================

# Share of missing entries per column, in percent, worst column first.
missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)

print("\nTop Missing Columns:")
print(missing_pct.head(20))

# Bar chart of the 25 columns with the most missing data. An explicit
# figure/axes pair is used so the plot does not depend on whatever figure
# happens to be "current" in pyplot.
fig, ax = plt.subplots(figsize=(12, 5))
missing_pct.head(25).plot(kind="bar", ax=ax, rot=90)
ax.set_title("Top Missing Features (%)")
ax.set_ylabel("Missing %")
fig.tight_layout()
fig.savefig(os.path.join(REPORT_DIR, "eda_missing_values.png"))
plt.close(fig)

print("✅ Missing value plot saved")

# ======================================================
# ⭐ NUMERIC STATISTICS
# ======================================================

print("\n===== NUMERIC SUMMARY =====")

# describe() yields one column per feature; transpose so each feature is a
# row, then show only the first 20 features.
numeric_summary = df.describe(include=["number"]).transpose()
print(numeric_summary.head(20))

# ======================================================
# ⭐ SCIENTIFIC FEATURE INSPECTION
# ======================================================

# Columns singled out for a closer look.
# NOTE(review): presumably NASA Exoplanet Archive column names — confirm.
important_features = [
    "pl_rade",
    "pl_eqt",
    "pl_orbper",
    "st_teff",
    "st_mass",
    "st_rad",
]

print("\n===== IMPORTANT FEATURE STATS =====")

# Columns missing from the loaded table are skipped without any message.
present_features = [name for name in important_features if name in df.columns]

for feature_name in present_features:
    print(f"\n=== {feature_name} ===")
    print(df[feature_name].describe())

# ======================================================
# ⭐ OUTLIER VISUALIZATION
# ======================================================

# Column -> panel title for each box plot we would like to draw.
boxplot_titles = {
    "pl_rade": "Planet Radius Distribution",
    "pl_eqt": "Equilibrium Temperature Distribution",
}

# Only columns that actually exist in the loaded table get a panel, so a
# missing column never leaves an empty slot in the figure.
boxplot_columns = [name for name in boxplot_titles if name in df.columns]

if boxplot_columns:
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(
        1, len(boxplot_columns), figsize=(12, 4), squeeze=False
    )
    for ax, name in zip(axes[0], boxplot_columns):
        df[name].dropna().plot(kind="box", ax=ax)
        ax.set_title(boxplot_titles[name])

    fig.tight_layout()
    fig.savefig(os.path.join(REPORT_DIR, "eda_outliers.png"))
    plt.close(fig)

    print("✅ Outlier plots saved")
else:
    # Nothing to draw: do not write a blank image or claim it was saved.
    print("⚠️ Outlier plots skipped: none of the expected columns are present")

# ======================================================
# ⭐ CORRELATION HEATMAP
# ======================================================

numeric_df = df.select_dtypes(include="number")

# Above this many features the tick labels overlap and become unreadable,
# so the axes are left unlabelled for larger matrices.
max_labelled_features = 40

if numeric_df.shape[1] > 0:
    # Pairwise correlation of the numeric columns (pandas default method).
    corr = numeric_df.corr()

    fig, ax = plt.subplots(figsize=(10, 6))

    # Pin the colour scale to the full correlation range and use a diverging
    # colormap so that zero is neutral and colours mean the same in every run.
    image = ax.imshow(
        corr.values, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1
    )
    colorbar = fig.colorbar(image, ax=ax)
    colorbar.set_label("Correlation")

    feature_count = len(corr.columns)
    if feature_count <= max_labelled_features:
        positions = list(range(feature_count))
        ax.set_xticks(positions)
        ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
        ax.set_yticks(positions)
        ax.set_yticklabels(corr.columns, fontsize=7)

    ax.set_title("Numeric Feature Correlation Heatmap")
    fig.tight_layout()
    fig.savefig(os.path.join(REPORT_DIR, "eda_correlation.png"))
    plt.close(fig)

    print("✅ Correlation heatmap saved")
else:
    print("⚠️ Correlation heatmap skipped: no numeric columns found")

# ======================================================
# ⭐ SAVE EDA SUMMARY TEXT REPORT
# ======================================================

summary_path = os.path.join(REPORT_DIR, "eda_summary.txt")

# Render both tables before opening the file so a failure here cannot leave
# a half-written report behind.
missing_text = str(missing_pct.head(20))

# to_string() always renders the complete table, whereas str() follows the
# pandas display settings and may wrap the columns depending on the width
# of the console the script happens to run in.
numeric_text = df.describe(include="number").T.head(20).to_string()

report_lines = [
    "===== EXOHABITAI SCIENTIFIC EDA SUMMARY =====\n",
    f"Rows: {df.shape[0]}\n",
    f"Columns: {df.shape[1]}\n\n",
    "Top Missing Columns:\n",
    missing_text,
    "\n\nNumeric Describe:\n",
    numeric_text,
    "\n",  # end the file with a newline
]

with open(summary_path, "w", encoding="utf-8") as report:
    report.writelines(report_lines)

print(f"✅ EDA summary saved: {summary_path}")

# ======================================================
# ⭐ FINAL MESSAGE
# ======================================================

# Reached only if every step above ran without raising.
print("\n🎉 ExoHabitAI Scientific EDA Completed Successfully!")

In [None]:
# Complete numeric summary, one row per feature. As the last expression of
# the cell it is presumably meant to be rendered by the notebook.
df.describe(include=["number"]).transpose()

