In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Load science-ready dataset
df = pd.read_csv("exoplanets_science_ready.csv")

In [2]:
# -----------------------------
# Recreate analytical strata
# -----------------------------

HZ_LOWER = 0.25
HZ_UPPER = 1.75
ROCKY_THRESHOLD = 1.6

df = df.copy()

df["in_habitable_zone"] = (
    (df["insolation_flux_earth_flux"] >= HZ_LOWER) &
    (df["insolation_flux_earth_flux"] <= HZ_UPPER)
)

df["composition"] = np.where(
    df["planet_radius_earth_radius"] < ROCKY_THRESHOLD,
    "Rocky",
    "Gaseous"
)

# -----------------------------
# Helper function
# -----------------------------

def summarize_test(name, group1, group2, test_result):
    print(f"\n{name}")
    print("Group 1 n =", len(group1), "| Group 2 n =", len(group2))
    print("Statistic =", round(test_result.statistic, 4))
    print("p-value   =", round(test_result.pvalue, 6))

In [3]:
# -----------------------------
# Hypothesis 1: Rocky planets inside vs outside HZ (radius)
# -----------------------------

rocky = df[df["composition"] == "Rocky"]

rocky_in = rocky[rocky["in_habitable_zone"]]["planet_radius_earth_radius"].dropna()
rocky_out = rocky[~rocky["in_habitable_zone"]]["planet_radius_earth_radius"].dropna()

# Use Welch's t-test (unequal variance)
test1 = stats.ttest_ind(rocky_in, rocky_out, equal_var=False)
summarize_test("Rocky Planets: Radius Inside vs Outside HZ", rocky_in, rocky_out, test1)

# -----------------------------
# Hypothesis 2: Gaseous planets inside vs outside HZ (radius)
# -----------------------------

gas = df[df["composition"] == "Gaseous"]

gas_in = gas[gas["in_habitable_zone"]]["planet_radius_earth_radius"].dropna()
gas_out = gas[~gas["in_habitable_zone"]]["planet_radius_earth_radius"].dropna()

test2 = stats.ttest_ind(gas_in, gas_out, equal_var=False)
summarize_test("Gaseous Planets: Radius Inside vs Outside HZ", gas_in, gas_out, test2)

# -----------------------------
# Hypothesis 3: Composition vs Habitable Zone (Chi-squared)
# -----------------------------

contingency = pd.crosstab(df["composition"], df["in_habitable_zone"])

chi2, p, dof, expected = stats.chi2_contingency(contingency)

print("\nComposition vs Habitable Zone (Chi-squared)")
print(contingency)
print("Chi2 =", round(chi2, 4))
print("p-value =", round(p, 6))

# -----------------------------
# Hypothesis 4 (Optional): Equilibrium temperature inside vs outside HZ
# -----------------------------

hz_temp = df[df["in_habitable_zone"]]["equilibrium_temperature_k"].dropna()
nonhz_temp = df[~df["in_habitable_zone"]]["equilibrium_temperature_k"].dropna()

test4 = stats.ttest_ind(hz_temp, nonhz_temp, equal_var=False)
summarize_test("Equilibrium Temperature: HZ vs Non-HZ", hz_temp, nonhz_temp, test4)


Rocky Planets: Radius Inside vs Outside HZ
Group 1 n = 30 | Group 2 n = 1100
Statistic = -1.6037
p-value   = 0.11876

Gaseous Planets: Radius Inside vs Outside HZ
Group 1 n = 85 | Group 2 n = 4552
Statistic = -9.1655
p-value   = 0.0

Composition vs Habitable Zone (Chi-squared)
in_habitable_zone  False  True 
composition                    
Gaseous             4573     85
Rocky               1100     30
Chi2 = 2.8053
p-value = 0.093955

Equilibrium Temperature: HZ vs Non-HZ
Group 1 n = 106 | Group 2 n = 4197
Statistic = -82.5922
p-value   = 0.0
