# Trim exoplanet sample

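Load the NASA exoplanet sample CSV, keep a targeted set of columns, and write a trimmed file. Then drop the sparsest columns, median-impute selected numeric columns, and write a cleaned file.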

In [None]:
import pandas as pd

# Source sample and the trimmed file written from it.
EXOPLANETS_PATH = "../data/exoplanets_sample.csv"
OUTPUT_PATH = "../data/exoplanets_trimmed.csv"

# Columns retained in the trimmed file.
KEEP_COLS = [
    "rowid", "pl_name", "hostname", "pl_letter", "discoverymethod",
    "disc_year", "pl_orbper", "pl_orbsmax", "pl_rade", "pl_radj",
    "pl_masse", "pl_massj", "st_spectype", "st_teff", "st_rad",
    "st_mass", "sy_dist", "sy_plx", "ra", "dec",
]


def _is_kept(column_name):
    """Return True when *column_name* is listed in KEEP_COLS."""
    return column_name in KEEP_COLS


# A callable `usecols` is evaluated against each CSV column name, so only the
# matching columns are parsed; names in KEEP_COLS that the CSV lacks are
# skipped rather than raising.
df = pd.read_csv(EXOPLANETS_PATH, usecols=_is_kept)
df.to_csv(OUTPUT_PATH, index=False)

# (rows, columns) of the trimmed frame
df.shape


In [None]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)


In [None]:
# List the column names present in the trimmed frame.
list(df.columns)


In [None]:
# Clean the trimmed frame: drop columns with heavy missingness, median-impute
# moderately missing numeric columns, discard rows that still contain NaNs,
# and save a lean file.
CLEAN_PATH = '../data/exoplanets_clean.csv'

# Sparse columns; removed outright instead of being imputed.
DROP_COLS = ['pl_masse', 'pl_massj', 'pl_rade', 'pl_radj', 'st_spectype']

# Moderately missing numeric columns; NaNs are replaced by the column median.
IMPUTE_COLS = ['st_teff', 'st_rad', 'st_mass', 'pl_orbper', 'pl_orbsmax']

# drop() returns a new DataFrame, so `df` is left untouched and no extra
# .copy() is required.
clean = df.drop(columns=DROP_COLS)

# Passing a Series to fillna() fills each column named in the Series index
# with the matching value; columns outside IMPUTE_COLS are not filled.
clean = clean.fillna(clean[IMPUTE_COLS].median())

# Rows that still hold a NaN in any remaining column are discarded.
clean = clean.dropna()

# Save cleaned dataset
clean.to_csv(CLEAN_PATH, index=False)
clean.shape
