Import & Load Dataset

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('//content//PS_2025.12.01_05.00.30.csv', comment='#')
print("Dataset loaded:", df.shape)


Dataset loaded: (39119, 92)


Define Must-Have Features

In [28]:
MUST_HAVE_FEATURES = [
    'pl_name','hostname','pl_rade','pl_bmasse','pl_eqt',
    'pl_orbper','pl_orbsmax','st_spectype','st_teff','st_met','st_rad'
]

print("Must-have features:", MUST_HAVE_FEATURES)


Must-have features: ['pl_name', 'hostname', 'pl_rade', 'pl_bmasse', 'pl_eqt', 'pl_orbper', 'pl_orbsmax', 'st_spectype', 'st_teff', 'st_met', 'st_rad']


Add Derived Features (pl_density & st_luminosity)

In [29]:
df['pl_density'] = (df['pl_bmasse'] / (df['pl_rade'] ** 3)) * 5.514
T_sun = 5772
df['st_luminosity'] = (df['st_rad'] ** 2) * ((df['st_teff'] / T_sun) ** 4)

MUST_HAVE_FEATURES.extend(['pl_density', 'st_luminosity'])

print("Derived features added. New columns:", ['pl_density', 'st_luminosity'])


Derived features added. New columns: ['pl_density', 'st_luminosity']


In [30]:
admin_cols = [
    'default_flag','pl_controv_flag','soltype','pl_refname','st_refname',
    'sy_refname','rowupdate','pl_pubdate','releasedate','rastr','decstr'
]

drop_admin = [c for c in admin_cols if c in df.columns and c not in MUST_HAVE_FEATURES]
df = df.drop(columns=drop_admin)

print("Admin cols removed:", drop_admin)
print("Remaining columns:", df.shape[1])


Admin cols removed: ['default_flag', 'pl_controv_flag', 'soltype', 'pl_refname', 'st_refname', 'sy_refname', 'rowupdate', 'pl_pubdate', 'releasedate', 'rastr', 'decstr']
Remaining columns: 83


Drop Duplicate Unit Columns

In [31]:
dup_cols = []

if 'pl_radj' in df.columns:
    dup_cols += ['pl_radj','pl_radjerr1','pl_radjerr2','pl_radjlim']

if 'pl_bmassj' in df.columns:
    dup_cols += ['pl_bmassj','pl_bmassjerr1','pl_bmassjerr2','pl_bmassjlim']

dup_cols = [c for c in dup_cols if c in df.columns]
df = df.drop(columns=dup_cols)

print("Dropped duplicate unit columns:", dup_cols)


Dropped duplicate unit columns: ['pl_radj', 'pl_radjerr1', 'pl_radjerr2', 'pl_radjlim', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2', 'pl_bmassjlim']


Drop >70% Missing Columns

In [32]:
missing_pct = df.isnull().mean() * 100
high_miss = missing_pct[missing_pct > 70].index.tolist()
high_miss = [c for c in high_miss if c not in MUST_HAVE_FEATURES]

df = df.drop(columns=high_miss)

print("Dropped high-missing columns:", high_miss)
print("Remaining:", df.shape[1])


Dropped high-missing columns: ['pl_orbsmaxerr1', 'pl_orbsmaxerr2', 'pl_bmasseerr1', 'pl_bmasseerr2', 'pl_bmasselim', 'pl_bmassprov', 'pl_orbeccenerr1', 'pl_orbeccenerr2', 'pl_eqterr1', 'pl_eqterr2']
Remaining: 65


In [37]:
import pandas as pd

df = pd.read_csv("//content//exo_final_cleaned_dataset.csv")

# List all numeric columns to fill missing values with median
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    if df[col].isna().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

# List all categorical columns to fill missing values with mode
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    if df[col].isna().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

# Save the corrected file
df.to_csv("exo_final_cleaned_dataset_filled.csv", index=False)

print("Missing values fixed successfully!")
print("Remaining NULL count:", df.isna().sum().sum())


Missing values fixed successfully!
Remaining NULL count: 0


Variance Threshold Filter

In [33]:
num_cols = df.select_dtypes(include=[np.number]).columns
variances = df[num_cols].var()
low_var = [c for c in variances.index if variances[c] < 0.001 and c not in MUST_HAVE_FEATURES]

df = df.drop(columns=low_var)
num_cols = [c for c in num_cols if c not in low_var]

print("Dropped low variance features:", low_var)


Dropped low variance features: ['pl_orbperlim', 'pl_orbsmaxlim', 'pl_radelim', 'pl_insollim', 'pl_eqtlim', 'st_tefflim', 'st_radlim', 'st_masslim', 'st_metlim', 'st_logglim', 'sy_gaiamagerr1', 'sy_gaiamagerr2']


Pearson Correlation (>0.95)

In [34]:
corr = df[num_cols].corr().abs()
high_corr = set()

for i in range(len(corr.columns)):
    for j in range(i+1, len(corr.columns)):
        if corr.iloc[i,j] > 0.95:
            c1 = corr.columns[i]
            c2 = corr.columns[j]
            if c1 in MUST_HAVE_FEATURES:
                high_corr.add(c2)
            elif c2 in MUST_HAVE_FEATURES:
                high_corr.add(c1)
            else:
                high_corr.add(c2)

df = df.drop(columns=list(high_corr))
num_cols = [c for c in num_cols if c not in high_corr]

print("Highly correlated dropped:", high_corr)


Highly correlated dropped: {'st_masserr2', 'pl_orbsmax', 'pl_orbpererr2', 'sy_gaiamag', 'pl_orbpererr1'}


In [35]:
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    low = Q1 - 1.5 * IQR
    high = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=low, upper=high)

print("Outlier capping completed.")


Outlier capping completed.


In [36]:
df.to_csv("exo_final_cleaned_dataset.csv", index=False)
print("Final dataset saved. Shape:", df.shape)


Final dataset saved. Shape: (39119, 48)


ONE-HOT ENCODING

In [41]:
import pandas as pd


df = pd.read_csv("exo_final_cleaned_dataset_filled.csv")

print("üîç Dataset Loaded:", df.shape)


categorical_cols = ['st_spectype', 'discoverymethod']

print("\n Categorical columns selected for One-Hot Encoding:")
print(categorical_cols)

print("\n Unique value counts:")
for col in categorical_cols:
    print(f" - {col}: {df[col].nunique()} unique categories")


df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

print("\n Shape before encoding:", df.shape)
print(" Shape after encoding:", df_encoded.shape)


print("\n One-Hot Columns Added:")
added_cols = [c for c in df_encoded.columns if any(col + "_" in c for col in categorical_cols)]
for col in added_cols:
    print(" -", col)


df_encoded.to_csv("exo_final_encoded_dataset.csv", index=False)

print("\n Saved as: exo_final_encoded_dataset.csv")


üîç Dataset Loaded: (39119, 48)

 Categorical columns selected for One-Hot Encoding:
['st_spectype', 'discoverymethod']

 Unique value counts:
 - st_spectype: 289 unique categories
 - discoverymethod: 11 unique categories

 Shape before encoding: (39119, 48)
 Shape after encoding: (39119, 346)

 One-Hot Columns Added:
 - st_spectype_A
 - st_spectype_A0 V
 - st_spectype_A1 IV-V
 - st_spectype_A2
 - st_spectype_A2 V
 - st_spectype_A5
 - st_spectype_A5 V
 - st_spectype_A5-A6
 - st_spectype_A7 V
 - st_spectype_A8
 - st_spectype_A8 III
 - st_spectype_A8 V
 - st_spectype_A8Ve
 - st_spectype_A9/F0
 - st_spectype_Am C
 - st_spectype_B
 - st_spectype_B2 IV
 - st_spectype_B2.5 V
 - st_spectype_B9 IV
 - st_spectype_B9 V
 - st_spectype_B9 Vne
 - st_spectype_B9.5-A0
 - st_spectype_Be9.5/AO
 - st_spectype_DC
 - st_spectype_DQ
 - st_spectype_F
 - st_spectype_F V
 - st_spectype_F0
 - st_spectype_F0 IV
 - st_spectype_F1 V
 - st_spectype_F2
 - st_spectype_F2 V
 - st_spectype_F3
 - st_spectype_F3 V
 - s

In [44]:
import pandas as pd
import re

df = pd.read_csv("exo_final_cleaned_dataset_filled.csv")

def extract_spectral(stype):
    if pd.isna(stype):
        return pd.Series([None, None, None])
    match = re.match(r"([OBAFGKM])\s*([0-9.]*)\s*([IV]{1,3}|V|III|IV|II|I)?", str(stype))
    if match:
        return pd.Series([match.group(1), match.group(2), match.group(3)])
    return pd.Series([None, None, None])

df[['spectral_class', 'spectral_subclass', 'luminosity_class']] = df['st_spectype'].apply(extract_spectral)

df['spectral_subclass'] = pd.to_numeric(df['spectral_subclass'], errors='coerce').fillna(df['spectral_subclass'].mode()[0])

df = df.drop(columns=['st_spectype'])

categorical_cols = ['spectral_class', 'luminosity_class', 'discoverymethod']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

df_encoded.to_csv("exo_final_encoded_dataset.csv", index=False)

print("Encoded shape:", df_encoded.shape)
print("Saved as exo_final_encoded_dataset.csv")


Encoded shape: (39119, 69)
Saved as exo_final_encoded_dataset.csv
