##  EXPERIMENT USING STARS, EXOPLANETS AND QUASARS


In [29]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


## checking for missing/null values (extra caution before starting the new experiment)

In [30]:
# Sanity check before the experiment: for each of the stars, exoplanets and
# quasars datasets, print the per-column count of missing values followed by
# the frame summary (dtypes, non-null counts, memory usage).
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after every summary.
stars_df = pd.read_csv("../../data/full/stars_gaia_clean.csv")
print(stars_df.isnull().sum())
stars_df.info()

exo_df = pd.read_csv("../../data/full/exoplanets_gaia_enriched.csv")
print(exo_df.isnull().sum())
exo_df.info()

quasars_df = pd.read_csv("../../data/full/quasars_gaia_clean.csv")
print(quasars_df.isnull().sum())
quasars_df.info()

source_id          0
ra                 0
dec                0
parallax           0
pmra               0
pmdec              0
phot_g_mean_mag    0
bp_rp              0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   source_id        50000 non-null  int64  
 1   ra               50000 non-null  float64
 2   dec              50000 non-null  float64
 3   parallax         50000 non-null  float64
 4   pmra             50000 non-null  float64
 5   pmdec            50000 non-null  float64
 6   phot_g_mean_mag  50000 non-null  float64
 7   bp_rp            50000 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 3.1 MB
None
rowid                  0
pl_name                0
hostname               0
pl_letter              0
gaia_id                0
discoverymethod        0
disc_year              0
pl_orbper            

## Loading Data 


In [31]:
# Load the three catalogues and tag every row with its class label.
stars_df = pd.read_csv("../../data/full/stars_gaia_clean.csv")
stars_df = stars_df.assign(label="star")

exo_df = pd.read_csv("../../data/full/exoplanets_gaia_enriched.csv")
exo_df = exo_df.assign(label="exo_host")

quasars_df = pd.read_csv("../../data/full/quasars_gaia_clean.csv")
quasars_df = quasars_df.assign(label="quasar")

# Align column names: the exoplanet table uses sy_* names (e.g. sy_plx) for
# quantities that the star and quasar tables call parallax, pmra, pmdec and
# phot_g_mean_mag.
exo_column_map = {
    "sy_plx": "parallax",
    "sy_pmra": "pmra",
    "sy_pmdec": "pmdec",
    "sy_gaiamag": "phot_g_mean_mag",
}
exo_df = exo_df.rename(columns=exo_column_map)

# Features shared by all three datasets.
features = [
    "ra",
    "dec",
    "parallax",
    "pmdec",
    "pmra",
    "phot_g_mean_mag",
]

# Total dataset sizes to run the experiment with.
sizes = [10000, 20000, 37800]







In [11]:
# Keep only the exoplanet-host rows that have a value in every feature column.
has_all_features = exo_df[features].notna().all(axis=1)
exo_df = exo_df[has_all_features]


In [33]:
from sklearn.model_selection import train_test_split

# Reduce each dataset to the shared feature columns plus the class label, and
# drop every row that has a missing value in any of those columns.
stars_df = stars_df[features + ["label"]].dropna()
exo_df = exo_df[features + ["label"]].dropna()
quasars_df = quasars_df[features + ["label"]].dropna()

# Combine into one multiclass dataframe. ignore_index=True gives the combined
# frame a fresh 0..N-1 index; without it each frame keeps its own row labels,
# which overlap between the three datasets and make label-based lookups
# (.loc) on df / X / y ambiguous.
df = pd.concat([stars_df, exo_df, quasars_df], ignore_index=True)

# Split into X (features only) and y (labels).
X = df[features]
y = df["label"]


def stratified_subset(X, y, n, seed=42):
    """Return a stratified subset of ``n`` samples drawn from ``X`` and ``y``.

    Args:
        X: Feature table; must support ``.copy()`` and be accepted by
            ``train_test_split``.
        y: Labels aligned with ``X``; also used as the stratification key.
        n: Requested number of samples, passed to ``train_test_split`` as
            ``train_size``.
        seed: Random state forwarded to ``train_test_split``.

    Returns:
        A ``(X_subset, y_subset)`` pair. When ``n`` is at least ``len(y)``
        nothing is sampled and copies of the full inputs are returned.
    """
    if n < len(y):
        # train_test_split returns [X_train, X_test, y_train, y_test]; only
        # the "train" halves (the n-sized stratified draw) are kept.
        parts = train_test_split(
            X,
            y,
            train_size=n,
            stratify=y,
            random_state=seed,
        )
        return parts[0], parts[2]

    # Asking for at least as many rows as exist: hand back the whole dataset.
    return X.copy(), y.copy()

# Build one class-balanced (X, y) subset per requested total size, using the
# labeled-feature datasets only.
subsets = {}
class_frames = (stars_df, exo_df, quasars_df)
# A class cannot contribute more rows than it has, so the per-class quota is
# capped at the size of the smallest class.
smallest_class = min(len(frame) for frame in class_frames)

for n_total in sizes:
    # Equal share per class; may be less than n_total // 3 because of the cap.
    n_per = min(n_total // 3, smallest_class)

    # Fixed seed so every class draw is reproducible.
    stars_sub, exo_sub, quasars_sub = (
        frame.sample(n=n_per, random_state=42) for frame in class_frames
    )

    df_sub = pd.concat([stars_sub, exo_sub, quasars_sub], ignore_index=True)
    X_n, y_n = df_sub[features], df_sub["label"]
    subsets[n_total] = (X_n, y_n)

