##  EXPERIMENT USING STARS, EXOPLANETS AND QUASARS


In [25]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


## Checking for missing/null values (extra caution before starting the new experiment)

In [26]:
# Sanity check: load each cleaned catalogue (stars, exoplanets, quasars) and
# print its per-column null counts followed by its schema summary.
stars_df = pd.read_csv("../../data/full/stars_gaia_clean.csv")
exo_df = pd.read_csv("../../data/full/exoplanets_clean.csv")
quasars_df = pd.read_csv("../../data/full/quasars_gaia_clean.csv")

for catalogue in (stars_df, exo_df, quasars_df):
    null_counts = catalogue.isnull().sum()
    print(null_counts)
    # DataFrame.info() writes its report itself and returns None, so this
    # also prints a trailing "None" after each summary.
    print(catalogue.info())

source_id          0
ra                 0
dec                0
parallax           0
pmra               0
pmdec              0
phot_g_mean_mag    0
bp_rp              0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   source_id        50000 non-null  int64  
 1   ra               50000 non-null  float64
 2   dec              50000 non-null  float64
 3   parallax         50000 non-null  float64
 4   pmra             50000 non-null  float64
 5   pmdec            50000 non-null  float64
 6   phot_g_mean_mag  50000 non-null  float64
 7   bp_rp            50000 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 3.1 MB
None
rowid              0
pl_name            0
hostname           0
pl_letter          0
gaia_id            0
discoverymethod    0
disc_year          0
pl_orbper          0
pl_orbsmax         0
st_teff

## Loading Data 


In [27]:
# Load the three cleaned catalogues and tag every row with its class label.
stars_df = pd.read_csv("../../data/full/stars_gaia_clean.csv")
stars_df["label"] = "star"

# The exoplanet table uses sy_* column names; rename them to the names the
# stars and quasars tables use so that all three share one schema.
exo_df = (
    pd.read_csv("../../data/full/exoplanets_clean.csv")
    .rename(
        columns={
            "sy_plx": "parallax",
            "sy_pmra": "pmra",
            "sy_pmdec": "pmdec",
            "sy_gaiamag": "phot_g_mean_mag",
        }
    )
    .assign(label="exo_host")
)

quasars_df = pd.read_csv("../../data/full/quasars_gaia_clean.csv")
quasars_df["label"] = "quasar"

# Feature columns shared by all three catalogues.
features = ["ra", "dec", "parallax", "pmdec", "pmra", "phot_g_mean_mag"]

# Dataset sizes to experiment with.
sizes = [10000, 20000, 37800]







In [28]:
# Keep only the exoplanet rows that have a value for every shared feature.
exo_df = exo_df[exo_df[features].notna().all(axis=1)]


In [29]:
from sklearn.model_selection import train_test_split

# Reduce every catalogue to the shared features plus the label and discard
# any row that still has a missing value.
stars_df, exo_df, quasars_df = (
    frame[features + ["label"]].dropna()
    for frame in (stars_df, exo_df, quasars_df)
)

# Stack the three catalogues so they can be treated as one multiclass dataset.
df = pd.concat([stars_df, exo_df, quasars_df])

# X holds the numeric features only; y holds the class labels.
X, y = df[features], df["label"]


def _balanced_subset(n_total):
    """Return (features, labels) with the same number of rows drawn per class."""
    # Each class contributes a third of the requested total, capped by the
    # smallest catalogue so that sampling without replacement is possible.
    n_per = min(n_total // 3, len(stars_df), len(exo_df), len(quasars_df))
    sampled = [
        frame.sample(n=n_per, random_state=42)
        for frame in (stars_df, exo_df, quasars_df)
    ]
    merged = pd.concat(sampled, ignore_index=True)
    return merged[features], merged["label"]


# Balanced (features, labels) pair for every requested dataset size.
subsets = {n_total: _balanced_subset(n_total) for n_total in sizes}



## Baseline sanity check (DummyClassifier)
### I have 3 kinds of objects: stars, exoplanets, and quasars.
### To see whether the model does better than random guessing, I build a "dummy" model that just guesses in a simple way and check how badly it performs.
### If the main model beats this dummy, I know it is doing real learning.

In [30]:
# Diagnostics for the exoplanet table: its columns, the fraction of missing
# values per shared feature, and its row count.
print(exo_df.columns)
print(exo_df[features].isna().mean().sort_values(ascending=False))
# exo_df has already been passed through dropna in the earlier cells, so the
# count reported here is the number of rows that survived, not the raw size.
print("exo rows after dropna:", len(exo_df))



Index(['ra', 'dec', 'parallax', 'pmdec', 'pmra', 'phot_g_mean_mag', 'label'], dtype='object')
ra                 0.0
dec                0.0
parallax           0.0
pmdec              0.0
pmra               0.0
phot_g_mean_mag    0.0
dtype: float64
exo rows before dropna: 37871


In [31]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score

# Baseline: score a majority-class predictor on every balanced subset.
# The evaluation lives inside the loop over sizes; previously the loop only
# rebound X_n / y_n, so just the last size was ever evaluated.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

for n in sizes:
    X_n, y_n = subsets[n]

    accs, f1s = [], []
    for train_idx, test_idx in splitter.split(X_n, y_n):
        clf = DummyClassifier(strategy="most_frequent", random_state=42)
        clf.fit(X_n.iloc[train_idx], y_n.iloc[train_idx])
        pred = clf.predict(X_n.iloc[test_idx])
        accs.append(accuracy_score(y_n.iloc[test_idx], pred))
        f1s.append(f1_score(y_n.iloc[test_idx], pred, average="macro"))

    # Mean accuracy and mean macro-F1 over the splits for this subset size.
    print(f"Dummy most_frequent (n={n}):", np.mean(accs), np.mean(f1s))



Dummy most_frequent: 0.33333333333333337 0.16666666666666669


## Logistic Regression, Random Forest and Multilayer Perceptron (main supervised models)

In [33]:
## Main supervised models (LogReg, RF, MLP)

# Align column names: the exoplanet table uses sy_* names (e.g. sy_plx) while
# the stars and quasars tables use parallax, pmra, pmdec and phot_g_mean_mag.
exo_df = exo_df.rename(
    columns={
        "sy_plx": "parallax",
        "sy_pmra": "pmra",
        "sy_pmdec": "pmdec",
        "sy_gaiamag": "phot_g_mean_mag",
    }
)

# Feature columns shared by all three catalogues.
features = ["ra", "dec", "parallax", "pmdec", "pmra", "phot_g_mean_mag"]

# Dataset sizes to evaluate.
sizes = [10000, 20000, 37800]

# Working copies restricted to the shared features plus the label, with any
# incomplete rows removed.
stars, exo, quasars = (
    frame[features + ["label"]].dropna()
    for frame in (stars_df, exo_df, quasars_df)
)

def eval_size(n_total, n_splits=30):
    """Score a scaled logistic-regression pipeline on a class-balanced sample.

    An equal number of rows is drawn from the module-level ``exo``, ``stars``
    and ``quasars`` frames, and the pipeline is refit and scored on each of
    ``n_splits`` stratified shuffle splits that hold out 20% of the rows.

    Parameters
    ----------
    n_total : int
        Requested total number of rows; each class contributes
        ``n_total // 3`` rows, capped by the smallest of the three frames.
    n_splits : int, default 30
        Number of stratified shuffle splits to average over.

    Returns
    -------
    dict
        ``size`` (rows actually used), ``acc_mean`` (mean accuracy) and
        ``f1_mean`` (mean macro-averaged F1) across the splits.
    """
    n_per = min(n_total // 3, len(exo), len(stars), len(quasars))

    # Every class is sampled with the same fixed seed so runs are repeatable.
    sampled = [
        frame.sample(n=n_per, random_state=42)
        for frame in (exo, stars, quasars)
    ]
    data = pd.concat(sampled, ignore_index=True)
    feature_matrix = data[features].values
    labels = data["label"].values

    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000))
    cv = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=0.2, random_state=42
    )

    accs, f1s = [], []
    for train_idx, test_idx in cv.split(feature_matrix, labels):
        # fit() retrains the whole pipeline from scratch on this split.
        model.fit(feature_matrix[train_idx], labels[train_idx])
        y_true = labels[test_idx]
        y_pred = model.predict(feature_matrix[test_idx])
        accs.append(accuracy_score(y_true, y_pred))
        f1s.append(f1_score(y_true, y_pred, average="macro"))

    return dict(
        size=3 * n_per,
        acc_mean=np.mean(accs),
        f1_mean=np.mean(f1s),
    )

# Print the averaged scores for each configured dataset size.
for summary in map(eval_size, sizes):
    print(summary)

{'size': 9999, 'acc_mean': np.float64(0.9774833333333335), 'f1_mean': np.float64(0.9774779968974232)}
{'size': 19998, 'acc_mean': np.float64(0.9794500000000002), 'f1_mean': np.float64(0.9794462269210071)}
{'size': 37800, 'acc_mean': np.float64(0.9796869488536154), 'f1_mean': np.float64(0.9796832753779882)}
