In [1]:
# Importing dependencies
import numpy as np
import pandas as pd
#from math import pi, sqrt
from os import path, listdir

## Load data:

In [2]:
# Data directories:

# On PC:
#exoplanets_dir = path.expanduser("~/OneDrive/SEPHI_data/exoplanets/")
#phase_space_dir = path.expanduser("~/OneDrive/phase_space_density_data/classification/dr3/")

# File naming in the phase-space density directory:
# The first file is features_densities_gaiaedr3_6d_0_200000.csv
# The next file is 400,000
# Goes up in increments of 200,000

# On my laptop:
# NOTE(review): these locations are specific to one machine's home directory;
# edit them when running elsewhere. The active path below points at "edr3"
# while the commented-out PC path above points at "dr3" - confirm which data
# release is intended.
exoplanets_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/exoplanets/")
phase_space_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/phase_space_density_data/classification/edr3/")

# The date that the exoplanets data (NASA EA) was downloaded:
data_date = "2022_04_28"

In [3]:
# Read the exoplanets table downloaded on `data_date`:
exoplanets_file = path.join(exoplanets_dir, "exoplanets_" + data_date + ".csv")
exoplanets = pd.read_csv(exoplanets_file)

In [4]:
def load_data(data_dir):
    """
    Load and combine the 6d phase-space density tables found in a directory.

    Files are visited in sorted name order. A file is skipped (and never
    read) when its name contains "apg" or does not contain "6d". Every
    remaining file is read as a CSV with its first column used as the index,
    and the resulting tables are stacked row-wise.

    Parameters
    ----------
    data_dir : str
        Directory holding the phase-space density CSV files.

    Returns
    -------
    pandas.DataFrame
        All 6d tables concatenated along the rows, with a fresh 0..n-1 index
        (the per-file index read from the first column is discarded).

    Raises
    ------
    ValueError
        If no 6d file is found in ``data_dir``.
    """
    frames_6d = []
    for file in sorted(listdir(data_dir)):
        # Only 6d, non-"apg" files are wanted; skipping the rest before
        # reading avoids parsing files whose contents would be thrown away.
        # (5d files could be collected the same way with a `"5d" in file` test.)
        if "apg" in file or "6d" not in file:
            continue
        frames_6d.append(pd.read_csv(path.join(data_dir, file), index_col=0))

    if not frames_6d:
        # pd.concat would raise a less informative ValueError on an empty list.
        raise ValueError(f"No 6d phase-space density files found in {data_dir!r}")

    # concat with axis=0 appends the tables one below the other;
    # ignore_index=True renumbers the rows instead of keeping each file's index.
    return pd.concat(frames_6d, axis=0, ignore_index=True)

In [5]:
# Map the column names used in the phase-space density files onto the names
# used in the rest of this notebook. Each key appears once (a repeated key in
# a dict literal is silently overwritten by its last occurrence).
new_cols = {"Host": "gaia_source_id",
            "target_density": "psd",
            "class": "class_6d",
            "gm_p_low": "gm_p_low_6d",
            "gm_p_high": "gm_p_high_6d"}
# TODO: what about planets with Pnull > 0 .05??

# Load the 6d table and rename its columns in one step; rename() returns a new
# frame, so no in-place mutation is needed.
df_6d = load_data(phase_space_dir).rename(columns=new_cols)
#df_5d, df_6d = load_data( phase_space_dir )
# TODO: why is it telling me the directory etc.?

# I'm only using the 6d phase-space density results for now, but if I wanted to
# merge the 5d and 6d data frames, I would run the following:
#df = pd.merge(df_5d, df_6d[["Host", "class_6d", "gm_p_high_6d"]], on="Host")

# Crossmatch with the exoplanets data

In [6]:
# Look for Gaia source ids that occur more than once in df_6d; every
# occurrence after the first one is flagged.
duplicates = df_6d.duplicated(subset="gaia_source_id", keep="first")
# Row positions of the flagged entries (an empty array means none were found).
i = np.where(duplicates)
print(i)
# Result when this was run: there are no duplicated gaia ids in df_6d

(array([], dtype=int64),)


In [7]:
# Columns I am interested in:
# gaia_source_id = host, i.e. gaia ID
# psd = phase space density
# gm_p_low_6d
# gm_p_high_6d
# class_6d = phase space density class
psd_columns = ["gaia_source_id", "psd", "gm_p_low_6d", "gm_p_high_6d", "class_6d"]

# Left join: every row of exoplanets is kept, and the "EXOorPSD" indicator
# column records whether a match was found in df_6d ("both") or not ("left_only").
# Note that validate="many_to_many" performs no uniqueness check.
exoplanets = pd.merge(
    exoplanets,
    df_6d[psd_columns],
    on="gaia_source_id",
    how="left",
    indicator="EXOorPSD",
    validate="many_to_many",
)
#validate="many_to_one"
# TODO:
# error: Merge keys are not unique in right dataset; not a many-to-one merge

In [8]:
# Rows whose merge indicator is "both" were found in both tables.
is_matched = exoplanets["EXOorPSD"] == "both"
matched_psd = np.where(is_matched)
print(
    "Planets in exoplanets were matched with",
    matched_psd[0].size,
    "stellar phase-space-densities.",
)

Planets in exoplanets were matched with 1639 stellar phase-space-densities.


# Saving the data:

In [9]:
# Write the cross-matched table next to the input file, without the row index.
output_file = path.join(exoplanets_dir, "exoplanets_psd_" + data_date + ".csv")
exoplanets.to_csv(output_file, index=False)