In [1]:
# Importing dependencies
import numpy as np
import pandas as pd
#from math import pi, sqrt
from os import path, listdir

## Load data:

In [2]:
# Data directories.
# The PC paths are kept commented out; the laptop paths below are the active ones.

# On PC:
#exoplanets_dir = path.expanduser("~/OneDrive/SEPHI_data/exoplanets/")
#phase_space_dir = path.expanduser("~/OneDrive/phase_space_density_data/classification/dr3/")
# NOTE(review): the PC phase-space path ends in dr3/ while the laptop path
# below ends in edr3/ -- confirm which one is intended.

# Naming of the phase-space density files:
# The first file is features_densities_gaiaedr3_6d_0_200000.csv
# The next file is 400,000
# Goes up in increments of 200,000

# On my laptop:
exoplanets_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/exoplanets/")
phase_space_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/phase_space_density_data/classification/edr3/")

# The date that the exoplanets data (NASA EA) was downloaded.
# It is used to build the input and output CSV file names.
data_date = "2022_04_28"

## Load exoplanets

In [3]:
# Read the exoplanets table whose file name carries the download date.
exoplanets_csv = path.join(exoplanets_dir, "exoplanets_" + data_date + ".csv")
exoplanets = pd.read_csv(exoplanets_csv)

## Method for loading psd data:

In [4]:
def load_data(data_dir):
    """
    Load all 6d phase-space density CSV files in a directory into one frame.

    Directory entries are visited in sorted name order. Only entries whose
    name contains "6d" are read, and any entry whose name contains "apg" is
    skipped. Each file is read exactly once, with its first column used as
    the index. 5d files are not loaded.

    Parameters
    ----------
    data_dir : str
        Directory containing the phase-space density CSV files.

    Returns
    -------
    pandas.DataFrame
        The rows of every selected file stacked on top of each other, with a
        fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If the directory contains no matching 6d file.
    """
    # Filter on the file name *before* reading, so unrelated entries
    # (5d files, sub-directories, notes) are never parsed.
    frames_6d = [
        pd.read_csv(path.join(data_dir, file), index_col=0)
        for file in sorted(listdir(data_dir))
        if "6d" in file and "apg" not in file
    ]

    if not frames_6d:
        raise ValueError(f"No 6d phase-space density files found in {data_dir!r}")

    # concat with axis=0 stacks the per-file frames row-wise; ignore_index
    # discards the per-file indices and renumbers the rows from 0.
    return pd.concat(frames_6d, axis=0, ignore_index=True)

## Load psd data:

In [5]:
# Load the 6d phase-space density (psd) results.
df_6d = load_data(phase_space_dir)
#df_5d, df_6d = load_data( phase_space_dir )

# Map the raw psd column names onto the names used in the rest of the notebook.
# Each key appears once (a repeated key in a dict literal silently overrides
# the earlier entry).
new_cols = {"Host": "gaia_source_id",
            "target_density": "psd",
            "class": "class_6d",
            "gm_p_low": "gm_p_low_6d",
            "gm_p_high": "gm_p_high_6d"}
# TODO: what about planets with Pnull > 0.05?

# Reassign instead of renaming in place.
df_6d = df_6d.rename(columns=new_cols)
# TODO: why is it telling me the directory etc.?

# I'm only using the 6d phase-space density results for now, but if I wanted to
# merge the 5d and 6d data frames, I would run the following
# (note: after the rename above the key column in df_6d is "gaia_source_id",
# not "Host"):
#df = pd.merge(df_5d, df_6d[["Host", "class_6d", "gm_p_high_6d"]], on="Host", indicator="XXX")

In [17]:
# Info. on the psd data:
df_6d_len = df_6d.shape[0]
print("Length of 6d psd data:", df_6d_len)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line; wrapping it in print() adds a stray "None".
print("\nInfo:")
df_6d.info()

print("\nDescribe:\n", df_6d.describe())

Length of 6d psd data: 1824654
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824654 entries, 0 to 1824653
Data columns (total 17 columns):
 #   Column          Dtype  
---  ------          -----  
 0   gaia_source_id  int64  
 1   n_40pc_stars    int64  
 2   n_80pc_stars    int64  
 3   densities_mean  float64
 4   densities_std   float64
 5   densities_max   float64
 6   densities_min   float64
 7   psd             float64
 8   gm_p_low_6d     float64
 9   gm_p_high_6d    float64
 10  gm_mean_low     float64
 11  gm_mean_high    float64
 12  gm_cov_low      float64
 13  gm_cov_high     float64
 14  gm_aic          float64
 15  gm_bic          float64
 16  class_6d        int64  
dtypes: float64(13), int64(4)
memory usage: 236.7 MB

Info
 None

Describe
:        gaia_source_id  n_40pc_stars  n_80pc_stars  densities_mean  \
count    1.824654e+06  1.824654e+06  1.824654e+06    1.824654e+06   
mean     3.663105e+18  1.421200e+03  1.109246e+04    2.483017e-01   
std      2.035464e+18

# Crossmatch with the exoplanets data

In [6]:
# Look for gaia ids that occur more than once in df_6d.
# `duplicates` flags every repeat after the first occurrence of an id;
# `i` holds the row positions of those repeats (empty when all ids are unique).
duplicates = df_6d.duplicated(subset=["gaia_source_id"], keep="first")
i = np.nonzero(duplicates.to_numpy())
print(i)
# There are no duplicated gaia ids in df_6d

(array([], dtype=int64),)


In [7]:
# Columns taken from the psd table:
# gaia_source_id  (the host star's gaia ID, used as the merge key)
# psd             (phase space density)
# gm_p_low_6d
# gm_p_high_6d
# class_6d        (phase space density class)
psd_columns = ["gaia_source_id", "psd", "gm_p_low_6d", "gm_p_high_6d", "class_6d"]

# Left merge: every planet row is kept, and the "EXOorPSD" indicator column
# records whether a psd row was found ("both") or not ("left_only").
# Several planets can share one host star, so the left side may repeat a
# gaia_source_id, but each id appears only once in df_6d. The merge is therefore
# many-to-one, and validate="many_to_one" makes pandas raise if df_6d ever
# contains a repeated id ("many_to_many" performs no check at all).
exoplanets = pd.merge(
    exoplanets,
    df_6d[psd_columns],
    on="gaia_source_id",
    how="left",
    indicator="EXOorPSD",
    validate="many_to_one",
)

## Merge results:

In [11]:
# Row positions of planets with / without a matching phase-space density,
# taken from the merge indicator column.
has_psd = (exoplanets["EXOorPSD"] == "both").to_numpy()
exo_only = (exoplanets["EXOorPSD"] == "left_only").to_numpy()
matched_psd = np.flatnonzero(has_psd)
in_exo_only = np.flatnonzero(exo_only)

n_matched = matched_psd.size
n_unmatched = in_exo_only.size

print("Planets in exoplanets were matched with", n_matched, "stellar phase-space-densities.")
print("Planets in exoplanets with no phase-space-density:", n_unmatched)
# The two counts together should account for every row of the left merge.
print("matched_psd.size + in_exo_only.size =", n_matched + n_unmatched)

Planets in exoplanets were matched with 1639 stellar phase-space-densities.
Planets in exoplanets with no phase-space-density: 3375
matched_psd.size + in_exo_only.size = 5014


In [10]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line; wrapping it in print() adds a stray "None".
print("Info:")
exoplanets.info()

print("\nDescribe:\n", exoplanets.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5014 entries, 0 to 5013
Data columns (total 95 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   pl_name              5014 non-null   object  
 1   hostname             5014 non-null   object  
 2   gaia_designation     4828 non-null   object  
 3   NEA_sy_snum          5014 non-null   int64   
 4   NEA_sy_pnum          5014 non-null   int64   
 5   NEA_discoverymethod  5014 non-null   object  
 6   NEA_pl_period        4841 non-null   float64 
 7   NEA_pl_sma           2807 non-null   float64 
 8   NEA_pl_smaerr1       1996 non-null   float64 
 9   NEA_pl_smaerr2       1995 non-null   float64 
 10  NEA_pl_rad           3883 non-null   float64 
 11  NEA_pl_raderr1       3501 non-null   float64 
 12  NEA_pl_raderr2       3501 non-null   float64 
 13  NEA_pl_mass          2076 non-null   float64 
 14  NEA_pl_masserr1      1913 non-null   float64 
 15  NEA_pl_masserr2      

# Saving the data:

In [9]:
# Write the merged table next to the input file, tagged with the download date;
# the DataFrame index is not written out.
output_csv = path.join(exoplanets_dir, "exoplanets_psd_" + data_date + ".csv")
exoplanets.to_csv(output_csv, index=False)