In [2]:
import numpy as np
import pandas as pd
from math import pi, sqrt
from os import path

import astropy.table #import tables
from astropy import units as u
from astropy import constants as const

#from preprocessing.calc_stellar_params import calc_luminosity, calc_temp
from preprocessing.analyse_errs import classify_err
#from preprocessing.calc_sephi import get_sephi_RM17


In [3]:
# Data directory: every input/output file below lives under this root.
# Swap to the commented-out root when working from the university OneDrive.
data_dir = "~/OneDrive/SEPHI_data/"
#data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"

# NASA Exoplanet Archive table:
NASAEA_dir = path.join(data_dir, "NASA_EA_2022_02_09.csv")
NEA_header_length = 116 # The length of the header in the exoplanets csv file

# CKS stellar catalogue:
CKS_stars_dir = path.join(data_dir, "CKSII_2017_stars.tsv")
CKS_col_start = 100 # The row containing the column headers

# Gaia DR2 query result:
dr2_directory = path.join(data_dir, "dr2-result.csv")

# Output csv listing the host names. This previously still pointed at the
# university OneDrive root while every other path used the root above; it is
# now derived from data_dir like the rest.
host_names_dir = path.join(data_dir, "host_names.csv")



In [4]:
# Load the exoplanet table from the data directory.
exoplanets_csv = path.join(data_dir, "exoplanets.csv")
exoplanets = pd.read_csv(exoplanets_csv)

# Column notes:
#   pl_orbper  = orbital period [days]
#   pl_orbsmax = orbit semi-major axis [au]
#   "st_spectype" is excluded due to csv formatting

In [5]:
print(exoplanets.head())

      pl_name  hostname                   designation  sy_snum  sy_pnum  \
0    11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1    11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2    14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3    14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4  16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   

   discoverymethod  NEA_pl_orbper  NEA_pl_orbsmax  NEA_pl_rade  NEA_pl_bmasse  \
0  Radial Velocity      326.03000            1.29          NaN      6165.6000   
1  Radial Velocity      516.21997            1.53          NaN      4684.8142   
2  Radial Velocity      185.84000            0.83          NaN      1525.5000   
3  Radial Velocity     1773.40002            2.93          NaN      1481.0878   
4  Radial Velocity      798.50000            1.66          NaN       565.7374   

   ...  NEA_age  NEA_ageerr1  NEA_ageerr2            source_id

In [6]:
# Not many of the exoplanets have the stellar age listed. Working out how many have stellar age:
#print("Number of exoplanets with no stellar age listed: ", exoplanets["st_age"].isna().sum())
#print("Number of exoplanets with stellar age listed: ", exoplanets["st_age"].notna().sum())

In [7]:
# Not many have stellar luminosity listed (but this can be calculated):
#print("Number of exoplanets with no L listed: ", exoplanets["st_lum"].isna().sum())
#print("Number of exoplanets with L listed: ", exoplanets["st_lum"].notna().sum())

# Do more have their luminosities listed in Gaia?

In [8]:
print(exoplanets.shape[0])

4734


In [9]:
# Array to store classification:
"""
classes_L = np.zeros( (exoplanets.shape[0]), dtype=int )
#print(classes_L[0:40])

# Classify the luminosity uncertainties using the function:
for i in range(exoplanets.shape[0]):
    classes_L[i] = classify_err(exoplanets["st_lum"].iloc[i], exoplanets["st_lumerr1"].iloc[i], exoplanets["st_lumerr2"].iloc[i], exoplanets["calc_L%err1"].iloc[i], exoplanets["calc_L%err2"].iloc[i])
                                                                                                             
#print(classes_L[0:100])
#print(len(classes_L))
"""

'\nclasses_L = np.zeros( (exoplanets.shape[0]), dtype=int )\n#print(classes_L[0:40])\n\n# Classify the luminosity uncertainties using the function:\nfor i in range(exoplanets.shape[0]):\n    classes_L[i] = classify_err(exoplanets["st_lum"].iloc[i], exoplanets["st_lumerr1"].iloc[i], exoplanets["st_lumerr2"].iloc[i], exoplanets["calc_L%err1"].iloc[i], exoplanets["calc_L%err2"].iloc[i])\n                                                                                                             \n#print(classes_L[0:100])\n#print(len(classes_L))\n'

In [37]:
def calc_percent_errs(values, errs):
    """
    Convert absolute errors into unsigned percentage errors.

    values = measured values (pandas Series, numpy array or other array-like)
    errs = absolute errors on those values, same length as values

    returns percent_errs = numpy array of |errs / values| * 100

    Elements are paired by position. A NaN in either input gives NaN; a zero
    value gives inf (or NaN when its error is zero as well).
    """

    # np.asarray accepts Series, arrays and lists alike, and the float cast
    # lets integer inputs be divided without raising.
    values = np.asarray(values, dtype=float)
    errs = np.asarray(errs, dtype=float)

    # Zero or missing values are expected in the catalogue columns, so the
    # resulting inf/NaN are returned quietly instead of emitting warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        percent_errs = np.absolute(errs / values) * 100
    return percent_errs

def combined_percent_errs(values, errs1, errs2):
    """
    Average the upper and lower percentage errors of each value.

    values = values array/df
    errs1 = +ve errors array/df
    errs2 = -ve errors array/df

    returns the element-wise mean of the two percentage errors computed by
    calc_percent_errs
    """

    # calc_percent_errs takes care of converting its inputs
    upper_percent = calc_percent_errs(values, errs1)
    lower_percent = calc_percent_errs(values, errs2)

    return (upper_percent + lower_percent) / 2

In [38]:
# Flag which luminosity (NEA_lum or NEAc_lum) has the smaller combined % error.
# lum_flag values:
#   0 = both combined errors are equal
#   1 = NEA error is smaller, or only NEA has an error
#   2 = NEAc error is smaller, or only NEAc has an error
#   3 = neither has an error (both NaN)

# Sanity check: the +ve and -ve percentage errors on NEA_lum and their mean
NEA_percent_errs1 = calc_percent_errs(exoplanets["NEA_lum"], exoplanets["NEA_lumerr1"])
print(NEA_percent_errs1)
NEA_percent_errs2 = calc_percent_errs(exoplanets["NEA_lum"], exoplanets["NEA_lumerr2"])
print(NEA_percent_errs2)

# Manual mean; should match NEA_comb_errs computed below
blah = (NEA_percent_errs1 + NEA_percent_errs2)/2
print(blah)

# Calc the combined percentage errors
NEA_comb_errs = combined_percent_errs(exoplanets["NEA_lum"], exoplanets["NEA_lumerr1"], exoplanets["NEA_lumerr2"] )
NEAc_comb_errs = combined_percent_errs(exoplanets["NEAc_lum"], exoplanets["NEAc_lumerr1"], exoplanets["NEAc_lumerr2"] )

# Comparisons against NaN are always False, so missing errors need their own masks
NEA_missing = np.isnan(NEA_comb_errs)
NEAc_missing = np.isnan(NEAc_comb_errs)

# Find which error is smaller, and assign a flag.
# Start from NaN rather than np.empty so that no entry can ever hold
# uninitialised memory.
lum_flag = np.full(exoplanets.shape[0], np.nan)

# Condition 2: if NEA errs = NEAc errs
c2 = np.where(NEA_comb_errs == NEAc_comb_errs)
lum_flag[c2] = 0

# Condition 3: if NEA errs < NEAc errs, or only NEA has errs
c3 = np.where( (NEA_comb_errs < NEAc_comb_errs) | (~NEA_missing & NEAc_missing) )
lum_flag[c3] = 1

# Condition 4: if NEA errs > NEAc errs, or only NEAc has errs
c4 = np.where( (NEA_comb_errs > NEAc_comb_errs) | (NEA_missing & ~NEAc_missing) )
lum_flag[c4] = 2

# Condition 5: both NEA errs and NEAc errs are NaN
c5 = np.where(NEA_missing & NEAc_missing)
lum_flag[c5] = 3





[16.34835416         nan         nan ...         nan 50.88713056
         nan]
[19.57197329         nan         nan ...         nan 12.66421801
         nan]
[17.96016373         nan         nan ...         nan 31.77567428
         nan]


In [None]:
#print("Number of exoplanets with errors in st_lum < calc_L or no dcalc_L: ", np.count_nonzero(classes_L == 1),"\nNB: there are 909 planets with st_lum listed.") 
#print("Number of exoplanets with errors in calc_L < st_lum or no dst_lum: ", np.count_nonzero(classes_L == 2))

In [None]:
# Add the error class column to the exoplanets data frame
#exoplanets[ "Lerr_class" ] = classes_L
#print(exoplanets)

In [None]:
# Estimate T for stars in 'exoplanets'
# Each row of temps holds the 5 values returned by calc_temp for one planet; a later
# cell stores them as calc_T, calc_Terr1, calc_T%err1, calc_Terr2, calc_T%err2.
# NOTE(review): the only import of calc_temp in the imports cell is commented out, so
# this cell raises NameError on a fresh kernel unless calc_temp is defined elsewhere - confirm.
# NOTE(review): the printed exoplanets.head() shows NEA_-prefixed column names; confirm
# that the st_* columns read here still exist in exoplanets.csv.
temps = np.zeros( (exoplanets.shape[0], 5) )
for i in range(exoplanets.shape[0]):
    temps[i] = calc_temp(exoplanets["st_rad"].iloc[i], exoplanets["st_raderr1"].iloc[i], exoplanets["st_raderr2"].iloc[i], exoplanets["st_lum"].iloc[i], exoplanets["st_lumerr1"].iloc[i], exoplanets["st_lumerr2"].iloc[i]) 

print(temps)

In [None]:
# Add calc_teff to exoplanets df:
exoplanets[ ["calc_T", "calc_Terr1", "calc_T%err1", "calc_Terr2", "calc_T%err2"] ] = temps
#print(exoplanets.head(10))
# Temps and errors look good!

In [None]:
# Compare the uncertainties in st_teff and calc_teff:

# Columns handed to classify_err for each planet, in argument order:
teff_err_inputs = [
    exoplanets[col]
    for col in ("st_teff", "st_tefferr1", "st_tefferr2", "calc_T%err1", "calc_T%err2")
]

# One integer class per planet:
n_planets = exoplanets.shape[0]
classes_T = np.zeros(n_planets, dtype=int)
for row in range(n_planets):
    classes_T[row] = classify_err(*[column.iloc[row] for column in teff_err_inputs])

print(classes_T[0:100])
#print(len(classes_T))

In [None]:
# Counting how many st_teff have smaller error and how many temp have smaller error
#print("Number of exoplanets with errors in st_teff < calc_T or incomplete errors on calc_teff: ", np.count_nonzero(classes_T == 1),"\nNB: there are 4507 planets with st_teff listed.") 
#print("Number of exoplanets with errors in calc_T < st_teff or incomplete errors on st_teff: ", np.count_nonzero(classes_T == 2))

In [None]:
# Adding the classes_T array to the exoplanets df
exoplanets["Terr_class"] = classes_T
#print(exoplanets)

In [None]:
# Attach astropy units to the planet and star columns; these arrays are the arguments
# passed to get_sephi_RM17 further down (teff and lum are built in later cells).
# NOTE(review): the printed exoplanets.head() shows NEA_pl_bmasse, NEA_pl_rade,
# NEA_pl_orbsmax and NEA_age; confirm the unprefixed column names used here still exist.
pl_mass = exoplanets["pl_bmasse"].to_numpy() * u.earthMass
pl_rad = exoplanets["pl_rade"].to_numpy() * u.earthRad
pl_a = exoplanets["pl_orbsmax"].to_numpy() * u.AU
#teff = exoplanets["st_teff"].to_numpy() * u.K
#lum = exoplanets["st_lum"].to_numpy() * u.dex(u.L_sun)
st_mass = exoplanets["st_mass"].to_numpy() * u.solMass
age = exoplanets["st_age"].to_numpy() * u.Gyr
print("planet mass: ", pl_mass, "\n data shape: ", pl_mass.shape)
      #, pl_rad, pl_ a)

In [None]:
from preprocessing.calc_sephi import get_sephi_RM17

In [None]:
#sephi = get_sephi_RM17(pl_mass, pl_rad, pl_a, teff, lum, st_mass, age, verbose=False)

In [None]:
#print(sephi)
#print(sephi.shape)

# Checking whether uncertainties are the right sign (+/-ve)
#print( np.where( exoplanets["calc_Terr1"] <0 ) )
#print( np.where( exoplanets["calc_Terr2"] >0 ) )
#print( (np.where( np.isnan( exoplanets["calc_Terr2"].head(100)) ) ) )

In [None]:
# Collate the 'best' stellar luminosity for each planet, chosen by Lerr_class.
# NOTE(review): the cell that adds "Lerr_class" to exoplanets is commented out earlier
# in the notebook - confirm the column exists before running this cell.
Lerr_class = exoplanets["Lerr_class"].to_numpy()

# Row indices for each case:
c1 = np.where( (Lerr_class == 0) | (Lerr_class == 1) ) # st_lum has the smaller uncertainties
#print(c1[0])
#print(c1[0].size)
c2 = np.where( Lerr_class == 2 ) # calc_L has the smaller uncertainties
c3 = np.where( Lerr_class == 3 ) # neither st_lum nor calc_L has uncertainties, or both are NaN

# Array to store the 'best' luminosities (rows matching no class stay 0):
lum = np.zeros(exoplanets.shape[0])
print(lum.size)

st_lum_values = exoplanets["st_lum"].to_numpy()
lum[c1] = st_lum_values[c1] # st_lum has smaller uncertainties
calc_L_values = exoplanets["calc_L"].to_numpy()
lum[c2] = calc_L_values[c2] # calc_L has smaller uncertainties
lum[c3] = calc_L_values[c3] # these are all NaN

lum = lum * u.dex(u.L_sun)
print(lum)

# NB: lums and teffs that aren't NaN but are class 3 do not have uncertainties. Are these reliable results?

In [None]:
# Collate the 'best' effective temperature for each planet, chosen by Terr_class.
Terr_class = exoplanets["Terr_class"].to_numpy()

# Row indices for each case:
c1 = np.where( (Terr_class == 0) | (Terr_class == 1) ) # st_teff is used
#print(c1[0])
#print(c1[0].size)
c2 = np.where( Terr_class == 2 ) # calc_T is used
c3 = np.where( Terr_class == 3 ) # calc_T is used

# Array to store the 'best' temperatures (rows matching no class stay 0):
teff = np.zeros(exoplanets.shape[0])
print(teff.size)

st_teff_values = exoplanets["st_teff"].to_numpy()
teff[c1] = st_teff_values[c1] # st_teff has smaller uncertainties
calc_T_values = exoplanets["calc_T"].to_numpy()
teff[c2] = calc_T_values[c2] # calc_T has smaller uncertainties
teff[c3] = calc_T_values[c3] # these are all NaN

teff = teff * u.K
#print(teff)

In [None]:
sephi = get_sephi_RM17(pl_mass, pl_rad, pl_a, teff, lum, st_mass, age, verbose=False)

In [None]:
print(sephi)
c1 = np.where( (sephi != 0) & np.isfinite(sephi) )
print(c1[0].size)
print(sephi[c1])
# TODO: These SEPHI values are INCREDIBLY small, is calc_sephi.py okay?
# It seemed okay when I tested/adapted it, see how it does when I have more params available

In [None]:
exoplanets.rename(columns={ "gaia_id": "designation" }, inplace=True )

In [None]:
# TODO: match the dr2 stellar params with NASA EA

# Things from Michal's exoplanets_gaia_crossmatch:
# NOTE: this binds a second name to the SAME DataFrame (no copy is made), so the
# in-place edits made through exoplanets2 in the following cells also change exoplanets.
# Later cells rely on this: exoplanets[["Host"]] is written to csv although "Host" is
# only ever added via exoplanets2.
exoplanets2 = exoplanets
#print(exoplanets2["designation"])
# The Gaia ID in exoplanets is the DR followed by the star source ID (the designation)

In [None]:
# Extract the numeric Gaia source ID: split the designation on its last space and keep
# the trailing part, e.g. "Gaia DR2 3946945413106333696" -> 3946945413106333696.
# NOTE(review): the int64 cast presumably fails if any designation is missing - confirm none are.
exoplanets2["source_id"] = exoplanets2["designation"].str.rsplit(" ", n=1, expand=True)[1].astype("int64")
# This is the gaia_id without the DR listed
#print(exoplanets2["source_id"])
# The same source_id as listed in Gaia

In [None]:
print(exoplanets2["hostname"])

In [None]:
# Drop the designation column in place; re-running this cell raises KeyError because
# the column is already gone.
exoplanets2.drop(["designation"], axis=1, inplace=True)
# Host = hostname with all spaces removed
exoplanets2["Host"] = exoplanets2["hostname"].str.replace(" ", "")
#print(exoplanets2["Host"])
#exoplanets2.drop_duplicates(subset=["Host"], inplace=True)
# TODO: I don't know what to do about that line, I think I want duplicates (at least later on so that all planets are listed with their respective star, even if stars are repeated)

In [None]:
# Write the host names to csv. Only the "Host" column is selected here.
# NOTE(review): the original comment said the csv contains source_id as well; add
# "source_id" to the column selection if it is needed - confirm.
exoplanets[["Host"]].to_csv(host_names_dir, index=False) #create a csv containing the Host column

In [None]:
# Gaia DR2 columns to load: the identifiers plus Teff, radius and luminosity,
# each with its lower and upper percentile.
dr2_columns = [
    "designation", "source_id",
    "teff_val", "teff_percentile_lower", "teff_percentile_upper",
    "radius_val", "radius_percentile_lower", "radius_percentile_upper",
    "lum_val", "lum_percentile_lower", "lum_percentile_upper",
]
dr2 = pd.read_csv(dr2_directory, usecols=dr2_columns)


In [None]:
print(dr2.shape[0])

In [None]:
exoplanets2 = pd.merge(exoplanets, dr2, on="source_id", how="left")

In [None]:
print(exoplanets2.shape[0])
# It looks like not all the exoplanet hosts are in dr2
# Some might only be in edr3 (in which case, we defo don't have GAIA L for them) 
# I don't want any removed

# TODO: [21/03/2021] - Are you sure that these are all of the exoplanet hosts in DR2? 
# List the Gaia IDs and search DR2 for those stars using the basic search website

In [None]:
#print(exoplanets2)

In [None]:
# Convert the Gaia percentile columns into signed +/- uncertainties about the
# central value. The percentile columns are overwritten in place; the next cell
# renames them to gaia_*err1 (upper) / gaia_*err2 (lower).
# NB: run this cell once only - re-running it subtracts the central value again.

# Changing Teff errors to +/- uncertainties:
exoplanets2["teff_percentile_upper"] = exoplanets2["teff_percentile_upper"] - exoplanets2["teff_val"] # should come out +ve
exoplanets2["teff_percentile_lower"] = exoplanets2["teff_percentile_lower"] - exoplanets2["teff_val"] # should come out -ve

# Changing radius errors to +/- uncertainties:
gaia_raderr1 = exoplanets2["radius_percentile_upper"] - exoplanets2["radius_val"] # should come out +ve
gaia_raderr2 = exoplanets2["radius_percentile_lower"] - exoplanets2["radius_val"] # should come out -ve
# Store the offsets back in the table, as is done for Teff. Previously only the
# local variables were set, so the renamed gaia_raderr columns kept the raw percentiles.
exoplanets2["radius_percentile_upper"] = gaia_raderr1
exoplanets2["radius_percentile_lower"] = gaia_raderr2

# Changing luminosity errors to +/- uncertainties:
gaia_lumerr1 = exoplanets2["lum_percentile_upper"] - exoplanets2["lum_val"] # should come out +ve
gaia_lumerr2 = exoplanets2["lum_percentile_lower"] - exoplanets2["lum_val"] # should come out -ve
# Same fix as for the radius: write the offsets into the columns that get renamed.
exoplanets2["lum_percentile_upper"] = gaia_lumerr1
exoplanets2["lum_percentile_lower"] = gaia_lumerr2

# TODO: Gaia lum is in solar units. Should I convert them or the Nasa EA units?

In [None]:
# Rename the Gaia percentile columns to the err1 (upper) / err2 (lower) naming.
gaia_err_names = {
    "teff_percentile_upper": "gaia_tefferr1",
    "teff_percentile_lower": "gaia_tefferr2",
    "radius_percentile_upper": "gaia_raderr1",
    "radius_percentile_lower": "gaia_raderr2",
    "lum_percentile_upper": "gaia_lumerr1",
    "lum_percentile_lower": "gaia_lumerr2",
}
exoplanets2.rename(columns=gaia_err_names, inplace=True)

In [None]:
print(exoplanets2)

In [None]:
# TODO: Michal then does some further processing that I don't understand yet (kept below inside a string literal so that it does not run):
"""
#exoplanets.drop(["pl_name", "hostname"], axis=1, inplace=True)

gaia = gaia[~gaia["source_id"].isin(exoplanets["source_id"])]

gaia = gaia[4.5 < gaia["parallax"] / gaia["parallax_error"]] #removes stars with with parallax/parallax error > 4.5

# Concatenate exoplanet hosts back, however at the top of the dataframe. This way for testing purposes we later
# iterate only over first 1065 entries that are exoplanet hosts.
gaia = pd.concat([exoplanets2, gaia]) #adding the exoplanet list back into the gaia df, at the top

# Calculate distance in pc and drop any stars with negative or null distance
gaia["distance_pc"] = (1. / gaia["Fparallax"]) * 1000 #closely aligned sources are only occasionally resolved in Gaia, confusion in observation-to-source matching can lead to spurious parallax values which are either very large or have a negative value very far away from zero
gaia = gaia[gaia["distance_pc"] > 0] #returns all of gaia for which distance_pc > 0 and overwrites the gaia df with it. Gets rid of all entries where distance_pc <= 0. For these entries, the solution returned by gaia is unphysical so we want to ditch it
    
# Convert from degrees to pc
gaia["ra"] = (gaia["ra"] * np.pi) / 180.
gaia["dec"] = (gaia["dec"] * np.pi) / 180.
"""

In [None]:
# TODO: match with CKS stellar catalogue
# TODO: redo best param search (flag where CKS used)
# TODO: calc sephi for CKS planets only
# TODO: match with CKS planet params
# TODO: add the sephi to the data frame

In [None]:
# Read the CKS stellar catalogue: star name, Teff, mass and log10(age), each with its
# two uncertainty columns (presumably e_ = lower and E_ = upper - confirm).
# pandas is imported as pd in this notebook, so the previous `pandas.read_csv` call
# raised NameError.
# NOTE(review): "eTeff" and the repeated "e_Mass" in the previous column list looked like
# typos for "e_Teff" and "E_Mass"; confirm against the header row of the TSV.
cks_stars = pd.read_csv(CKS_stars_dir, sep = ';', header = CKS_col_start, usecols=[ "Name", "Teff", "e_Teff", "E_Teff", "Mass", "e_Mass", "E_Mass", "logA", "e_logA", "E_logA" ])
#"Npl", "logg", "e_logg", "E_logg", "[Fe/H]",
# Name = Star name: mostly Kepler Object Identifier (KOI), or Kepler Input Catalog (KIC) number
# Npl = number of planets in orbit
# logA (log_10(age)) [ucd=time.age]
#   (the "ucd" tag is presumably the catalogue's Unified Content Descriptor, a standard
#    label saying the column holds an age - confirm)

# TODO: match KOI with some other identifier in NASA EA or is KOI available in NASA EA?