In [1]:
import numpy as np
import pandas as pd
from os import path

from calc_stellar_params import lum_eqn, lum_unc_eqn, teff_eqn, teff_unc_eqn

In [2]:
# --- Snapshot configuration ---------------------------------------------
# Date stamp that appears in every input/output file name used below.
data_date = "2022_04_28"

# Number of leading lines skipped (passed as `skiprows`) when reading the NASA_EA csv.
# The value depends on the downloaded snapshot:
header_len = 292 # for NASA_EA_2022_04_28.csv
#   104 for NASA_EA_2022_04_27.csv
#   116 for NASA_EA_2022_02_09.csv

# --- Data directories ----------------------------------------------------
# Alternative location used previously:
#   "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"
NEA_dir = "~/OneDrive/SEPHI_data/NASA_EA/"
rosetta_dir = "~/OneDrive/SEPHI_data/CKS/"

In [3]:
# Read the NASA Exoplanet Archive (NEA) snapshot, keeping only the columns used below.
# Column notes:
#   pl_orbper  = orbital period [days]
#   pl_orbsmax = orbit semi-major axis [au]
# Not loaded: "pl_orbincl", "sy_dist", "sy_plx", "sy_gaiamag", and "st_spectype"
# (the latter was excluded from the download due to csv formatting).
NEA_cols = ["pl_name", "hostname", "gaia_id", "sy_snum", "sy_pnum", "discoverymethod", "pl_orbper", "pl_orbsmax", "pl_rade", "pl_bmasse", "pl_dens", "pl_orbeccen", "pl_eqt", "st_teff", "st_tefferr1", "st_tefferr2", "st_rad", "st_raderr1", "st_raderr2", "st_mass", "st_met", "st_lum", "st_lumerr1", "st_lumerr2", "st_logg", "st_age", "st_ageerr1", "st_ageerr2"]
NEA = pd.read_csv(
    path.join(NEA_dir, f"NASA_EA_{data_date}.csv"),
    skiprows=header_len,  # skip the header block that precedes the table
    usecols=NEA_cols,
)

# The gaia_id in NEA is called the 'designation' in Gaia tables, so rename it.
# The designation is "Gaia" + space + data release + space + source_id,
# e.g. "Gaia DR2 3946945413106333696".
NEA = NEA.rename(columns={"gaia_id": "gaia_designation"})

# Add 'gaia_source_id' (the trailing number of the designation, as listed in Gaia tables).
# Rows without a designation must stay missing: the values are written into an
# explicit int64 buffer and wrapped in a nullable Int64 array together with a mask,
# so missing rows show as <NA> instead of whatever happened to be in uninitialised
# memory, and the 19-digit ids never pass through float64 (which cannot hold them exactly).
has_designation = NEA["gaia_designation"].notna().to_numpy()
source_id_values = np.zeros(len(NEA), dtype="int64")
source_id_values[has_designation] = (
    NEA.loc[has_designation, "gaia_designation"]
    .str.rsplit(" ", n=1)  # split once from the right: ["Gaia DR2", "<source_id>"]
    .str[-1]
    .astype("int64")
    .to_numpy()
)
gaia_source_id = pd.arrays.IntegerArray(source_id_values, mask=~has_designation)
NEA["gaia_source_id"] = gaia_source_id

# (Stripping the space in 'hostname', e.g. "11 Com" -> "11Com", is deliberately not done:)
#NEA["hostname"] = NEA["hostname"].str.replace(" ", "")

print("No. rows in NASA EA: ", NEA.shape[0])

No. rows in NASA EA:  5014


In [4]:
# Quick visual check of the loaded table; pandas abbreviates the printed frame
# (note the "..." row/column breaks in the output).
print(NEA)

         pl_name  hostname              gaia_designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
5009   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
5010   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
5011   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
5012   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
5013    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod    pl_orbper  pl_orbsmax  pl_rade   pl_bm

In [5]:
# Write every non-missing Gaia designation to its own csv
# (useful as an input list when searching for these stars in Gaia).
gaia_designations_file = path.join(NEA_dir, f"gaia_designations_{data_date}.csv")
NEA["gaia_designation"].dropna().to_csv(gaia_designations_file, index=False)

In [6]:
# Convert the luminosities and their uncertainties from log10(solar) to solar.
st_lum_sol = 10**NEA["st_lum"] # undo the log: base^x

# First-order propagation of the uncertainties through y = 10^x:
#   dy/dx = 10^x * ln(10)
#   y_err = dy/dx * x_err = y * x_err * ln(10)
ln10 = np.log(10)
st_lum_solerr1 = st_lum_sol * NEA["st_lumerr1"] * ln10
st_lum_solerr2 = st_lum_sol * NEA["st_lumerr2"] * ln10
for converted in (st_lum_sol, st_lum_solerr1, st_lum_solerr2):
    print(converted)

# Cross-check against shifting the exponent directly:
#   unc = 10^(x + x_err) - 10^x
# (x_err is added for both bounds; the printed st_lumerr2-based results are negative.)
unc1 = 10**(NEA["st_lum"] + NEA["st_lumerr1"]) - st_lum_sol
unc2 = 10**(NEA["st_lum"] + NEA["st_lumerr2"]) - st_lum_sol
for shifted in (unc1, unc2):
    print(shifted)

# Rows where the two upper-uncertainty estimates differ by more than 10:
large_diff = np.where((unc1 - st_lum_solerr1).abs() > 10)
print(large_diff)
print(st_lum_solerr1.iloc[large_diff[0]])
print(unc1.iloc[large_diff[0]])

# The two unc. calculation methods seem to yield similar results but not the same
# TODO: the check above doesn't seem to be working
# NOTE(review): the threshold of 10 is an absolute difference, so only very luminous
# stars can exceed it; a relative tolerance may be what was intended - confirm.

0       174.984669
1              NaN
2        57.942870
3              NaN
4              NaN
           ...    
5009           NaN
5010           NaN
5011           NaN
5012     63.095734
5013     69.023980
Name: st_lum, Length: 5014, dtype: float64
0       28.607113
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    32.107609
5013          NaN
Length: 5014, dtype: float64
0      -34.247953
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    -7.990581
5013          NaN
Length: 5014, dtype: float64
0       31.078322
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    41.858508
5013          NaN
Length: 5014, dtype: float64
0      -31.104811
1             NaN
2             NaN
3            

In [7]:
# Overwrite the log10 luminosity columns in the table with their linear (solar) versions.
# Caution: after this cell NEA["st_lum"] is no longer logarithmic, so re-running the
# conversion cell above without reloading NEA would exponentiate the values a second time.
linear_lum_columns = {
    "st_lum": st_lum_sol,
    "st_lumerr1": st_lum_solerr1,
    "st_lumerr2": st_lum_solerr2,
}
for column_name, linear_values in linear_lum_columns.items():
    NEA[column_name] = linear_values

In [8]:
## Calculating the luminosity and its uncertainties where Teff and radius are available

In [9]:
# NOTE(review): these imports sit mid-notebook rather than in the first cell, and none of
# pi, sqrt, L_sun, R_sun or sigma is referenced by the other cells shown in this notebook -
# presumably leftovers from before the equations lived in calc_stellar_params; confirm
# before removing.
from math import pi, sqrt
from astropy.constants import sigma_sb, L_sun, R_sun
# Bare numeric value of astropy's Stefan-Boltzmann constant (.value drops the unit).
sigma = sigma_sb.value

In [10]:
# NaN-filled output arrays (one entry per NEA row) for the calculated luminosity
# and its +ve and -ve uncertainties; only rows with the required inputs get filled in.
n_rows = NEA.shape[0]
calc_lum = np.full(n_rows, np.nan)
calc_lumerr1 = np.full(n_rows, np.nan)
calc_lumerr2 = np.full(n_rows, np.nan)

# Rows that list both a stellar teff and a stellar radius:
has_teff_and_rad = np.isfinite(NEA["st_teff"]) & np.isfinite(NEA["st_rad"])

# Condition 1: indices of exoplanets with stellar teff and rad listed
c1 = np.where(has_teff_and_rad)

# Condition 2: as condition 1, and dT1 and dR1 are listed too
c2 = np.where(
    has_teff_and_rad
    & np.isfinite(NEA["st_tefferr1"])
    & np.isfinite(NEA["st_raderr1"])
)

# Condition 3: as condition 1, and dT2 and dR2 are listed too
c3 = np.where(
    has_teff_and_rad
    & np.isfinite(NEA["st_tefferr2"])
    & np.isfinite(NEA["st_raderr2"])
)

In [11]:
# Luminosities for the rows selected by c1 (teff and radius available):
rows_c1 = NEA.iloc[c1[0]]
calc_lum[c1] = lum_eqn(rows_c1["st_teff"], rows_c1["st_rad"])

# +ve uncertainties for the rows selected by c2 (uses the err1 columns):
rows_c2 = NEA.iloc[c2[0]]
calc_lumerr1[c2] = lum_unc_eqn(
    rows_c2["st_teff"],
    rows_c2["st_tefferr1"],
    rows_c2["st_rad"],
    rows_c2["st_raderr1"],
)

# -ve uncertainties for the rows selected by c3 (uses the err2 columns):
rows_c3 = NEA.iloc[c3[0]]
calc_lumerr2[c3] = lum_unc_eqn(
    rows_c3["st_teff"],
    rows_c3["st_tefferr2"],
    rows_c3["st_rad"],
    rows_c3["st_raderr2"],
    positive_unc=False,
)

In [12]:
# Attach the calculated luminosities and their uncertainties to the NASA EA table
# (rows that lacked the required inputs stay NaN).
NEA = NEA.assign(
    NEAc_lum=calc_lum,
    NEAc_lumerr1=calc_lumerr1,
    NEAc_lumerr2=calc_lumerr2,
)

In [13]:
#print(NEA)

In [14]:
# NaN-filled output arrays (one entry per NEA row) for the calculated teff
# and its +ve and -ve uncertainties; only rows with the required inputs get filled in.
n_rows = NEA.shape[0]
calc_teff = np.full(n_rows, np.nan)
calc_tefferr1 = np.full(n_rows, np.nan)
calc_tefferr2 = np.full(n_rows, np.nan)

# Rows that list both a stellar luminosity and a stellar radius:
has_lum_and_rad = np.isfinite(NEA["st_lum"]) & np.isfinite(NEA["st_rad"])

# Condition 1: indices of exoplanets with stellar lum and rad listed
c1 = np.where(has_lum_and_rad)

# Condition 2: as condition 1, and dL1 and dR1 are listed too
c2 = np.where(
    has_lum_and_rad
    & np.isfinite(NEA["st_lumerr1"])
    & np.isfinite(NEA["st_raderr1"])
)

# Condition 3: as condition 1, and dL2 and dR2 are listed too
c3 = np.where(
    has_lum_and_rad
    & np.isfinite(NEA["st_lumerr2"])
    & np.isfinite(NEA["st_raderr2"])
)

In [15]:
# Effective temperatures for the rows selected by c1 (luminosity and radius available):
rows_c1 = NEA.iloc[c1[0]]
calc_teff[c1] = teff_eqn(rows_c1["st_lum"], rows_c1["st_rad"])

# +ve uncertainties for the rows selected by c2 (uses the err1 columns):
rows_c2 = NEA.iloc[c2[0]]
calc_tefferr1[c2] = teff_unc_eqn(
    rows_c2["st_lum"],
    rows_c2["st_lumerr1"],
    rows_c2["st_rad"],
    rows_c2["st_raderr1"],
)

# -ve uncertainties for the rows selected by c3 (uses the err2 columns):
rows_c3 = NEA.iloc[c3[0]]
calc_tefferr2[c3] = teff_unc_eqn(
    rows_c3["st_lum"],
    rows_c3["st_lumerr2"],
    rows_c3["st_rad"],
    rows_c3["st_raderr2"],
    positive_unc=False,
)

In [16]:
# Attach the calculated effective temperatures and their uncertainties to the NASA EA table
# (rows that lacked the required inputs stay NaN).
NEA = NEA.assign(
    NEAc_teff=calc_teff,
    NEAc_tefferr1=calc_tefferr1,
    NEAc_tefferr2=calc_tefferr2,
)

In [17]:
#print(NEA)

In [18]:
# Prefix the archive's planet/star measurement columns with "NEA_" (and shorten
# "orbsmax" to "sma") so their origin stays clear once other catalogues are joined on.
# Left untouched: "pl_name", "hostname", "gaia_id", "sy_snum", "sy_pnum", "discoverymethod".
# Note: "pl_orbincl" is listed here although it is not among the columns read in above;
# DataFrame.rename ignores mapping keys that are absent, so that entry has no effect.
cols = ["pl_orbper", "pl_orbsmax", "pl_rade", "pl_bmasse", "pl_dens", "pl_orbeccen", "pl_eqt", "pl_orbincl", "st_teff", "st_tefferr1", "st_tefferr2", "st_rad", "st_raderr1", "st_raderr2", "st_mass", "st_met", "st_lum", "st_lumerr1", "st_lumerr2", "st_logg", "st_age", "st_ageerr1", "st_ageerr2"]

new_cols = ["NEA_" + old_name.replace("orbsmax", "sma") for old_name in cols]
print(new_cols)

# old name -> new name
cols_dict = dict(zip(cols, new_cols))
NEA = NEA.rename(columns=cols_dict)

['NEA_pl_orbper', 'NEA_pl_sma', 'NEA_pl_rade', 'NEA_pl_bmasse', 'NEA_pl_dens', 'NEA_pl_orbeccen', 'NEA_pl_eqt', 'NEA_pl_orbincl', 'NEA_st_teff', 'NEA_st_tefferr1', 'NEA_st_tefferr2', 'NEA_st_rad', 'NEA_st_raderr1', 'NEA_st_raderr2', 'NEA_st_mass', 'NEA_st_met', 'NEA_st_lum', 'NEA_st_lumerr1', 'NEA_st_lumerr2', 'NEA_st_logg', 'NEA_st_age', 'NEA_st_ageerr1', 'NEA_st_ageerr2']


In [19]:
## Cross-matching the NASA data with rosetta (to get the Kepler names and KOI names)

# Load the rosetta database:
rosetta_cols = ["pl_name","pl_koi_name", "pl_kepler_name"]
rosetta_file = path.join(rosetta_dir, f"rosetta_{data_date}.csv")
rosetta = pd.read_csv(rosetta_file, usecols=rosetta_cols)
rosetta_len = len(rosetta)
print(rosetta_len)

# Trim rosetta down to the planet name (the key shared with NEA) and the KOI name;
# "pl_kepler_name" is read above but not carried forward.
rosetta2 = rosetta[["pl_name", "pl_koi_name"]].copy()
rosetta2_len = len(rosetta2)
print(rosetta2_len)

2732
2732


In [20]:
# Merge the KOI names from rosetta2 into NEA, keyed on the planet name.
#   how="left"            : keep every NEA row, matched or not.
#   indicator="NEAorRos"  : adds a categorical column of that name recording where each
#                           row's key was found: "both" or "left_only" for a left merge.
#   validate="one_to_one" : raise if "pl_name" is not unique in either table.
# Note: this cell is not re-runnable on its own; a second merge would find the columns
# added by the first one already present in NEA.
NEA = pd.merge(NEA, rosetta2, on="pl_name", how="left", indicator="NEAorRos", validate="one_to_one")
NEA_len = NEA["pl_name"].size
print(NEA_len)

# Indices (into NEA) of the planets that were matched with a rosetta2 entry.
# `in_rosetta2` is kept as a second name for the same selection.
matched_rosetta2 = np.where(NEA["NEAorRos"] == "both")
in_rosetta2 = matched_rosetta2
print("Number of Kepler (rosetta) planets in the NEA:", in_rosetta2[0].size, "out of", rosetta2_len, "Kepler planets.")

# Indices (into rosetta2) of the entries whose planet name does not occur in NEA.
# A left merge drops such rows, so the indicator column can never read "right_only";
# the unmatched entries have to be looked up in rosetta2 itself.
not_matched_rosetta2 = np.where(~rosetta2["pl_name"].isin(NEA["pl_name"]))
print("Number of Kepler (rosetta2) planets in the NEA:", matched_rosetta2[0].size, "out of", rosetta2_len, "Kepler planets.")
print("Number of unmatched stars from Rosetta:", not_matched_rosetta2[0].size)

5014
Number of Kepler (rosetta) planets in the NEA: 2732 out of 2732 Kepler planets.
Number of Kepler (rosetta2) planets in the NEA: 2732 out of 2732 Kepler planets.
Number of unmatched stars from Rosetta: 0


In [22]:
# Write the processed NEA table next to the raw snapshot, without the index column.
processed_file = path.join(NEA_dir, f"NASA_EA_processed_{data_date}.csv")
NEA.to_csv(processed_file, index=False)