## Importing dependencies

In [1]:
import os
from math import pi, sqrt
from os import path

import numpy as np
import pandas as pd
from astropy.constants import sigma_sb, L_sun, R_sun
sigma = sigma_sb.value

from calc_stellar_params import lum_eqn, lum_unc_eqn, teff_eqn, teff_unc_eqn

## Data directories:

In [2]:
# Data directories:
#
# The SEPHI data root differs between machines. Rather than commenting lines
# in and out, set the SEPHI_DATA_DIR environment variable to override the
# default below, e.g.
#   On PC:        ~/OneDrive/SEPHI_data/
#   On my laptop: the default below
SEPHI_data_root = os.environ.get(
    "SEPHI_DATA_DIR",
    "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/",
)

# Joining with a final "" keeps the trailing separator that these directory
# strings have always had; expanduser() then resolves the leading "~".
NEA_dir = path.expanduser(path.join(SEPHI_data_root, "NASA_EA", ""))
rosetta_dir = path.expanduser(path.join(SEPHI_data_root, "CKS", ""))

# Number of lines to skip at the top of the NEA CSV (passed to read_csv as
# skiprows). It depends on which download is being read:
#   292 for NASA_EA_2022_04_28.csv
#   104 for NASA_EA_2022_04_27.csv
#   116 for NASA_EA_2022_02_09.csv
header_len = 292
data_date = "2022_04_28"  # date stamp used in the input and output file names

# Read the NASA Exoplanet Archive (NEA):

In [3]:
# Dictionary for all the columns in the NEA and the names I assign them for the exoplanets database.
# Keys are the NEA CSV column names; values are the (un-prefixed) names used in the exoplanets database.
# Each measured quantity is followed by its "err1" and "err2" columns
# (err1 feeds the +ve uncertainty and err2 the -ve uncertainty in the calculations further down).
# The insertion order matters: the pasted lists NEA_all_cols / NEA_new_cols below were generated from
# this dictionary's keys and values and are paired up by position.
NEA_cols_dict = {
    "pl_name": "pl_name",
    "hostname": "hostname",
    "gaia_id": "gaia_designation",
    "sy_snum": "sy_snum", # number of stars
    "sy_pnum": "sy_pnum", # number of planets
    "sy_mnum": "sy_mnum", # number of moons
    "discoverymethod": "discoverymethod",
    "pl_orbper": "pl_period", # orbital period [days]
    "pl_orbpererr1": "pl_perioderr1",
    "pl_orbpererr2": "pl_perioderr2",
    "pl_orbsmax": "pl_sma", # orbit semi-major axis [AU]
    "pl_orbsmaxerr1": "pl_smaerr1", 
    "pl_orbsmaxerr2": "pl_smaerr2", 
    "pl_rade": "pl_rad", # [Earth radius]
    "pl_radeerr1": "pl_raderr1",
    "pl_radeerr2": "pl_raderr2",
    "pl_bmasse": "pl_mass", # [Earth mass]
    "pl_bmasseerr1": "pl_masserr1",
    "pl_bmasseerr2": "pl_masserr2",
    "pl_dens": "pl_dens", # [g/cm**3]
    "pl_denserr1": "pl_denserr1",
    "pl_denserr2": "pl_denserr2",
    "pl_orbeccen": "pl_e", # planet eccentricity, e [None]
    "pl_orbeccenerr1": "pl_eerr1",
    "pl_orbeccenerr2": "pl_eerr2",
    "pl_insol": "pl_insol", # planet insolation flux [Earth flux]
    "pl_insolerr1": "pl_insolerr1",
    "pl_insolerr2": "pl_insolerr2",
    "pl_eqt": "pl_teq", # planet equilibrium temp [K]
    "pl_eqterr1": "pl_teqerr1",
    "pl_eqterr2": "pl_teqerr2",
    "pl_orbincl": "pl_i", # planet inclination, i [deg]
    "pl_orbinclerr1": "pl_ierr1",
    "pl_orbinclerr2": "pl_ierr2",
    "pl_imppar": "pl_b", # planet impact parameter, b [None]
    "pl_impparerr1": "pl_berr1",
    "pl_impparerr2": "pl_berr2",
    "pl_trandep": "pl_trandep", # planet transit depth [%]
    "pl_trandeperr1": "pl_trandeperr1",
    "pl_trandeperr2": "pl_trandeperr2",
    "pl_trandur": "pl_trandur", # planet transit duration [hours]
    "pl_trandurerr1": "pl_trandurerr1",
    "pl_trandurerr2": "pl_trandurerr2",
    "pl_trueobliq": "pl_trueobliq", # planet's true obliquity (axial tilt) [deg]
    "pl_trueobliqerr1": "pl_trueobliqerr1",
    "pl_trueobliqerr2": "pl_trueobliqerr2",
    "st_spectype": "st_spectype", # stellar spectral type
    "st_teff": "st_teff", # stellar effective temperature [K]
    "st_tefferr1": "st_tefferr1",
    "st_tefferr2": "st_tefferr2",
    "st_rad": "st_rad", # [Solar radius]
    "st_raderr1": "st_raderr1",
    "st_raderr2": "st_raderr2",
    "st_mass": "st_mass", # [Solar mass]
    "st_masserr1": "st_masserr1",
    "st_masserr2": "st_masserr2",
    "st_met": "st_met", # stellar metallicity [dex]
    "st_meterr1": "st_meterr1",
    "st_meterr2": "st_meterr2",
    "st_lum": "st_lum", # listed as log10(Solar lum); converted to Solar lum later in this notebook
    "st_lumerr1": "st_lumerr1",
    "st_lumerr2": "st_lumerr2",
    "st_logg": "st_logg", # [log10(cm/s**2)]
    "st_loggerr1": "st_loggerr1",
    "st_loggerr2": "st_loggerr2",
    "st_age": "st_age", # [Gyr]
    "st_ageerr1": "st_ageerr1",
    "st_ageerr2": "st_ageerr2",
    "st_dens": "st_dens", # [g/cm**3]
    "st_denserr1": "st_denserr1",
    "st_denserr2": "st_denserr2",
    "st_vsin": "st_vsin", # stellar rotational velocity [km/s]
    "st_vsinerr1": "st_vsinerr1",
    "st_vsinerr2": "st_vsinerr2",
    "st_rotp": "st_rotp", # stellar rotational period [days]
    "st_rotperr1": "st_rotperr1",
    "st_rotperr2": "st_rotperr2",
    "st_rv": "rv", # stellar radial velocity [km/s]
    "st_rverr1": "rverr1",
    "st_rverr2": "rverr2",
    "ra": "ra", # decimal
    "raerr1": "raerr1",
    "raerr2": "raerr2",
    "dec": "dec", # decimal
    "decerr1": "decerr1",
    "decerr2": "decerr2",
    # TODO: do I need galactic latitude and longitude?
    "sy_pm": "pm", # total proper motion [mas/year]
    "sy_pmerr1": "pmerr1",
    "sy_pmerr2": "pmerr2",
    "sy_pmra": "pmra", # proper motion in RA [mas/year]
    "sy_pmraerr1": "pmraerr1",
    "sy_pmraerr2": "pmraerr2",
    "sy_pmdec": "pmdec", # proper motion in dec [mas/year]
    "sy_pmdecerr1": "pmdecerr1",
    "sy_pmdecerr2": "pmdecerr2",
    "sy_dist": "distance", # distance to system [pc]
    "sy_disterr1": "distanceerr1",
    "sy_disterr2": "distanceerr2",
    "sy_plx": "parallax", # parallax [mas]
    "sy_plxerr1": "parallaxerr1",
    "sy_plxerr2": "parallaxerr2",
    "rowupdate": "rowupdate", # date of last update
    "pl_pubdate": "pl_pubdate", # planetary parameter reference publication date
    "releasedate": "releasedate" # release date. TODO: what's the difference between this and the previous?
}

# The names of all the columns available from the NEA CSV, in the same order as
# the keys of NEA_cols_dict. One quantity (value, err1, err2) per line.
#NEA_all_cols = list(NEA_cols_dict.keys()) # used to generate the below:
NEA_all_cols = [
    "pl_name", "hostname", "gaia_id",
    "sy_snum", "sy_pnum", "sy_mnum",
    "discoverymethod",
    "pl_orbper", "pl_orbpererr1", "pl_orbpererr2",
    "pl_orbsmax", "pl_orbsmaxerr1", "pl_orbsmaxerr2",
    "pl_rade", "pl_radeerr1", "pl_radeerr2",
    "pl_bmasse", "pl_bmasseerr1", "pl_bmasseerr2",
    "pl_dens", "pl_denserr1", "pl_denserr2",
    "pl_orbeccen", "pl_orbeccenerr1", "pl_orbeccenerr2",
    "pl_insol", "pl_insolerr1", "pl_insolerr2",
    "pl_eqt", "pl_eqterr1", "pl_eqterr2",
    "pl_orbincl", "pl_orbinclerr1", "pl_orbinclerr2",
    "pl_imppar", "pl_impparerr1", "pl_impparerr2",
    "pl_trandep", "pl_trandeperr1", "pl_trandeperr2",
    "pl_trandur", "pl_trandurerr1", "pl_trandurerr2",
    "pl_trueobliq", "pl_trueobliqerr1", "pl_trueobliqerr2",
    "st_spectype",
    "st_teff", "st_tefferr1", "st_tefferr2",
    "st_rad", "st_raderr1", "st_raderr2",
    "st_mass", "st_masserr1", "st_masserr2",
    "st_met", "st_meterr1", "st_meterr2",
    "st_lum", "st_lumerr1", "st_lumerr2",
    "st_logg", "st_loggerr1", "st_loggerr2",
    "st_age", "st_ageerr1", "st_ageerr2",
    "st_dens", "st_denserr1", "st_denserr2",
    "st_vsin", "st_vsinerr1", "st_vsinerr2",
    "st_rotp", "st_rotperr1", "st_rotperr2",
    "st_rv", "st_rverr1", "st_rverr2",
    "ra", "raerr1", "raerr2",
    "dec", "decerr1", "decerr2",
    "sy_pm", "sy_pmerr1", "sy_pmerr2",
    "sy_pmra", "sy_pmraerr1", "sy_pmraerr2",
    "sy_pmdec", "sy_pmdecerr1", "sy_pmdecerr2",
    "sy_dist", "sy_disterr1", "sy_disterr2",
    "sy_plx", "sy_plxerr1", "sy_plxerr2",
    "rowupdate", "pl_pubdate", "releasedate",
]
#print(NEA_all_cols)

# The new column names as I want them to appear in the exoplanets database.
# Every name carries an "NEA_" prefix except the first three (the identifiers).
#NEA_new_cols =  ["NEA_" + str(i) for i in NEA_cols_dict.values()] # used to generate the below (I removed the NEA_ from the first 3 manually):
NEA_new_cols = [
    "pl_name", "hostname", "gaia_designation",
    "NEA_sy_snum", "NEA_sy_pnum", "NEA_sy_mnum",
    "NEA_discoverymethod",
    "NEA_pl_period", "NEA_pl_perioderr1", "NEA_pl_perioderr2",
    "NEA_pl_sma", "NEA_pl_smaerr1", "NEA_pl_smaerr2",
    "NEA_pl_rad", "NEA_pl_raderr1", "NEA_pl_raderr2",
    "NEA_pl_mass", "NEA_pl_masserr1", "NEA_pl_masserr2",
    "NEA_pl_dens", "NEA_pl_denserr1", "NEA_pl_denserr2",
    "NEA_pl_e", "NEA_pl_eerr1", "NEA_pl_eerr2",
    "NEA_pl_insol", "NEA_pl_insolerr1", "NEA_pl_insolerr2",
    "NEA_pl_teq", "NEA_pl_teqerr1", "NEA_pl_teqerr2",
    "NEA_pl_i", "NEA_pl_ierr1", "NEA_pl_ierr2",
    "NEA_pl_b", "NEA_pl_berr1", "NEA_pl_berr2",
    "NEA_pl_trandep", "NEA_pl_trandeperr1", "NEA_pl_trandeperr2",
    "NEA_pl_trandur", "NEA_pl_trandurerr1", "NEA_pl_trandurerr2",
    "NEA_pl_trueobliq", "NEA_pl_trueobliqerr1", "NEA_pl_trueobliqerr2",
    "NEA_st_spectype",
    "NEA_st_teff", "NEA_st_tefferr1", "NEA_st_tefferr2",
    "NEA_st_rad", "NEA_st_raderr1", "NEA_st_raderr2",
    "NEA_st_mass", "NEA_st_masserr1", "NEA_st_masserr2",
    "NEA_st_met", "NEA_st_meterr1", "NEA_st_meterr2",
    "NEA_st_lum", "NEA_st_lumerr1", "NEA_st_lumerr2",
    "NEA_st_logg", "NEA_st_loggerr1", "NEA_st_loggerr2",
    "NEA_st_age", "NEA_st_ageerr1", "NEA_st_ageerr2",
    "NEA_st_dens", "NEA_st_denserr1", "NEA_st_denserr2",
    "NEA_st_vsin", "NEA_st_vsinerr1", "NEA_st_vsinerr2",
    "NEA_st_rotp", "NEA_st_rotperr1", "NEA_st_rotperr2",
    "NEA_rv", "NEA_rverr1", "NEA_rverr2",
    "NEA_ra", "NEA_raerr1", "NEA_raerr2",
    "NEA_dec", "NEA_decerr1", "NEA_decerr2",
    "NEA_pm", "NEA_pmerr1", "NEA_pmerr2",
    "NEA_pmra", "NEA_pmraerr1", "NEA_pmraerr2",
    "NEA_pmdec", "NEA_pmdecerr1", "NEA_pmdecerr2",
    "NEA_distance", "NEA_distanceerr1", "NEA_distanceerr2",
    "NEA_parallax", "NEA_parallaxerr1", "NEA_parallaxerr2",
    "NEA_rowupdate", "NEA_pl_pubdate", "NEA_releasedate",
]
#print(NEA_new_cols)

# Dictionary of old names in NEA vs new names for exoplanets database
# (the two lists above are paired up position by position):
NEA_cols_dict2 = dict(zip(NEA_all_cols, NEA_new_cols))
#print(NEA_cols_dict2)

# The columns that you want to read in from the NEA CSV (it's big!).
# These are the original NEA names (pre-rename) and are handed to read_csv as usecols.
NEA_cols = [
    # identifiers and system-level info
    "pl_name", "hostname", "gaia_id", "sy_snum", "sy_pnum", "discoverymethod",
    # planet parameters
    "pl_orbper",  # orbital period [days]
    "pl_orbsmax", "pl_orbsmaxerr1", "pl_orbsmaxerr2",  # orbit semi-major axis [au]
    "pl_rade", "pl_radeerr1", "pl_radeerr2",
    "pl_bmasse", "pl_bmasseerr1", "pl_bmasseerr2",
    "pl_dens", "pl_orbeccen", "pl_eqt",
    # stellar parameters
    "st_teff", "st_tefferr1", "st_tefferr2",
    "st_rad", "st_raderr1", "st_raderr2",
    "st_mass", "st_masserr1", "st_masserr2",
    "st_met", "st_meterr1", "st_meterr2",
    "st_lum", "st_lumerr1", "st_lumerr2",
    "st_logg",
    "st_age", "st_ageerr1", "st_ageerr2",
]
# Not read in (yet): "pl_orbincl", "sy_dist", "sy_plx", "sy_gaiamag"
# Excluded "st_spectype" from the download due to csv formatting
# In Gaia, the "source_id" is the number in the designation, e.g. 3946945413106333696
# In Gaia, the "designation" is Gaia DR2 3946945413106333696

In [4]:
# Read the NASA_EA data (only the columns listed in NEA_cols, skipping the
# header_len lines at the top of the download):
NEA = pd.read_csv(
    path.join(NEA_dir, f"NASA_EA_{data_date}.csv"),
    skiprows=header_len,
    usecols=NEA_cols,
)

# Initial info on the table:
NEA_len = NEA.shape[0]
print("No. rows in NASA EA:", NEA_len)
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; wrapping it in print() only adds a stray "None" to the output.
print("\nInfo:")
NEA.info()
print("\nDescribe:\n", NEA.describe())
# TODO: why aren't df.info() 'Non-Null Count' and df.describe() 'count' the same??
# NOTE(review): describe() only summarises the numeric columns by default, so the
# object columns listed by info() have no 'count' entry there -- confirm that the
# numeric columns do agree.

# Rename the columns from the NEA names to the exoplanets-database names:
NEA = NEA.rename(columns=NEA_cols_dict2)

# Add a new column called 'gaia_source_id' which is listed in Gaia tables.
# The source_id is the number in the designation, e.g. 3946945413106333696
# (the text after the last space of "Gaia DR2 3946945413106333696").
has_gaia = NEA["gaia_designation"].notna()
gaia_source_id = (
    NEA.loc[has_gaia, "gaia_designation"]
    .str.rsplit(" ", n=1)
    .str[-1]
    .astype("int64")  # explicit 64-bit: these IDs do not fit in a 32-bit int
    .astype("Int64")  # nullable integer, so rows without a designation can hold <NA>
)
# Rows without a Gaia designation become <NA> rather than being left with
# whatever happened to be in an uninitialised array.
gaia_source_id = gaia_source_id.reindex(NEA.index)
#print(gaia_source_id)
NEA["gaia_source_id"] = gaia_source_id

print(NEA)

No. rows in NASA EA: 5014
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5014 entries, 0 to 5013
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pl_name          5014 non-null   object 
 1   hostname         5014 non-null   object 
 2   gaia_id          4828 non-null   object 
 3   sy_snum          5014 non-null   int64  
 4   sy_pnum          5014 non-null   int64  
 5   discoverymethod  5014 non-null   object 
 6   pl_orbper        4841 non-null   float64
 7   pl_orbsmax       2807 non-null   float64
 8   pl_orbsmaxerr1   1996 non-null   float64
 9   pl_orbsmaxerr2   1995 non-null   float64
 10  pl_rade          3883 non-null   float64
 11  pl_radeerr1      3501 non-null   float64
 12  pl_radeerr2      3501 non-null   float64
 13  pl_bmasse        2076 non-null   float64
 14  pl_bmasseerr1    1913 non-null   float64
 15  pl_bmasseerr2    1913 non-null   float64
 16  pl_dens          695 non-null    f

## Saving a list of the exoplanets whose hosts are in the Gaia database

In [5]:
# Create a csv containing the non-NaN Gaia designations
# (useful when using the designations to search for stars in Gaia):
in_gaia = NEA.loc[NEA["gaia_designation"].notna(), "gaia_designation"]
# shape is a tuple, e.g. (4828,); take its first element to get the plain count.
in_gaia_len = in_gaia.shape[0]
print('Number of NEA stars with Gaia designations:', in_gaia_len)
# 4828 agrees with NEA.info()
# NOTE(review): this counts table rows, so a host with several planets is
# presumably counted (and written below) once per planet -- confirm whether
# duplicates should be dropped before searching Gaia.

# Save to csv:
in_gaia.to_csv(path.join(NEA_dir, f"gaia_designations_{data_date}.csv"), index=False)

Number of NEA stars with Gaia designations: (4828,)


# Convert NEA luminosities from log(solar) to solar

In [6]:
# Convert luminosities and uncs from log10(solar) to solar:
st_lum_sol = 10**NEA["NEA_st_lum"] # to undo a log, base^x

# Converting the uncertainties with first-order error propagation:
# y = 10^x,
# dy/dx = 10^x * ln(10)
# y_err = dy/dx * x_err
# y_err = 10^x * ln(10) * x_err
# y_err = y * x_err * ln(10)
st_lum_solerr1 = st_lum_sol * NEA["NEA_st_lumerr1"] * np.log(10)
st_lum_solerr2 = st_lum_sol * NEA["NEA_st_lumerr2"] * np.log(10)
print(st_lum_sol)
print(st_lum_solerr1)
print(st_lum_solerr2)

# Comparing to the exact interval ends, unc = y(x + x_err) - y(x) = 10^(x + x_err) - 10^x.
# (The lower uncertainties are listed as negative numbers, so unc2 comes out negative.)
unc1 = 10**(NEA["NEA_st_lum"] + NEA["NEA_st_lumerr1"]) - 10**NEA["NEA_st_lum"]
unc2 = 10**(NEA["NEA_st_lum"] + NEA["NEA_st_lumerr2"]) - 10**NEA["NEA_st_lum"]
print(unc1)
print(unc2)

# Flag the rows where the two methods disagree. The disagreement is measured
# relative to the exact value: an absolute cut in solar luminosities flags
# almost nothing because the uncertainties themselves are mostly small numbers.
# Rows where either value is NaN compare as False and are not flagged.
UNC_REL_TOL = 0.1  # flag differences larger than 10% of the exact uncertainty
unc_rel_diff = (unc1 - st_lum_solerr1).abs() / unc1.abs()
large_diff_idx = np.flatnonzero(unc_rel_diff > UNC_REL_TOL)
print("Rows where the two +ve unc. estimates differ by more than", UNC_REL_TOL, "(relative):", large_diff_idx.size)
print(large_diff_idx)
# large_diff_idx holds row positions, hence .iloc:
print(st_lum_solerr1.iloc[large_diff_idx])
print(unc1.iloc[large_diff_idx])

# The two unc. calculation methods yield similar results but not the same: the
# propagated value is a linear approximation that is only accurate for small
# x_err, while 10^(x + x_err) - 10^x is exact and asymmetric.

0       174.984669
1              NaN
2        57.942870
3              NaN
4              NaN
           ...    
5009           NaN
5010           NaN
5011           NaN
5012     63.095734
5013     69.023980
Name: NEA_st_lum, Length: 5014, dtype: float64
0       28.607113
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    32.107609
5013          NaN
Length: 5014, dtype: float64
0      -34.247953
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    -7.990581
5013          NaN
Length: 5014, dtype: float64
0       31.078322
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
5009          NaN
5010          NaN
5011          NaN
5012    41.858508
5013          NaN
Length: 5014, dtype: float64
0      -31.104811
1             NaN
2             NaN
3        

In [7]:
# Replace in the table: overwrite the log10 luminosity columns with the
# converted (solar-unit) values computed above.
# Careful: once this has run, NEA["NEA_st_lum"] is no longer a log10 value, so
# re-running the conversion without re-reading the table would convert it twice.
for lum_column, converted_values in (
    ("NEA_st_lum", st_lum_sol),
    ("NEA_st_lumerr1", st_lum_solerr1),
    ("NEA_st_lumerr2", st_lum_solerr2),
):
    NEA[lum_column] = converted_values

# Calculating lum and unc where teff and rad are available

## Conditions for luminosity calculations

In [8]:
# NaN-filled arrays (one entry per table row) for luminosity and the +ve and -ve uncs.:
n_rows = NEA.shape[0]
calc_lum = np.full(n_rows, np.nan)
calc_lumerr1 = np.full(n_rows, np.nan)
calc_lumerr2 = np.full(n_rows, np.nan)

# Row masks shared by all three conditions:
has_teff = np.isfinite(NEA["NEA_st_teff"])
has_rad = np.isfinite(NEA["NEA_st_rad"])

# Condition 1: indices of exoplanets with stellar teff and rad listed:
c1 = np.where(has_teff & has_rad)

# Condition 2: indices of exoplanets with dT1 and dR1 listed as well:
has_tefferr1 = np.isfinite(NEA["NEA_st_tefferr1"])
has_raderr1 = np.isfinite(NEA["NEA_st_raderr1"])
c2 = np.where(has_teff & has_tefferr1 & has_rad & has_raderr1)

# Condition 3: indices of exoplanets with dT2 and dR2 listed as well:
has_tefferr2 = np.isfinite(NEA["NEA_st_tefferr2"])
has_raderr2 = np.isfinite(NEA["NEA_st_raderr2"])
c3 = np.where(has_teff & has_tefferr2 & has_rad & has_raderr2)

## Calculating luminosities

In [9]:
# Columns that feed every luminosity calculation below:
st_teff = NEA["NEA_st_teff"]
st_rad = NEA["NEA_st_rad"]

# Calculate luminosities for indices c1 (teff and rad both listed):
calc_lum[c1] = lum_eqn(st_teff.iloc[c1], st_rad.iloc[c1])

# Calculate +ve errors for indices c2 (uses the err1 columns):
calc_lumerr1[c2] = lum_unc_eqn(
    st_teff.iloc[c2],
    NEA["NEA_st_tefferr1"].iloc[c2],
    st_rad.iloc[c2],
    NEA["NEA_st_raderr1"].iloc[c2],
)

# Calculate -ve errors for indices c3 (uses the err2 columns):
calc_lumerr2[c3] = lum_unc_eqn(
    st_teff.iloc[c3],
    NEA["NEA_st_tefferr2"].iloc[c3],
    st_rad.iloc[c3],
    NEA["NEA_st_raderr2"].iloc[c3],
    positive_unc=False,
)

In [10]:
# Add the calculated luminosities and their errors to the NASA EA table
# ("NEAc_" marks values calculated here rather than listed in the NEA):
for new_column, calculated_values in (
    ("NEAc_lum", calc_lum),
    ("NEAc_lumerr1", calc_lumerr1),
    ("NEAc_lumerr2", calc_lumerr2),
):
    NEA[new_column] = calculated_values

## Conditions for effective temperature calculations

In [11]:
# NaN-filled arrays (one entry per table row) for teff and the +ve and -ve uncs.:
n_rows = NEA.shape[0]
calc_teff = np.full(n_rows, np.nan)
calc_tefferr1 = np.full(n_rows, np.nan)
calc_tefferr2 = np.full(n_rows, np.nan)

# Row masks shared by all three conditions:
has_lum = np.isfinite(NEA["NEA_st_lum"])
has_rad = np.isfinite(NEA["NEA_st_rad"])

# Condition 1: indices of exoplanets with stellar lum and rad listed:
c1 = np.where(has_lum & has_rad)
#print(c1[0].size)

# Condition 2: indices of exoplanets with dL1 and dR1 listed as well:
has_lumerr1 = np.isfinite(NEA["NEA_st_lumerr1"])
has_raderr1 = np.isfinite(NEA["NEA_st_raderr1"])
c2 = np.where(has_lum & has_lumerr1 & has_rad & has_raderr1)
#print(c2[0].size)

# Condition 3: indices of exoplanets with dL2 and dR2 listed as well:
has_lumerr2 = np.isfinite(NEA["NEA_st_lumerr2"])
has_raderr2 = np.isfinite(NEA["NEA_st_raderr2"])
c3 = np.where(has_lum & has_lumerr2 & has_rad & has_raderr2)
#print(c3[0].size)

## Calculating effective temperature

In [12]:
# Columns that feed every effective-temperature calculation below:
st_lum = NEA["NEA_st_lum"]
st_rad = NEA["NEA_st_rad"]

# Calculate teff for indices c1 (lum and rad both listed):
calc_teff[c1] = teff_eqn(st_lum.iloc[c1], st_rad.iloc[c1])
#print(calc_teff[c1].size)

# Calculate +ve errors for indices c2 (uses the err1 columns):
calc_tefferr1[c2] = teff_unc_eqn(
    st_lum.iloc[c2],
    NEA["NEA_st_lumerr1"].iloc[c2],
    st_rad.iloc[c2],
    NEA["NEA_st_raderr1"].iloc[c2],
)

# Calculate -ve errors for indices c3 (uses the err2 columns):
calc_tefferr2[c3] = teff_unc_eqn(
    st_lum.iloc[c3],
    NEA["NEA_st_lumerr2"].iloc[c3],
    st_rad.iloc[c3],
    NEA["NEA_st_raderr2"].iloc[c3],
    positive_unc=False,
)

In [13]:
# Add the calculated effective temperatures and their errors to the NASA EA table
# ("NEAc_" marks values calculated here rather than listed in the NEA):
for new_column, calculated_values in (
    ("NEAc_teff", calc_teff),
    ("NEAc_tefferr1", calc_tefferr1),
    ("NEAc_tefferr2", calc_tefferr2),
):
    NEA[new_column] = calculated_values

In [14]:
#print(NEA)

# Crossmatching the NASA data with rosetta (to get the Kepler names and KOI names)

## Loading the rosetta database

In [15]:
# I only need the planet name (in NEA) and the KOI name (in CKS):
rosetta_cols = ["pl_name","pl_koi_name"] #, "pl_kepler_name"]

# Reading the csv file:
rosetta = pd.read_csv(path.join(rosetta_dir, f"rosetta_{data_date}.csv"), usecols=rosetta_cols)
rosetta_len = rosetta["pl_name"].size
print('Length of rosetta:', rosetta_len)
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; wrapping it in print() only adds a stray "None" to the output.
print("\nInfo:")
rosetta.info()
print("\nDescribe\n", rosetta.describe())
print(rosetta)

Length of rosetta: 2732
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2732 entries, 0 to 2731
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pl_koi_name  2679 non-null   object
 1   pl_name      2732 non-null   object
dtypes: object(2)
memory usage: 42.8+ KB

Info:
 None

Describe
        pl_koi_name   pl_name
count         2679      2732
unique        2679      2732
top      K00001.01  TrES-2 b
freq             1         1
     pl_koi_name       pl_name
0      K00001.01      TrES-2 b
1      K00072.01   Kepler-10 b
2      K00072.02   Kepler-10 c
3      K00041.02  Kepler-100 b
4      K00041.01  Kepler-100 c
...          ...           ...
2727   K01881.01  Kepler-995 b
2728   K01882.01  Kepler-996 b
2729   K01883.01  Kepler-997 b
2730   K01885.01  Kepler-998 b
2731   K01886.01  Kepler-999 b

[2732 rows x 2 columns]


## Merging the NEA and rosetta

In [16]:
# Merging (left join on the planet name, so every NEA row is kept):
# NOTE: NEA is rebound to the merged table, so this cell should only be run
# once per fresh read of the NEA data.
NEA = pd.merge(NEA, rosetta, on="pl_name", how="left", indicator="NEAorRos", validate="one_to_one")
# indicator. bool or str, default False. If a str is given, a column with that name (here "NEAorRos") is added
#   to the output DataFrame with information on the source of each row.
# validate. str, optional. If specified, checks if merge is of specified type. "one_to_one" or "1:1": check if
#   merge keys are unique in both left and right datasets.
NEA_len = NEA.shape[0]
#print(NEA)

# How many NEA planets were successfully matched with rosetta?
# Use the "NEAorRos" indicator column.
# The column has a Categorical type with the value "left_only" for observations whose merge key only appears
# in the left DataFrame, "right_only" for observations whose merge key only appears in the right DataFrame,
# and "both" if the observation's merge key is found in both DataFrames.

# Results of the merge (row positions):
in_rosetta_NEA = np.where(NEA["NEAorRos"] == "both")[0]
in_NEA_only = np.where(NEA["NEAorRos"] == "left_only")[0]
#in_rosetta_only = np.where(NEA["NEAorRos"] == "right_only")[0] # redundant as how="left"

# With how="left" every row is either "both" or "left_only":
assert in_rosetta_NEA.size + in_NEA_only.size == NEA_len, "merge indicator counts do not add up to the table length"

print('Length of NEA df post merge:', NEA_len)
print("Number of Kepler (rosetta) planets in the NEA:", in_rosetta_NEA.size, "out of", rosetta_len, "Kepler planets.")
print("Number of non-Kepler (rosetta) planets in the NEA:", in_NEA_only.size, "out of", NEA_len, "NEA planets.")
print('in_rosetta_NEA.size + in_NEA_only.size =', in_rosetta_NEA.size + in_NEA_only.size)
#print("Number of unmatched stars from Rosetta:", in_rosetta_only.size)
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; wrapping it in print() only adds a stray "None" to the output.
print("\nInfo:")
NEA.info()
print("\nDescribe\n", NEA.describe())

Length of NEA df post merge: 5014
Number of Kepler (rosetta) planets in the NEA: 2732 out of 2732 Kepler planets.
Number of non-Kepler (rosetta) planets in the NEA: 2282 out of 5014 NEA planets.
in_rosetta_NEA.size + in_NEA_only.size = 5014
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5014 entries, 0 to 5013
Data columns (total 47 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   pl_name              5014 non-null   object  
 1   hostname             5014 non-null   object  
 2   gaia_designation     4828 non-null   object  
 3   NEA_sy_snum          5014 non-null   int64   
 4   NEA_sy_pnum          5014 non-null   int64   
 5   NEA_discoverymethod  5014 non-null   object  
 6   NEA_pl_period        4841 non-null   float64 
 7   NEA_pl_sma           2807 non-null   float64 
 8   NEA_pl_smaerr1       1996 non-null   float64 
 9   NEA_pl_smaerr2       1995 non-null   float64 
 10  NEA_pl_rad           3883 non-null 

# Saving the processed NEA df to a csv

In [17]:
# Saving to csv:
#NEA.to_csv( path.join(NEA_dir, f"NASA_EA_processed_" + data_date + f".csv") , index=False)