In [1]:
import numpy as np
import pandas as pd
#from math import pi, sqrt
from os import path


In [2]:
# Data directories:
NEA_dir = "~/OneDrive/SEPHI_data/NASA_EA/"
DR2_dir = "~/OneDrive/SEPHI_data/Gaia/DR2/"
CKS_dir = "~/OneDrive/SEPHI_data/CKS/"
# NB: I don't have the CKS data saved yet

data_date = "2022_04_28"
#data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"

In [3]:
# Read the NASA Exoplanet Archive data:
NEA = pd.read_csv(path.join(NEA_dir, f"NASA_EA_processed_"+ data_date + ".csv")) 
print("Length of NEA:", NEA.shape[0])
print(NEA.head())

Length of NEA: 5014
      pl_name  hostname              gaia_designation  sy_snum  sy_pnum  \
0    11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1    11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2    14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3    14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4  16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   

   discoverymethod  NEA_pl_orbper  NEA_pl_sma  NEA_pl_rade  NEA_pl_bmasse  \
0  Radial Velocity      326.03000        1.29          NaN      6165.6000   
1  Radial Velocity      516.21997        1.53          NaN      4684.8142   
2  Radial Velocity      185.84000        0.83          NaN      1525.5000   
3  Radial Velocity     1773.40002        2.93          NaN      1481.0878   
4  Radial Velocity      798.50000        1.66          NaN       565.7374   

   ...       gaia_source_id    NEAc_lum  NEAc_lumerr1  NEAc_lumerr

In [4]:
# Read the Gaia DR2 data:
DR2 = pd.read_csv(path.join(DR2_dir, f"DR2_processed_" + data_date + ".csv"))
DR2_len = DR2.shape[0]
print("Length of Gaia DR2:", DR2_len)
print(DR2.head())

Length of Gaia DR2: 4196
        gaia_source_id  gdr2_st_teff  gdr2_st_tefferr2  gdr2_st_tefferr1  \
0  3946945413106333696     4755.0000          -75.0000            312.00   
1  1696798367260229376     4248.7000         -109.7000            262.06   
2  1920113512486282240     4740.0000          -58.9000            106.50   
3  2135550755683407232     5777.2500          -80.7500            112.75   
4  4342464209753404416     4296.6665          -91.6665             72.00   

   gdr2_st_rad  gdr2_st_raderr2  gdr2_st_raderr1  gdr2_st_lum  \
0    17.181000        -2.050695         0.555086   135.954530   
1    30.262005        -3.414095         1.625389   268.852720   
2    11.147492        -0.484541         0.282292    56.514830   
3     1.119800        -0.042461         0.031972     1.258521   
4    26.555468        -0.868109         1.170406   216.535550   

   gdr2_st_lumerr2  gdr2_st_lumerr1  
0        -3.180190          3.18019  
1        -5.003600          5.00360  
2        -0.6

In [5]:
# Read CKS
#CKS = pd.read_csv(path.join(DR2_dir, f"CKS_processed_" + data_date + ".csv"))
#print("Length of CKS:", CKS.shape[0])

In [6]:
## Crossmatching

In [13]:
# Merging the NASA EA and Gaia DR2 to start the exoplanets data frame:
exoplanets = pd.merge(NEA, DR2, on="gaia_source_id", how="left", indicator = "NEAorDR2", validate="many_to_one")
print(exoplanets.shape[0])
print(exoplanets)

# The indicies of stars in the NEA that are also in the DR2 df (now merged into the exoplanets df):
matched_DR2 = np.where(exoplanets["NEAorDR2"] == "both")
not_matched_DR2 = np.where(exoplanets["NEAorDR2"] == "right_only")
#print("Number of stars in DR2 df merged with NEA:", in_DR2[0].size, "out of", DR2_len , "stars in DR2 df." )
print("Stars in Gaia DR2 were matched with", matched_DR2[0].size, "exoplanets in the NEA.\nTotal number of stars in Gaia DR2 df:", DR2_len)
print("Number of unmatched stars from Gaia DR2 df:", not_matched_DR2[0].size)

5014
         pl_name  hostname              gaia_designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
5009   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
5010   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
5011   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
5012   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
5013    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod  NEA_pl_orbper  NEA_pl_sma  NEA_pl_r

In [8]:
# TODO: Read the CKS data
# TODO: Apply CKS code
# TODO: classify which parameters are the 'best'
# TODO: calculate SEPHI from best params

In [9]:
## Classifying the uncertainties

In [10]:
# Calculating the mean percentage error for each version of stellar lum:
NEA_comb_errs = combined_percent_errs(exoplanets["NEA_lum"], exoplanets["NEA_lumerr1"], exoplanets["NEA_lumerr2"] )
NEAc_comb_errs = combined_percent_errs(exoplanets["NEAc_lum"], exoplanets["NEAc_lumerr1"], exoplanets["NEAc_lumerr2"] )

# Calculating the mean percentage error for each version of stellar teff:


# Calculating the mean percentage error for each version of stellar age:


NameError: name 'combined_percent_errs' is not defined

In [None]:
# Calculating the percentage errors

# Luminosity:
lum_percentage_errs = pd.DataFrame(data=[NEA_comb_errs, NEAc_comb_errs]).transpose()
# TODO: Is there a way to do this without transposing?
print(lum_percentage_errs)

# Effective temp:
#teff_percentage_errs =

# Stellar age:
#age_percentage_errs =

In [None]:
# Find the location of the minimum value in each row

# Luminosity:
lum_flags = lum_percentage_errs.idxmin(axis=1, skipna=True)
# Returns the column name/index of the smallest value in each row
# With skipna=True, NaNs are ignored if there are real values in the row
# If all values in the row are NaN, then NaN is returned
print(lum_flags)

# Effective temp:
#teff_flags = teff_percentage_errs.idxmin(axis=1, skipna=True)

# Stellar age:
#age_flags = age_percentage_errs.idxmin(axis=1, skipna=True)

In [None]:
# TODO: add flags columns to exoplanets df
#exoplanets["lum_flags"] = lum_flags
#[["lum_flags", "teff_flags", "age_flags"]] 

# TODO: make arrays of the 'best' values using the flags