## Dependencies:

In [9]:
import numpy as np
import pandas as pd
#from math import pi, sqrt
from os import path
#from analyse_errs import calc_percent_errs, combined_percent_errs
from analyse_errs2 import calc_percent_errs, mean_percent_errs

## Data directories:

In [10]:
# On PC:
#NEA_dir = path.expanduser("~/OneDrive/SEPHI_data/NASA_EA/")
#DR2_dir = path.expanduser("~/OneDrive/SEPHI_data/Gaia/DR2/")
#CKS_dir = path.expanduser("~/OneDrive/SEPHI_data/CKS/")
#exoplanets_dir = path.expanduser("~/OneDrive/SEPHI_data/exoplanets/")

# On my laptop:
NEA_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/NASA_EA/")
DR2_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/Gaia/DR2/")
CKS_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/CKS/")
exoplanets_dir = path.expanduser("~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/exoplanets/")

data_date = "2022_04_28"

# Loading data:

In [11]:
# Read the NASA Exoplanet Archive data:
NEA = pd.read_csv(path.join(NEA_dir, f"NASA_EA_processed_"+ data_date + ".csv")) 
print("Length of NEA:", NEA.shape[0])
#print(NEA.head())

Length of NEA: 5014


In [12]:
# Read the Gaia DR2 data:
DR2 = pd.read_csv(path.join(DR2_dir, f"DR2_processed_" + data_date + ".csv"))
DR2_len = DR2.shape[0]
print("Length of Gaia DR2:", DR2_len)
#print(DR2.head())

Length of Gaia DR2: 4196


In [13]:
# Read in CKS data:
CKS = pd.read_csv(path.join(CKS_dir, f"CKS_processed_"+ data_date + ".csv")) 
CKS_len = CKS.shape[0]
print("Length of CKS:", CKS_len)
#print(CKS.head())

Length of CKS: 2025


In [14]:
# Read in Rosetta (the file with planet name, KOI name, and Kepler name):
rosetta = pd.read_csv(path.join(CKS_dir, f"rosetta_" + data_date + ".csv"))
rosetta_len = rosetta["pl_name"].size
print("Length of rosetta:", rosetta_len)

# Trimming rosetta down so it only has the planet name (in NEA) and Kepler ID (in CKS):
rosetta2 = pd.DataFrame(data= [ rosetta["pl_name"], rosetta["pl_koi_name"] ] ).transpose()
#, rosetta["pl_kepler_name"] 
rosetta2_len = rosetta2["pl_name"].size
print("Length of rosetta2:", rosetta2_len)
#print(rosetta2)

Length of rosetta: 2732
Length of rosetta2: 2732


# Crossmatching

In [15]:
# Merging the NASA EA and Gaia DR2 to start the exoplanets data frame:
exoplanets = pd.merge(NEA, DR2, on="gaia_source_id", how="left", indicator = "NEAorDR2", validate="many_to_one")
print("Length of exoplanets:", exoplanets.shape[0])
#print(exoplanets)

# The indicies of stars in the NEA that are also in the DR2 df (now merged into the exoplanets df):
matched_DR2 = np.where(exoplanets["NEAorDR2"] == "both")
#not_matched_DR2 = np.where(exoplanets["NEAorDR2"] == "right_only")
#print("Number of stars in DR2 df merged with NEA:", in_DR2[0].size, "out of", DR2_len , "stars in DR2 df." )
print("Stars in Gaia DR2 were matched with", matched_DR2[0].size, "exoplanets in the NEA.\nTotal number of stars in Gaia DR2 df:", DR2_len)
#print("Number of unmatched stars from Gaia DR2 df:", not_matched_DR2[0].size)
# TODO: the no. unmatched isn'ta ccurate

Length of exoplanets: 5014
Stars in Gaia DR2 were matched with 4514 exoplanets in the NEA.
Total number of stars in Gaia DR2 df: 4196


In [16]:
# Merging exoplanets with CKS to start the exoplanets data frame:
exoplanets = pd.merge(exoplanets, CKS, on="pl_koi_name", how="left", indicator = "EXOorCKS")
print(exoplanets.shape[0])
print(exoplanets)

# The indicies of stars in the NEA that are also in the DR2 df (now merged into the exoplanets df):
matched_CKS = np.where(exoplanets["EXOorCKS"] == "both")
#not_matched_CKS = np.where(exoplanets["EXOorCKS"] == "right_only")
print("Number of CKS planets crossmatched with the exoplanets df (i.e. the NEA atm):", matched_CKS[0].size, "out of", CKS_len, "Kepler planets in CKS.")
#print("Number of unmatched stars from CKS:", not_matched_CKS[0].size)

5014
         pl_name  hostname              gaia_designation  NEA_sy_snum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696            2   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376            1   
2       14 And b    14 And  Gaia DR2 1920113512486282240            1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504            1   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232            3   
...          ...       ...                           ...          ...   
5009   ups And b   ups And   Gaia DR2 348020448377061376            2   
5010   ups And c   ups And   Gaia DR2 348020448377061376            2   
5011   ups And d   ups And   Gaia DR2 348020448377061376            2   
5012   ups Leo b   ups Leo  Gaia DR2 3794167001116433152            1   
5013    xi Aql b    xi Aql  Gaia DR2 4298361114750843904            1   

      NEA_sy_pnum NEA_discoverymethod  NEA_pl_period  NEA_pl_sma  \
0               1     Radial Velocity     326.0300

In [17]:
# Why does it say 0 
#unmatched stars from CKS?
# ^ I'm getting rid of the instances where the entry is only in the 
#right-hand column
# some of the CKS planets are unconfirmed! the NEA only containes 
#confirmed planets
# I do not delete the unconfirmed planets from CKS because the status 
#isn't up to date. There are 1298 confirmed and 464 not dispositioned
# planets in CKS 2022_04_28 (numbers in CKS_processing)

# Classifying the uncertainties:

In [18]:
## Classifying the uncertainties

# TODO: classify which parameters are the 'best'
# TODO: calculate SEPHI from best params
# pl_mass, pl_rad, pl_sma, st_teff, st_lum, st_mass, st_age, st_met (st_met less important atm)
# do st_mass first, as a tester
# then st_age

exo_cols = list(exoplanets)
parameters = ['st_mass', 'st_age', 'st_lum', 'st_teff', 'st_met', 'pl_mass', 'pl_rad', 'pl_sma']
catalogues = ['NEA_', 'GDR_', 'Q16_', 'CKSI_', 'CKSII_'] # TODO: I could loop over these strings to look over the column headers
#print(cols)

In [19]:
print(len(exoplanets))

5014


In [None]:
# Method for flagging best values:
def flag_best_values(df, param, catalogues):
    """
    df = data frame
    param = the parameter
    catalogues = list of catalogue names e.g. 'NEA_'
    """
    
    means = pd.DataFrame()
    
    for l in catalogues:
        
        param_cols = []
        string = l + param
        exo_cols = list(df)
        
        for j in exo_cols:
            
            if j.__contains__(string):
                
                param_cols.append(j)
                
        if len(param_cols) > 0:
            
            means[string] = mean_percent_errs(df[param_cols])
            
            #c1 = np.where(np.isfinite(means[string]))
            #print("no. of finite means for", string, c1[0].size)
            
    flags = means.idxmin(axis=1, skipna=True)
    # Returns the column name/index of the smallest value in each row
    # With skipna=True, NaNs are ignored if there are real values in the row
    # If all values in the row are NaN, then NaN is returned
    #c2 = np.where(~flags.isnull())[0]
    #print('flags:\n', flags)
    #print(len(flags))
    #print('no. not null flags:', c2.size)
    
    return flags

In [None]:
# Looping over all parameters
all_flags2 = pd.DataFrame()

for i in parameters:
    flags = flag_best_values(exoplanets, 'st_mass', catalogues)
    all_flags2[i] = flags
    
print(all_flags2)

          st_mass       st_age       st_lum      st_teff       st_met  \
0     NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
1     NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
2     NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
3     NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
4     NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
...           ...          ...          ...          ...          ...   
5009          NaN          NaN          NaN          NaN          NaN   
5010          NaN          NaN          NaN          NaN          NaN   
5011          NaN          NaN          NaN          NaN          NaN   
5012  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass  NEA_st_mass   
5013          NaN          NaN          NaN          NaN          NaN   

          pl_mass       pl_rad       pl_sma  
0     NEA_st_mass  NEA_st_mass  NEA_st_mass  
1     NEA_st_mass  NEA_st_mass 

In [25]:
# Rename the best cols
#best_cols = [str(i) + '_best' for i in parameters] # generates the below
#print(best_cols)
best_cols = ['st_mass_best', 'st_age_best', 'st_lum_best', 'st_teff_best', 'st_met_best', 'pl_mass_best', 'pl_rad_best', 'pl_sma_best']
best_cols_dict = {parameters[i]: best_cols[i] for i in range(len(parameters))}
all_flags.rename(columns=best_cols_dict, inplace=True)

# Adding the all_flags df to exoplanets:
exoplanets = pd.concat([exoplanets, all_flags], axis=1)

print(exoplanets)

         pl_name  hostname              gaia_designation  NEA_sy_snum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696            2   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376            1   
2       14 And b    14 And  Gaia DR2 1920113512486282240            1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504            1   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232            3   
...          ...       ...                           ...          ...   
5009   ups And b   ups And   Gaia DR2 348020448377061376            2   
5010   ups And c   ups And   Gaia DR2 348020448377061376            2   
5011   ups And d   ups And   Gaia DR2 348020448377061376            2   
5012   ups Leo b   ups Leo  Gaia DR2 3794167001116433152            1   
5013    xi Aql b    xi Aql  Gaia DR2 4298361114750843904            1   

      NEA_sy_pnum NEA_discoverymethod  NEA_pl_period  NEA_pl_sma  \
0               1     Radial Velocity     326.030000   

In [37]:
# Method for selecting the best parameter:

def get_best_values(df, param):
    
    param = param + '_best'
    best_values = np.zeros(len(df), dtype=np.float64)
    
    for m in range(len(df)):
        if df[param].iloc[m] != df[param].iloc[m]:
            
            best_values[m] = np.nan
            
        else:
            best_values[m] = df[ df[param].iloc[m] ].iloc[m]
            
    return best_values

In [38]:
masses = get_best_values(exoplanets, 'st_mass')

In [39]:
print(masses)

[2.7  2.78 2.2  ...  nan 1.48  nan]


In [42]:
# Looping over all parameters

best_values_all2 = pd.DataFrame()
for i in parameters:
    best_values = get_best_values(exoplanets, i)
    best_values_all2[i] = best_values 
    
print(best_values_all2)

      st_mass  st_age      st_lum  st_teff  st_met     pl_mass  pl_rad  \
0        2.70     NaN  174.984669   4742.0  -0.350  6165.60000     NaN   
1        2.78     NaN         NaN   4213.0     NaN  4684.81420     NaN   
2        2.20     NaN         NaN   4813.0  -0.240         NaN     NaN   
3        0.90     NaN         NaN   5338.0     NaN  1481.08780     NaN   
4        1.08     NaN         NaN   5750.0     NaN   565.73740     NaN   
...       ...     ...         ...      ...     ...         ...     ...   
5009      NaN     NaN         NaN      NaN     NaN   218.53100     NaN   
5010      NaN     NaN         NaN      NaN     NaN   629.59500     NaN   
5011      NaN     NaN         NaN      NaN     NaN  1313.22000     NaN   
5012     1.48     NaN   63.095734      NaN  -0.200   162.09249     NaN   
5013      NaN     NaN         NaN   4780.0  -0.205         NaN     NaN   

        pl_sma  
0     1.290000  
1     1.530000  
2          NaN  
3     2.930000  
4     1.660000  
...      

In [12]:
# The original alltogether method: generates flags and best_values:

best_values_all = pd.DataFrame()
all_flags = pd.DataFrame()
for i in parameters:
    means = pd.DataFrame()
    for l in catalogues:
        
        param_cols = []
        string = l + i # e.g. 'NEA_' + 'st_mass'
        #print(string)
        #print(type(string))
        
        for j in exo_cols:
            if j.__contains__(string):
                # add string to param+cat_cols
                param_cols.append(j)
        #print(param_cols) # [value, err1, err2]
        #print(len(param_cols))
        
        if len(param_cols) > 0:
            
            #print(exoplanets[param_cols])
            means[string] = mean_percent_errs(exoplanets[param_cols])
        
            c1 = np.where(np.isfinite(means[string]))
            print("no. of finite means for", string, c1[0].size)
            
    #print('means:\n', means)
        
    flags = means.idxmin(axis=1, skipna=True)
    c2 = np.where(~flags.isnull())[0]
    #print('flags:\n', flags)
    #print(len(flags))
    #print('no. not null flags:', c2.size)
    #print(c2[10])
        
    all_flags[i] = flags

    best_values = [] # TODO: change this to np.zeros(len(exoplanets)) in order to use proper floats
        
    for m in range(len(flags)):
        #print('m', m)
        #print('flags[m]', flags[m])
        #print('iloc', flags.iloc[m])
        
        if flags[m] != flags[m]: # NaN != NaN
            best_values.append(np.nan) # TODO: this isn't working
        else:
            best_values.append(exoplanets[flags[m]].iloc[m]) #some of the flags are nan
            # for st_mass, the best values are being shown to a couple of dp
            # TODO: change to: best_values[m] = exoplanets[flags[m]].iloc[m]
        
    #print(best_values)
    best_values_all[i] = best_values # TODO: you will want to add
    
print('all flags:\n', all_flags)
print('best_values all:\n', best_values_all)
    


no. of finite means for NEA_st_mass 3941
no. of finite means for CKSII_st_mass 1445
no. of finite means for NEA_st_age 2162
no. of finite means for CKSII_st_age 1445
no. of finite means for NEA_st_lum 786
no. of finite means for NEA_st_teff 4458
no. of finite means for CKSI_st_teff 1445
no. of finite means for CKSII_st_teff 1445


  percent_errs1 =  np.absolute( np.multiply(errs1, values**(-1)) ) * 100
  percent_errs2 =  np.absolute( np.multiply(errs2, values**(-1)) ) * 100
  percent_errs =  np.absolute( np.multiply(errs, values**(-1)) ) * 100


no. of finite means for NEA_st_met 3139
no. of finite means for NEA_pl_mass 1913
no. of finite means for NEA_pl_rad 3501
no. of finite means for CKSII_pl_rad 1443
no. of finite means for NEA_pl_sma 1995
no. of finite means for CKSII_pl_sma 1443
all flags:
           st_mass st_age      st_lum      st_teff      st_met      pl_mass  \
0     NEA_st_mass    NaN  NEA_st_lum  NEA_st_teff  NEA_st_met  NEA_pl_mass   
1     NEA_st_mass    NaN         NaN  NEA_st_teff         NaN  NEA_pl_mass   
2     NEA_st_mass    NaN         NaN  NEA_st_teff  NEA_st_met          NaN   
3     NEA_st_mass    NaN         NaN  NEA_st_teff         NaN  NEA_pl_mass   
4     NEA_st_mass    NaN         NaN  NEA_st_teff         NaN  NEA_pl_mass   
...           ...    ...         ...          ...         ...          ...   
5009          NaN    NaN         NaN          NaN         NaN  NEA_pl_mass   
5010          NaN    NaN         NaN          NaN         NaN  NEA_pl_mass   
5011          NaN    NaN         NaN     

In [13]:
# TODO: separate the flags and best values methods?
# add the flags to the exoplanets database? - Y, do this, could be useful for later
# then get list of best values?
# use the best values to make the cuts in plotting

# Save data:

In [None]:
exoplanets.to_csv( path.join( exoplanets_dir, f"exoplanets_" + data_date + f".csv"), index=False)