In [1]:
import pandas as pd
import numpy as np

from os import path

from astropy.io import ascii

In [2]:
# Folders the notebook reads its input catalogues from.
# The commented-out line under each entry is an alternative location that was
# used previously (presumably the same folder on another machine - TODO confirm).

# Processed NASA Exoplanet Archive (NEA) table:
NEA_data_dir = "~/OneDrive/SEPHI_data/"
#data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"

# CKS catalogue files:
CKS_data_dir = "~/OneDrive/SEPHI_data/CKS/"
#data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/CKS/"

# Destination for the crossmatch ("rosetta") table:
rosetta_dir = "~/OneDrive/CKS_crossmatch/"
#rosetta_dir = "~/OneDrive - Liverpool John Moores University/CKS_crossmatch/"

In [3]:
# CKS column names are specified in https://sites.astro.caltech.edu/~howard/cks/column-definitions.txt
# The file used contains both the stellar (CKS) and planet (Kepler) properties

# The indexes are in CKS in column 0
index_column = 0

# These columns are used to identify the stars and their planets
identifiers = ["id_starname", "id_koicand", "id_kepler_name", "koi_disposition"]
#koi_disposition       Exoplanet Archive Disposition
#id_koi not actually in the csv

# Columns from Q16 (defined for reference; not part of the current selection below):
Q16 = [
    "koi_period", "koi_period_err1", "koi_period_err2",
    "koi_impact", "koi_impact_err1", "koi_impact_err2",
    "koi_prad", "koi_prad_err1", "koi_prad_err2",
    "koi_sma", "koi_sma_err1", "koi_sma_err2",
    "koi_slogg", "koi_slogg_err1", "koi_slogg_err2",
    "koi_smet", "koi_smet_err1", "koi_smet_err2",
    "koi_srad", "koi_srad_err1", "koi_srad_err2",
    "koi_smass", "koi_smass_err1", "koi_smass_err2",
    "koi_sage", "koi_sage_err1", "koi_sage_err2",
]
# NB: the orbital period only directly appears in Q16
#koi_period            Orbital Period [days]
#koi_impact            Impact Parameter [float]
#koi_ror               Planet-Star Radius Ratio [float]
#koi_srho              Fitted Stellar Density [g/cm**3]
#koi_prad              Planetary Radius [Earth radii]
#koi_sma               Orbit Semi-Major Axis [AU]
#koi_teq               Equilibrium Temperature [K]
#koi_insol             Insolation Flux [Earth flux]
#koi_dor               Planet-Star Distance over Star Radius [float]
#koi_model_snr         Transit Signal-to-Noise
#koi_count             Number of Planets
#koi_num_transits      Number of Transits
#koi_steff             Stellar Effective Temperature [K]
#koi_slogg             Stellar Surface Gravity [log10(cm/s**2)]
#koi_smet              Stellar Metallicity [dex]
#koi_srad              Stellar Radius [Solar radii]

# Columns from CKS-I:
CKSI = [
    "cks_steff", "cks_steff_err1", "cks_steff_err2",
    "cks_slogg", "cks_slogg_err1", "cks_slogg_err2",
    "cks_smet", "cks_smet_err1", "cks_smet_err2",
]
#cks_fpsys      CKS False positive designation for star/system
#cks_fp                CKS False positive designation for candidate
#cks_slogg             CKS Stellar Surface Gravity [log10(cm/s**2)]
#cks_fpsys not in csv

# Columns from CKS-II:
CKSII = [
    "iso_steff", "iso_steff_err1", "iso_steff_err2",
    "iso_slogg", "iso_slogg_err1", "iso_slogg_err2",
    "iso_smet", "iso_smet_err1", "iso_smet_err2",
    "iso_srad", "iso_srad_err1", "iso_srad_err2",
    "iso_smass", "iso_smass_err1", "iso_smass_err2",
    "iso_sage", "iso_sage_err1", "iso_sage_err2",
    "iso_prad", "iso_prad_err1", "iso_prad_err2",
    "iso_sma", "iso_sma_err1", "iso_sma_err2",
    "iso_insol", "iso_insol_err1", "iso_insol_err2",
    "iso_teq", "iso_teq_err1", "iso_teq_err2",
]
#iso_steff             CKS+Isochrone-constrained Effective Temperature [K]
#iso_slogg             CKS+Isochrone-constrained Stellar Surface Gravity [log10(cm/s**2)]
#iso_insol             CKS+Isochrone-constrained Incident flux [Earth units]
#iso_teq               CKS+Isochrone-constrained Planet equilibrium temperature (bond albedo = 0.3)

# Choosing the data sets/columns to be used: the identifiers, then CKS-I, then CKS-II.
# Concatenation builds a NEW list. Writing `columns = identifiers` followed by
# `columns.extend(...)` would only alias the list, so `identifiers` itself would
# silently grow to contain every data column as well.
columns = identifiers + CKSI + CKSII
#print(columns)

# Load the merged CKS table, keeping only the selected columns.
# usecols must always contain the identifier columns so that the stars stay identifiable.
# (index_col=index_column is currently not applied.)
CKS_data = pd.read_csv(
    path.join(CKS_data_dir, "cks_physical_merged.csv"),
    usecols=columns,
)

In [4]:
# TODO: do not drop unconfirmed objects
# (Disabled) Drop unconfirmed objects from CKS_data (those with 'pl_kepler_name' = NaN):
#CKS_data.dropna(axis=0, subset=["id_kepler_name"], inplace=True)
# (Disabled) With only confirmed entries left, "koi_disposition" could then be dropped:
#CKS_data.drop(labels="koi_disposition", axis=1, inplace=True)

# Relabel the CKS columns by the catalogue they come from.
# The substitutions are applied to every column name, in this order:
#   koi_disposition -> status  (the planet's NEA confirmed/not confirmed status)
#   koi_ -> Q16_, with _period -> _pperiod and _impact -> _pimpact
#   cks_ -> CKSI_
#   iso_ -> CKSII_
_LABEL_SWAPS = (
    ("koi_disposition", "status"),
    ("koi_", "Q16_"),
    ("_period", "_pperiod"),
    ("_impact", "_pimpact"),
    ("cks_", "CKSI_"),
    ("iso_", "CKSII_"),
)


def _relabel(name):
    """Return `name` after applying every substitution in _LABEL_SWAPS, in order."""
    for old_part, new_part in _LABEL_SWAPS:
        name = name.replace(old_part, new_part)
    return name


columns_new = [_relabel(name) for name in columns]
#print(columns_new)

# Old name -> new name, paired by position.
cols_dict = dict(zip(columns, columns_new))
CKS_data.rename(columns=cols_dict, inplace=True)

# The identifier columns get explicit star (st_) / planet (pl_) names.
CKS_data.rename(
    columns={
        "id_starname": "st_koi_name",
        "id_koicand": "pl_koi_name",
        "id_kepler_name": "pl_kepler_name",
    },
    inplace=True,
)

# Number of rows in the CKS table.
CKS_length = CKS_data["st_koi_name"].size
print(CKS_data)

#TODO: you may also want to change the stellar parameter indicator from 's' to 'st' to align with NEA
# TODO: and change "koi_sma" = Orbit Semi-Major Axis [AU] to something without 's' in

     st_koi_name pl_koi_name pl_kepler_name          status  CKSI_steff  \
0         K00001   K00001.01     Kepler-1 b       CONFIRMED      5818.8   
1         K00002   K00002.01     Kepler-2 b       CONFIRMED      6448.7   
2         K00003   K00003.01     Kepler-3 b       CONFIRMED      4864.3   
3         K00006   K00006.01            NaN  FALSE POSITIVE      6348.1   
4         K00007   K00007.01     Kepler-4 b       CONFIRMED      5826.9   
...          ...         ...            ...             ...         ...   
2020      K05929   K05929.01            NaN       CANDIDATE      5503.7   
2021      K05932   K05932.01            NaN       CANDIDATE      5430.6   
2022      K05949   K05949.01            NaN       CANDIDATE      5790.6   
2023      K05953   K05953.01            NaN       CANDIDATE      5094.2   
2024      K06102   K06102.01            NaN       CANDIDATE      6322.9   

      CKSI_steff_err1  CKSI_steff_err2  CKSI_slogg  CKSI_slogg_err1  \
0                60.0       

In [21]:
# Checking whether confirmed planets alone have Kepler planet names.
# Current values of the status are CANDIDATE, FALSE POSITIVE, NOT DISPOSITIONED or CONFIRMED.
# A not dispositioned value corresponds to objects for which the disposition tests have not
# yet been completed. A false positive has failed at least one of the tests described in
# Batalha et al. (2012). A planetary candidate has passed all prior tests conducted to
# identify false positives, although this does not a priori mean that all possible tests
# have been conducted.

# Shared building blocks for the checks below.
cks_status = CKS_data["status"]
cks_has_kepler_name = CKS_data["pl_kepler_name"].notnull()

c1 = np.where(cks_status == "CONFIRMED")
print(c1[0].size, "confirmed Kepler planets in CKS.")

c2 = np.where(cks_status == "NOT DISPOSITIONED")
print(c2[0].size, "not dispositioned Kepler planets in CKS.")

print(c1[0].size + c2[0].size, "confirmed + not dispositioned Kepler planets in CKS.")

c3 = np.where(cks_status == "FALSE POSITIVE")
print(c3[0].size, "false positive Kepler planets in CKS.")


# Number of false positives that nevertheless carry a Kepler name.
c4 = np.where((cks_status == "FALSE POSITIVE") & cks_has_kepler_name)
print(c4[0].size)
# No false positives have kepler names

# Number of not dispositioned objects that carry a Kepler name.
c5 = np.where((cks_status == "NOT DISPOSITIONED") & cks_has_kepler_name)
print(c5[0].size)
# No not dispositioned planets (tests not yet completed) have kepler names

# Number of candidates that carry a Kepler name.
# This must report c6 (printing c5 here would just repeat the previous check).
c6 = np.where((cks_status == "CANDIDATE") & cks_has_kepler_name)
print(c6[0].size)
# Expectation: no candidates (yet to be confirmed) have kepler names - verify from the printed count

# Number of confirmed planets that lack a Kepler name.
c7 = np.where((cks_status == "CONFIRMED") & ~cks_has_kepler_name)
print(c7[0].size, "confirmed Kepler planets without Kepler names in CKS.")
# All confirmed planets have kepler names

c8 = np.where(CKS_data["pl_koi_name"].isna())
print("Planets in CKS with no pl_koi_name:", c8[0].size)
# all planets in CKS have koi names/numbers

1298 confirmed Kepler planets in CKS.
464 not dispositioned Kepler planets in CKS.
1762 confirmed + not dispositioned Kepler planets in CKS.
53 false positive Kepler planets in CKS.
0
0
0
0 confirmed Kepler planets without Kepler names in CKS.
Planets in CKS with no pl_koi_name: 0


In [6]:
# Read in rosetta (the table linking KOI names, Kepler names and Kepler IDs).
# skiprows=8 skips the first 8 lines of the file (presumably a comment preamble - TODO confirm).
# NOTE(review): the file is read from CKS_data_dir, whereas the (disabled) save at the
# bottom of this cell targets rosetta_dir.
rosetta = pd.read_csv(path.join(CKS_data_dir, "kep_conf_names_2022_04_13.csv"), skiprows=8)
rosetta_len = rosetta["pl_name"].size

# Planet names in NEA format examples:
# KOI-13 b
# Kepler-10 c
# KIC 5095269 b

# Prefix the raw column names with st_ (star) / pl_ (planet).
cols_renamed = {"kepler_name":"pl_kepler_name",
               "koi_name":"pl_koi_name",
               "kepid":"st_kepid"}

rosetta.rename( columns=cols_renamed, inplace=True )

# Creating new columns in rosetta that have identifiers in the same format that they are in in the NASA EA:

# Star's Kepler ID (Kepler Input Catalogue no.) in NEA format, e.g. "KIC 5095269".
rosetta["st_kepid_NEA"] = "KIC " + rosetta["st_kepid"].astype(str)
# The planet's Kepler ID is the star's Kepler ID + the planet letter
# (the last character of pl_kepler_name), e.g. "KIC 5095269 b".
rosetta["pl_kepid_NEA"] = rosetta["st_kepid_NEA"] + " " + rosetta["pl_kepler_name"].str[-1]

# Deleting the trailing ".??" characters of the planet KOI to get the KOI of the star.
rosetta["st_koi_name"] = rosetta["pl_koi_name"].str[:-3]
# Star's KOI in NEA format: drop the leading "K" and the zero padding, then prefix "KOI-"
# (letter O, matching the "KOI-13 b" form used by the NEA).
# No .astype(str) here: a missing KOI has to stay missing instead of becoming "KOI-nan".
rosetta["st_koi_NEA"] = "KOI-" + rosetta["st_koi_name"].str[1:].str.lstrip("0")

# Deleting the planet letter to get the Kepler- star name.
rosetta["st_kepler_name"] = rosetta["pl_kepler_name"].str.rsplit(" ", n=1, expand=True)[0]
# Deleting the planet letter to get the generic star name (that should be in the NASA EA).
rosetta["st_name"] = rosetta["pl_name"].str.rsplit(" ", n=1, expand=True)[0]

# The pl_kepler_name and pl_name are unique to each planet and are listed for all confirmed planets

print("Rosetta:\n", rosetta)

# Save the rosetta file to a csv:
#rosetta.to_csv( path.join(rosetta_dir, f"rosetta.csv") , index=False)

Rosetta:
       st_kepid pl_koi_name pl_kepler_name       pl_name  st_kepid_NEA  \
0     11446443   K00001.01     Kepler-1 b      TrES-2 b  KIC 11446443   
1     11904151   K00072.01    Kepler-10 b   Kepler-10 b  KIC 11904151   
2     11904151   K00072.02    Kepler-10 c   Kepler-10 c  KIC 11904151   
3      6521045   K00041.02   Kepler-100 b  Kepler-100 b   KIC 6521045   
4      6521045   K00041.01   Kepler-100 c  Kepler-100 c   KIC 6521045   
...        ...         ...            ...           ...           ...   
2725   4035640   K01881.01   Kepler-995 b  Kepler-995 b   KIC 4035640   
2726   6205228   K01882.01   Kepler-996 b  Kepler-996 b   KIC 6205228   
2727  11758544   K01883.01   Kepler-997 b  Kepler-997 b  KIC 11758544   
2728  11413812   K01885.01   Kepler-998 b  Kepler-998 b  KIC 11413812   
2729   9549648   K01886.01   Kepler-999 b  Kepler-999 b   KIC 9549648   

        pl_kepid_NEA st_koi_name st_koi_NEA st_kepler_name     st_name  
0     KIC 11446443 b      K00001      K0

In [7]:
# Load the processed NASA Exoplanet Archive (NEA) table.
NEA = pd.read_csv(
    path.join(NEA_data_dir, "NASA_EA_processed.csv")
)
# Number of rows (pl_name is expected to be unique to each planet).
NEA_len = len(NEA["pl_name"])
print(NEA)

         pl_name  hostname                   designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
4729   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
4730   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
4731   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
4732   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
4733    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod  NEA_porbper  NEA_porbsmax  NEA_prade  NE

In [8]:
# Conditions
# TODO: see whether the merge on pl_name picked all of these out
# TODO: maybe update the NEA and see whether you get more matches

nea_planet_names = NEA["pl_name"]

# Indices where the NEA lists the planet by its KOI (name starting "KOI").
# -> match these using the KOI
c1 = np.where(nea_planet_names.str.startswith("KOI", na=False))
print(c1[0].size)

# Indices where the NEA lists the planet by its Kepler name (name starting "Kepler").
# -> match these using the Kepler- name
c2 = np.where(nea_planet_names.str.startswith("Kepler", na=False))
print(c2[0].size)

# Indices where the NEA lists the planet by its Kepler ID (name starting "KIC").
# -> match these using the KIC name
c3 = np.where(nea_planet_names.str.startswith("KIC", na=False))
print(c3[0].size)

# Total number of NEA planets carrying one of the three Kepler-style names:
sum(idx[0].size for idx in (c1, c2, c3))

29
2649
15


2693

In [9]:
# Making a trimmer version of rosetta:

# All planets have a planet name, Kepler name, and Kepler ID (when combined with star name).
# Not all have KOI names.
# So the smaller version of rosetta keeps the planet name, the planet KOI name and the
# planet Kepler name only.
#TODO: I might not need to include pl_kepler_name in rosetta2
_rosetta2_cols = ("pl_name", "pl_koi_name", "pl_kepler_name")
# One row per selected column, then flipped so the selected columns become columns again.
rosetta2 = pd.DataFrame(data=[rosetta[col] for col in _rosetta2_cols]).T
rosetta2_len = len(rosetta2["pl_name"])
print(rosetta2)

           pl_name pl_koi_name pl_kepler_name
0         TrES-2 b   K00001.01     Kepler-1 b
1      Kepler-10 b   K00072.01    Kepler-10 b
2      Kepler-10 c   K00072.02    Kepler-10 c
3     Kepler-100 b   K00041.02   Kepler-100 b
4     Kepler-100 c   K00041.01   Kepler-100 c
...            ...         ...            ...
2725  Kepler-995 b   K01881.01   Kepler-995 b
2726  Kepler-996 b   K01882.01   Kepler-996 b
2727  Kepler-997 b   K01883.01   Kepler-997 b
2728  Kepler-998 b   K01885.01   Kepler-998 b
2729  Kepler-999 b   K01886.01   Kepler-999 b

[2730 rows x 3 columns]


In [10]:
# Merging the NEA and rosetta:
#  - how="left" keeps every NEA planet, whether or not rosetta2 knows it;
#  - indicator="NEAorRos" adds a categorical column of that name recording where each
#    row's merge key was found: "left_only", "right_only" or "both";
#  - validate="one_to_one" makes pandas check that pl_name is unique in both tables.
exoplanets = NEA.merge(
    rosetta2,
    how="left",
    on="pl_name",
    indicator="NEAorRos",
    validate="one_to_one",
)
exoplanets_len = len(exoplanets["pl_name"])
print(exoplanets)

# How many NEA planets were successfully matched with rosetta2?
# Rows flagged "both" in the indicator column are the Kepler planets found in the NEA.
in_rosetta = np.where(exoplanets["NEAorRos"] == "both")
n_in_rosetta = in_rosetta[0].size
print("Number of Kepler (rosetta2) planets in the NEA:", n_in_rosetta, "out of", rosetta2_len, "Kepler planets.")

# Then this must include planets that aren't CONFIRMED.
# TODO: don't delete unconfirmed planets from CKS
# Then I can't use pl_kepler_name to match exoplanets with cks...
# What else should I use? The conditions I wrote? That doesn't include planets with different names though :'(

         pl_name  hostname                   designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
4729   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
4730   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
4731   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
4732   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
4733    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod  NEA_porbper  NEA_porbsmax  NEA_prade  NE

In [11]:
# Crossmatching the exoplanets df with CKS on the planet KOI name.
# how="left" keeps every row of exoplanets; the "NEAorCKS" indicator column records
# whether the KOI name was also found in CKS ("both") or not ("left_only").
# NOTE(review): exoplanets and CKS_data both carry a pl_kepler_name column, so the
# merged frame holds two suffixed copies of it rather than a single column.
exoplanets = exoplanets.merge(
    CKS_data,
    how="left",
    on="pl_koi_name",
    indicator="NEAorCKS",
)
print(exoplanets)

# The indices of CKS planets in the exoplanets database:
in_CKS = np.where(exoplanets["NEAorCKS"] == "both")
n_in_CKS = in_CKS[0].size
print("Number of CKS planets crossmatched with the exoplanets df (i.e. the NEA atm):", n_in_CKS, "out of", CKS_length, "Kepler planets in CKS.")
# TODO: why have so few planets been matched???
# I feel like all CKS planets should be in the NEA...
# TODO: how many CKS planets have pl_koi_name ?

         pl_name  hostname                   designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
4729   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
4730   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
4731   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
4732   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
4733    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod  NEA_porbper  NEA_porbsmax  NEA_prade  NE

In [12]:
# TODO:
# load the CKS data
# crossmatch exoplanets w cks on kepler_name/pl_koi_name
# how many planets were successfully crossmatched?
# see whether you can match any more kepler planets by
# a) using some of the np.where() stuff - make a plan as to how you would do this
# b) redownloading NEA
# move rosetta generation and crossmatch with NEA to NEA processing
# save rosetta to a csv
# save exoplanets with kepler names to a  csv in NEA processing