In [1]:
# Most of this should be deleted once GDR2_processing and NASAEA_processing are working.
# All crossmatching should be done in this file, with minimal pre-crossmatch processing.

import numpy as np
import pandas as pd

from os import path

import astropy.table #import tables
from astropy import units as u
from astropy import constants as const

from preprocessing.calc_stellar_params import calc_luminosity, calc_temp
from preprocessing.analyse_errs import classify_err

In [2]:
# Directory holding the data files read and written by this notebook.
# Every path below is derived from it, so moving the data only needs one edit.
data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"
# Alternative location:
#data_dir = "~/OneDrive/SEPHI_data/"

# NASA Exoplanet Archive (NASA EA) csv file:
NASAEA_dir = path.join(data_dir, "NASA_EA_2022_02_09.csv")
# The length of the header in the exoplanets csv file (rows skipped when reading it):
header_length = 116

# Exoplanet host names (from the NASA EA) and source ids:
host_names_dir = path.join(data_dir, "host_names.csv")
source_ids_dir = path.join(data_dir, "source_ids.csv")

# GAIA DR2 csv file:
dr2_directory = path.join(data_dir, "dr2-result.csv")

# CKS stars file:
CKS_stars_dir = path.join(data_dir, "CKSII_2017_stars.tsv")
CKS_col_start = 100 # the row containing the column headers

In [3]:
# Load the NASA EA exoplanet table and prepare it for the Gaia crossmatch.
#   pl_orbper  = orbital period [days]
#   pl_orbsmax = orbit semi-major axis [au]
exoplanets = (
    pd.read_csv(
        NASAEA_dir,
        skiprows=header_length,
        usecols=[
            "pl_name", "hostname", "gaia_id", "sy_snum", "sy_pnum",
            "discoverymethod", "pl_orbper", "pl_orbsmax", "pl_rade",
            "pl_bmasse", "pl_dens", "pl_orbeccen", "pl_eqt", "pl_orbincl",
            "st_teff", "st_tefferr1", "st_tefferr2",
            "st_rad", "st_raderr1", "st_raderr2",
            "st_mass", "st_met",
            "st_lum", "st_lumerr1", "st_lumerr2",
            "st_logg", "st_age", "st_ageerr1", "st_ageerr2",
            "sy_dist", "sy_plx", "sy_gaiamag",
        ],
    )
    # Planets whose host star has no Gaia identifier cannot be crossmatched.
    .dropna(subset=["gaia_id"])
    # Gaia tables call this identifier the 'designation':
    # "Gaia" + space + "DR" + no. + space + source_id,
    # e.g. Gaia DR2 3946945413106333696
    .rename(columns={"gaia_id": "designation"})
    # source_id (as listed in Gaia tables) is the number at the end of the
    # designation, e.g. 3946945413106333696
    .assign(
        source_id=lambda df: df["designation"]
        .str.rsplit(" ", n=1, expand=True)[1]
        .astype("int64")
    )
)

# Disabled: remove the space in 'hostname' (e.g. 11 Com -> 11Com):
#exoplanets["hostname"] = exoplanets["hostname"].str.replace(" ", "")

# Number of planets that have a Gaia designation:
print(len(exoplanets))

4734


In [4]:
# Create a csv containing the designations (one line per row of exoplanets):
# NOTE(review): exoplanets appears to hold one row per planet, so a host with
# several planets is written once per planet -- consider drop_duplicates() if
# the DR2 search does not need the repeats.
exoplanets[["designation"]].to_csv(path.join(data_dir, "designations.csv"), index=False)

# The designations were then used to search DR2 for the stars in the NASA EA.

In [5]:
# Read the Gaia DR2 search results, keeping the identifiers plus the effective
# temperature, radius and luminosity values with their lower/upper percentiles.
# NOTE(review): the config cell defines dr2_directory ("dr2-result.csv"), but
# this cell reads a different file name -- confirm which file is intended.
dr2 = pd.read_csv(
    path.join(data_dir, "dr2-all-exo-hosts2-result.csv"),
    usecols=[
        "designation", "source_id",
        "teff_val", "teff_percentile_lower", "teff_percentile_upper",
        "radius_val", "radius_percentile_lower", "radius_percentile_upper",
        "lum_val", "lum_percentile_lower", "lum_percentile_upper",
    ],
)


In [6]:
# Preview the first 100 rows of the DR2 table as read from disk.
print(dr2.iloc[:100])

                     designation            source_id   teff_val  \
0   Gaia DR2 3946945413106333696  3946945413106333696  4755.0000   
1   Gaia DR2 1696798367260229376  1696798367260229376  4248.7000   
2   Gaia DR2 1920113512486282240  1920113512486282240  4740.0000   
3   Gaia DR2 1385293808145621504  1385293808145621504  5282.0000   
4   Gaia DR2 2135550755683407232  2135550755683407232  5777.2500   
..                           ...                  ...        ...   
95  Gaia DR2 3107471240241980800  3107471240241980800        NaN   
96  Gaia DR2 3107471240236703104  3107471240236703104  5031.7354   
97  Gaia DR2 4285572454497223296  4285572454497223296        NaN   
98  Gaia DR2 4285572454508521600  4285572454508521600        NaN   
99  Gaia DR2 4285572454508522496  4285572454508522496  5245.5234   

    teff_percentile_lower  teff_percentile_upper  radius_val  \
0               4680.0000              5067.0000   17.181000   
1               4139.0000              4510.7600   30.2

In [7]:
# Check dr2 for repeated rows and remove them.

# dr2 is longer than exoplanets, so some of its rows must be repeats.
print("Length of dr2: ", len(dr2))

# duplicated() with no subset only flags rows whose values match an earlier
# row in every column, so it does not matter which copy is kept.
no_duplicates = dr2.duplicated().sum()
print("The number of duplicates: ", no_duplicates)
print("The number of stars in dr2 without duplicates: ", len(dr2) - no_duplicates)

# Keep the first occurrence of each row; the original index labels are retained.
dr2 = dr2.drop_duplicates()
print("The duplicates have been deleted. No. rows in dr2: ", len(dr2))

Length of dr2:  5732
The number of duplicates:  1456
The number of stars in dr2 without duplicates:  4276
The duplicates have been deleted. No. rows in dr2:  4276


In [8]:
# NB: Gaia lum [solar lum], NASA EA lum [log(solar lum)]

In [9]:
# Convert the dr2 percentile bounds into signed +/- uncertainties and give the
# columns gdr2_* names.

# Maps each original dr2 column name to its new gdr2_* name.
new_cols = {}

# (prefix of the columns in dr2, short name used in the new column names)
for gaia_name, short_name in (("teff", "teff"), ("radius", "rad"), ("lum", "lum")):
    val_col = f"{gaia_name}_val"
    upper_col = f"{gaia_name}_percentile_upper"
    lower_col = f"{gaia_name}_percentile_lower"

    # Express each bound as an offset from the value.
    dr2[upper_col] = dr2[upper_col] - dr2[val_col]  # should come out +ve
    dr2[lower_col] = dr2[lower_col] - dr2[val_col]  # should come out -ve

    # err1 is the upper (+) uncertainty, err2 the lower (-) one.
    new_cols[val_col] = f"gdr2_{short_name}"
    new_cols[upper_col] = f"gdr2_{short_name}err1"
    new_cols[lower_col] = f"gdr2_{short_name}err2"

# Apply the new names and delete the designation column from dr2.
dr2 = dr2.rename(columns=new_cols).drop(columns="designation")

In [10]:
# Inspect dr2 after the uncertainty conversion and column renaming.
print(dr2)

                source_id  gdr2_teff  gdr2_tefferr2  gdr2_tefferr1   gdr2_rad  \
0     3946945413106333696  4755.0000       -75.0000       312.0000  17.181000   
1     1696798367260229376  4248.7000      -109.7000       262.0600  30.262005   
2     1920113512486282240  4740.0000       -58.9000       106.5000  11.147492   
3     1385293808145621504  5282.0000       -72.6665       251.0000   1.003684   
4     2135550755683407232  5777.2500       -80.7500       112.7500   1.119800   
...                   ...        ...            ...            ...        ...   
5725   886479673641474304        NaN            NaN            NaN        NaN   
5726   886479673643096448  4408.3335       -68.3335        50.0000  31.158610   
5727   348020448377061376  6105.0000      -175.5000       114.3335   1.705353   
5730  3794167001116433152  4861.2500       -71.2500        78.5000  11.126557   
5731  4298361114750843904  4707.0000       -37.0000        36.0000  10.723029   

      gdr2_raderr2  gdr2_ra

In [11]:
# Merge with exoplanet df:
# A left join keeps every row of exoplanets; the columns coming from dr2 are
# NaN where dr2 has no row with that source_id.
# validate="many_to_one" makes pandas raise a MergeError if a source_id is
# repeated in dr2, instead of silently duplicating planet rows.
exoplanets = pd.merge(exoplanets, dr2, on="source_id", how="left", validate="many_to_one")

In [12]:
# Inspect the exoplanet table after merging in the dr2 columns.
print(exoplanets)

         pl_name  hostname                   designation  sy_snum  sy_pnum  \
0       11 Com b    11 Com  Gaia DR2 3946945413106333696        2        1   
1       11 UMi b    11 UMi  Gaia DR2 1696798367260229376        1        1   
2       14 And b    14 And  Gaia DR2 1920113512486282240        1        1   
3       14 Her b    14 Her  Gaia DR2 1385293808145621504        1        2   
4     16 Cyg B b  16 Cyg B  Gaia DR2 2135550755683407232        3        1   
...          ...       ...                           ...      ...      ...   
4729   ups And b   ups And   Gaia DR2 348020448377061376        2        3   
4730   ups And c   ups And   Gaia DR2 348020448377061376        2        3   
4731   ups And d   ups And   Gaia DR2 348020448377061376        2        3   
4732   ups Leo b   ups Leo  Gaia DR2 3794167001116433152        1        1   
4733    xi Aql b    xi Aql  Gaia DR2 4298361114750843904        1        1   

      discoverymethod    pl_orbper  pl_orbsmax  pl_rade   pl_bm