In [1]:
import numpy as np
import pandas as pd

from os import path

In [2]:
# Data directory:
# Base folder for the CSV files used in this notebook; file names are appended
# to it with path.join() when reading and writing.
# NOTE(review): path.join() leaves the leading "~" untouched, so this relies on
# the consumer of the path (pandas here) expanding it - confirm before passing
# data_dir to other file APIs.
# Alternative location, kept for reference:
#data_dir = "~/Scarlett/OneDrive - Liverpool John Moores University/SEPHI_data/"
data_dir = "~/OneDrive/SEPHI_data/"

In [3]:
# Read the dr2 data:
# Only the two identifier columns plus the value / lower-percentile /
# upper-percentile triplets for teff, radius and lum are loaded from the file.
dr2_columns = [
    "designation", "source_id",
    "teff_val", "teff_percentile_lower", "teff_percentile_upper",
    "radius_val", "radius_percentile_lower", "radius_percentile_upper",
    "lum_val", "lum_percentile_lower", "lum_percentile_upper",
]
dr2_csv = path.join(data_dir, "dr2-all-exo-hosts2-result.csv")
dr2 = pd.read_csv(dr2_csv, usecols=dr2_columns)

In [4]:
# Checking for duplicates in dr2:

# dr2 has more rows than exoplanets, so some rows must be repeated.
n_rows_raw = len(dr2)
print("Length of dr2: ", n_rows_raw)

# Count the rows that are exact copies (every loaded column equal) of an
# earlier row.
no_duplicates = dr2.duplicated().sum()
print("The number of duplicates: ", no_duplicates)
# Exact copies are interchangeable, so it does not matter which one survives.
print("The number of stars in dr2 without duplicates: ", n_rows_raw - no_duplicates)

# Deleting duplicates from dr2. The pandas defaults apply: all columns are
# compared, the first occurrence is kept and the index labels are not reset.
dr2.drop_duplicates(inplace=True)
print("The duplicates have been deleted. No. rows in dr2: ", len(dr2))

Length of dr2:  5732
The number of duplicates:  1456
The number of stars in dr2 without duplicates:  4276
The duplicates have been deleted. No. rows in dr2:  4276


In [5]:
# Changing the dr2 uncertainties to +/- uncertainties

# Each (value, lower percentile, upper percentile) triplet gets the same
# treatment: the percentile columns are overwritten with their offset from the
# value column. The value column itself is left unchanged.
percentile_triplets = (
    ("teff_val", "teff_percentile_lower", "teff_percentile_upper"),
    ("radius_val", "radius_percentile_lower", "radius_percentile_upper"),
    ("lum_val", "lum_percentile_lower", "lum_percentile_upper"),
)
for val_col, lower_col, upper_col in percentile_triplets:
    dr2[upper_col] = dr2[upper_col] - dr2[val_col]  # should come out +ve
    dr2[lower_col] = dr2[lower_col] - dr2[val_col]  # should come out -ve

# Renaming the columns: the value becomes gdr2_<name>, the upper offset
# gdr2_<name>err1 and the lower offset gdr2_<name>err2.
new_cols = {
    "teff_val": "gdr2_teff",
    "teff_percentile_upper": "gdr2_tefferr1",
    "teff_percentile_lower": "gdr2_tefferr2",
    "radius_val": "gdr2_rad",
    "radius_percentile_upper": "gdr2_raderr1",
    "radius_percentile_lower": "gdr2_raderr2",
    "lum_val": "gdr2_lum",
    "lum_percentile_upper": "gdr2_lumerr1",
    "lum_percentile_lower": "gdr2_lumerr2",
}
dr2.rename(columns=new_cols, inplace=True)

# Delete the designation column from dr2
dr2.drop(columns="designation", inplace=True)

In [6]:
# Saving the processed dr2 data to a csv:
# The file is written into data_dir; the row index is not written out.
processed_csv = path.join(data_dir, "dr2-processed.csv")
dr2.to_csv(processed_csv, index=False)