In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [160]:
# Load the cleaned table; row 0 of the CSV supplies the column names.
data = pd.read_csv(
    'data/cleaned_5272.csv',
    header=0,
)

In [161]:
# Variables singled out as important (only defined here; not used by this cell).
important_vars = [
    'pl_orbper',
    'pl_orbeccen',
    'pl_orbsmax',
    'pl_rade',
    'pl_bmasse',
]

# Variables that are thrown away before any further processing.
thrown_vars = [
    'pl_name',
    'hostname',
    'discoverymethod',
    'disc_year',
    'sy_dist',
]

# Remove the thrown columns from `data` in place.
data.drop(columns=thrown_vars, inplace=True)

In [162]:
# Show the distinct raw spectral-type strings before they are simplified.
spectral_type_column = data['st_spectype']
spectral_type_column.unique()

array(['G8 III', nan, 'K0 III', 'K3 III', 'G6 III', 'K7 V', 'G3 IV',
       'M V', 'M5.5/M6', 'M3.5 V', 'M6', 'M8.5', 'M8', 'K1 III',
       'K1.5 III', 'F0 IV', 'G5 V', 'G3 III', 'G5 III', 'K2 V', 'K0',
       'K2 II', 'K5 III', 'K1', 'K3', 'G0', 'K7/M0 V', 'K3 V', 'K5 V',
       'M5.0 V', 'M9-L1', 'M3.5', 'K7', 'M3 V', 'G0 V', 'K1 V', 'F6 V',
       'F9 V', 'G2 V', 'G9 V', 'F8 IV', 'G3 V', 'F9', 'G5', 'G2',
       'G8/9 IV', 'K0 V', 'F3 V', 'G2 IV', 'G0 VI', 'L1.5', 'M0.5',
       'F8 V', 'F5 V', 'G6 V', 'G8 IV/V', 'M7.25', 'M4.0 V', 'M2.5 V',
       'M5.5 V', 'M 4.5V', 'M4 V', 'M4.5 V', 'K4.5', 'M1', 'M0 V', 'M2 V',
       'M3.0 V', 'M0.5 V', 'M0', 'M2.5', 'M4', 'M1.0 V', 'M2.0 V', 'M4.0',
       'M3.5 Ve', 'M0.5-1.0 V', 'M1 V', 'M2', 'M1.5', 'M1.5 V', 'F2',
       'K7e V', 'M3', 'K4', 'F8', 'K2', 'G3', 'K', 'G V', 'K V', 'G', 'F',
       'B9 Vne', 'G9 III', 'G4 V', 'K2 III', 'K9 V', 'G4 IV', 'F5',
       'G1 V', 'K0 IV-V', 'F8 IV/V', 'K1 IV', 'G3 IV-V', 'K3.5 V', 'G8 V',
       'F7

In [163]:
# The O-B-A-F-G-K-M spectral sequence, reversed so that 'M' comes first
# (position 0) and 'O' comes last.
order = list(reversed(['O', 'B', 'A', 'F', 'G', 'K', 'M']))

In [164]:
# Reduce every spectral type to its first character (the spectral class letter).
# Missing values are NOT filled here: they stay NaN after this step.
first_letter = data['st_spectype'].str.get(0)
data['st_spectype'] = first_letter

In [165]:
# Lookup table from each spectral class letter to its position in `order`.
mapping = dict(zip(order, range(len(order))))

In [166]:
# Replace each class letter by its numeric code from `mapping`; entries whose
# value is not a key of `mapping` (including missing ones) become NaN.
spectral_codes = data['st_spectype'].map(mapping)
data['st_spectype'] = spectral_codes

In [167]:
# encode categorical variables
def _encode_keep_missing(column, encoder):
    """Fit `encoder` on the non-missing entries of `column` and return float codes.

    Missing entries stay NaN in the result instead of being handed to the
    encoder. Passing them through would give "missing" a class code of its
    own, so the KNN imputation later in the notebook would never see (and
    never fill) those entries.

    Parameters
    ----------
    column : pandas.Series
        Categorical values, possibly containing missing entries.
    encoder : LabelEncoder
        Encoder that is fitted in place on the observed values only.

    Returns
    -------
    pandas.Series
        Float series on the index of `column` holding the integer class
        codes, with NaN wherever `column` was missing.
    """
    observed = column.notna()
    codes = pd.Series(np.nan, index=column.index, dtype=float)
    codes.loc[observed] = encoder.fit_transform(column.loc[observed])
    return codes


# Encoders are kept at notebook level so the codes can be decoded again later.
met_le = LabelEncoder()
data['st_metratio'] = _encode_keep_missing(data['st_metratio'], met_le)

type_le = LabelEncoder()
data['planet_type'] = _encode_keep_missing(data['planet_type'], type_le)

In [168]:
# impute missing values
from sklearn.impute import KNNImputer

# Each missing entry is filled from its 2 nearest rows.
# NOTE(review): no feature scaling is applied before imputation in this
# notebook, so columns with large numeric ranges presumably dominate the
# neighbour distances — confirm this is intended.
N_NEIGHBORS = 2
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)

In [169]:
# First imputation pass: fill gaps in the orbital feature group, using only
# these columns to find neighbours. The shared imputer is refitted on this subset.
orbital_features = ['pl_orbper', 'pl_orbsmax', 'st_mass']
orbital_filled = imputer.fit_transform(data[orbital_features])
data[orbital_features] = orbital_filled

In [170]:
# Second imputation pass: fill gaps in the planet feature group, using only
# these columns to find neighbours. The shared imputer is refitted on this subset.
planet_features = ['pl_rade', 'pl_bmasse', 'planet_type']
planet_filled = imputer.fit_transform(data[planet_features])
data[planet_features] = planet_filled

In [171]:
# Third imputation pass: fill gaps in the stellar feature group, using only
# these columns to find neighbours. The shared imputer is refitted on this subset.
stellar_features = [
    'st_rad',
    'st_teff',
    'st_mass',
    'st_spectype',
    'st_logg',
    'st_metratio',
]
stellar_filled = imputer.fit_transform(data[stellar_features])
data[stellar_features] = stellar_filled

In [172]:
# Final pass over every column at once, to fill whatever is still missing
# after the group-wise passes.
all_columns = data.columns
data[all_columns] = imputer.fit_transform(data[all_columns])

In [173]:
# Compute the standard deviation (not the variance) of the Kepler ratio
# pl_orbper^2 / pl_orbsmax^3 * st_mass across all rows.
orbper_squared = data['pl_orbper']**2
orbsmax_cubed = data['pl_orbsmax']**3
kepler_ratio = orbper_squared / orbsmax_cubed * data['st_mass']
kepler_ratio.std()

245818.75407164308

In [174]:
# Turn the numeric planet-type codes back into their original labels.
# The column is cast to int first; the cast truncates any fractional part.
planet_type_codes = data['planet_type'].astype(int)
data['planet_type'] = type_le.inverse_transform(planet_type_codes)

In [175]:
# Write the imputed `data` frame to CSV, leaving out the index column.
data.to_csv(
    'data/imputed_5272.csv',
    index=False,
)