# Filter KOI Data

In [9]:
from pathlib import Path

import pandas as pd
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [10]:
# Read the raw comma-separated KOI table and preview its first three rows.
raw_df = pd.read_csv(
    '../data/raw/KOI (KEPLER)/kepler_object_of_interest.csv',
    sep=',',
)
raw_df.head(n=3)

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,9.488036,2.8e-05,-2.8e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65,35.8,1.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62,25.8,2.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,19.89914,1.5e-05,-1.5e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,,,39.3,31.04,-10.49,76.3,1.0,q1_q17_dr25_tce,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436


In [11]:
# Restrict the table to the columns considered relevant to the model.
# Columns judged not to add new information are left out, e.g. the
# *_err1 / *_err2 uncertainty columns and (per the original author's note)
# information about the object's rotation speed.
useful_columns = [
    'koi_disposition',
    'koi_period',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_insol',
    'koi_teq',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
]

# Take an independent copy so later changes to `df` never touch `raw_df`.
df = raw_df.loc[:, useful_columns].copy()

df.head(5)

Unnamed: 0,koi_disposition,koi_period,koi_duration,koi_depth,koi_prad,koi_insol,koi_teq,koi_steff,koi_slogg,koi_srad
0,CONFIRMED,9.488036,2.9575,615.8,2.26,93.59,793.0,5455.0,4.467,0.927
1,CONFIRMED,54.418383,4.507,874.8,2.83,9.11,443.0,5455.0,4.467,0.927
2,CANDIDATE,19.89914,1.7822,10829.0,14.6,39.3,638.0,5853.0,4.544,0.868
3,FALSE POSITIVE,1.736952,2.40641,8079.2,33.46,891.96,1395.0,5805.0,4.564,0.791
4,CONFIRMED,2.525592,1.6545,603.3,2.75,926.16,1406.0,6031.0,4.438,1.046


In [12]:
# Report the (rows, columns) shape and the column labels of the filtered frame.
summary_parts = ('Size: ', df.shape, '\nColumns:', df.columns)
print(*summary_parts)

Size:  (9564, 10) 
Columns: Index(['koi_disposition', 'koi_period', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_insol', 'koi_teq', 'koi_steff', 'koi_slogg',
       'koi_srad'],
      dtype='object')


In [13]:
# Mapping from the original KOI column identifiers to the upper-case names
# used from this point on (the new names appear to carry the unit as a suffix,
# e.g. _DAYS, _HOURS, _K).
new_columns_name = {
    'koi_disposition': 'STATUS',
    'koi_period': 'ORBITAL_PERIOD_DAYS',
    'koi_duration': 'TRANSIT_DURATION_HOURS',
    'koi_depth': 'TRANSIT_DEPTH_PPM',
    'koi_prad': 'PLANET_RADIUS_REARTH',
    'koi_insol': 'PLANET_INSOLATION_EFLUX',
    'koi_teq': 'PLANET_EQ_TEMP_K',
    'koi_steff': 'STELLAR_TEFF_K',
    'koi_slogg': 'STELLAR_LOGG_CMS2',
    'koi_srad': 'STELLAR_RADIUS_RSUN',
}

# rename() returns a new frame; rebind `df` to it and display the result.
df = df.rename(mapper=new_columns_name, axis='columns')
df

Unnamed: 0,STATUS,ORBITAL_PERIOD_DAYS,TRANSIT_DURATION_HOURS,TRANSIT_DEPTH_PPM,PLANET_RADIUS_REARTH,PLANET_INSOLATION_EFLUX,PLANET_EQ_TEMP_K,STELLAR_TEFF_K,STELLAR_LOGG_CMS2,STELLAR_RADIUS_RSUN
0,CONFIRMED,9.488036,2.95750,615.8,2.26,93.59,793.0,5455.0,4.467,0.927
1,CONFIRMED,54.418383,4.50700,874.8,2.83,9.11,443.0,5455.0,4.467,0.927
2,CANDIDATE,19.899140,1.78220,10829.0,14.60,39.30,638.0,5853.0,4.544,0.868
3,FALSE POSITIVE,1.736952,2.40641,8079.2,33.46,891.96,1395.0,5805.0,4.564,0.791
4,CONFIRMED,2.525592,1.65450,603.3,2.75,926.16,1406.0,6031.0,4.438,1.046
...,...,...,...,...,...,...,...,...,...,...
9559,FALSE POSITIVE,0.527699,3.22210,1579.2,29.35,4500.53,2088.0,5638.0,4.529,0.903
9560,CANDIDATE,1.739849,3.11400,48.5,0.72,1585.81,1608.0,6119.0,4.444,1.031
9561,FALSE POSITIVE,0.681402,0.86500,103.6,1.07,5713.41,2218.0,6173.0,4.447,1.041
9562,CANDIDATE,333.486169,3.19900,639.1,19.30,22.68,557.0,4989.0,2.992,7.824


In [14]:
# One-column table listing the dtype of every column in `df`.
df.dtypes.to_frame(name='dtype')

Unnamed: 0,dtype
STATUS,object
ORBITAL_PERIOD_DAYS,float64
TRANSIT_DURATION_HOURS,float64
TRANSIT_DEPTH_PPM,float64
PLANET_RADIUS_REARTH,float64
PLANET_INSOLATION_EFLUX,float64
PLANET_EQ_TEMP_K,float64
STELLAR_TEFF_K,float64
STELLAR_LOGG_CMS2,float64
STELLAR_RADIUS_RSUN,float64


In [15]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum(axis=0)

STATUS                       0
ORBITAL_PERIOD_DAYS          0
TRANSIT_DURATION_HOURS       0
TRANSIT_DEPTH_PPM          363
PLANET_RADIUS_REARTH       363
PLANET_INSOLATION_EFLUX    321
PLANET_EQ_TEMP_K           363
STELLAR_TEFF_K             363
STELLAR_LOGG_CMS2          363
STELLAR_RADIUS_RSUN        363
dtype: int64

In [16]:
# Write the filtered table to the processed-data folder as a ';'-separated CSV
# without the index column. The parent directory is created first so that a
# fresh checkout (where ../data/processed may not exist yet) does not make
# to_csv fail when the notebook is run top to bottom.
output_path = Path('../data/processed/1_filtered_koi_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, sep=';')