In [30]:
import warnings
# Install a blanket "ignore" filter so no warnings are shown from here on.
# NOTE(review): this also hides deprecation and solver-convergence warnings
# raised by the libraries used below — confirm that is intended.
warnings.simplefilter('ignore')
import pandas as pd

In [31]:
# Load the exoplanet table from its relative location in the data directory.
kepler_df = pd.read_csv(filepath_or_buffer='../../data/exoplanet_data.csv')
# Preview the first five rows as a quick check that the load worked.
kepler_df.head(n=5)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [32]:
# Print each column's name, non-null count and dtype to spot missing values
# and non-numeric columns before modelling.
kepler_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

In [33]:
# Collect the distinct disposition labels once so that the reported count is
# derived from the data instead of being hard-coded in the message.
dispositions = set(kepler_df['koi_disposition'])

print(f"The dataframe has a length of {len(kepler_df.columns)} columns.")
print(f"There are {len(dispositions)} outcomes/predictions/targets: {dispositions}")

The dataframe has a length of 41 columns.
There are 3 outcomes/predictions/targets: {'CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE'}


In [34]:
# Encode the three disposition labels as integer class ids, overwriting the
# original text column in place.
# NOTE: the ids are arbitrary (the labels are nominal, not ordered), so any
# linear correlation computed against this column depends on this ordering.
kepler_df['koi_disposition'] = (
    kepler_df['koi_disposition']
    .replace({'CONFIRMED': 0, 'FALSE POSITIVE': 1, 'CANDIDATE': 2})
    # Cast explicitly instead of relying on replace() to downcast the object
    # column: that implicit downcast is deprecated in recent pandas, and an
    # object-typed target would not work for corr() or scikit-learn below.
    # The cast also fails loudly if an unmapped label is still present.
    .astype('int64')
)

In [35]:
# Pairwise Pearson correlation between all columns of the frame.
correlation = kepler_df.corr(method='pearson')
# Show only the column relating every variable to the encoded disposition.
correlation.loc[:, 'koi_disposition']

koi_disposition      1.000000
koi_fpflag_nt        0.000416
koi_fpflag_ss        0.013503
koi_fpflag_co        0.008531
koi_fpflag_ec        0.008041
koi_period           0.124647
koi_period_err1      0.099048
koi_period_err2     -0.099048
koi_time0bk          0.070445
koi_time0bk_err1     0.147719
koi_time0bk_err2    -0.147719
koi_impact           0.010607
koi_impact_err1      0.058572
koi_impact_err2     -0.013980
koi_duration         0.029554
koi_duration_err1    0.156587
koi_duration_err2   -0.156587
koi_depth            0.008694
koi_depth_err1       0.001797
koi_depth_err2      -0.001797
koi_prad             0.001485
koi_prad_err1        0.003135
koi_prad_err2       -0.000998
koi_teq              0.021275
koi_insol            0.012070
koi_insol_err1       0.014604
koi_insol_err2      -0.014159
koi_model_snr       -0.016351
koi_tce_plnt_num    -0.095550
koi_steff            0.071048
koi_steff_err1       0.173227
koi_steff_err2      -0.148902
koi_slogg           -0.071437
koi_slogg_

In [36]:
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as figfac
from plotly.offline import iplot

# Restrict the heatmap to the first ten rows and columns of the correlation matrix.
corr_subset = correlation.iloc[:10, :10]

heatmap_fig = figfac.create_annotated_heatmap(
    corr_subset.round(3).values,
    x=corr_subset.columns.to_list(),
    y=corr_subset.index.to_list(),
    # Cell labels keep one more decimal place than the values driving the colours.
    annotation_text=corr_subset.round(4).values,
    colorscale='bluered',
)
iplot(heatmap_fig)

In [37]:
# Target: the encoded disposition column. Features: every remaining column.
target = kepler_df['koi_disposition']
features = kepler_df.drop(columns='koi_disposition')

# Report the dimensions of both pieces, features first.
for part in (features, target):
    print(part.shape)

(6991, 40)
(6991,)


In [38]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with all default settings; it is fitted directly below
# and also handed to RFE as the ranking estimator.
# NOTE(review): the features are not scaled in the visible cells and warnings
# are silenced at the top of the notebook, so a ConvergenceWarning (if any)
# would be hidden — confirm the default solver settings actually converge.
logreg = LogisticRegression()


In [39]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [40]:
# Fit the logistic regression on the training split only; as the cell's last
# expression, the fitted estimator's repr is displayed.
logreg.fit(X_train, y_train)

LogisticRegression()

In [41]:
from sklearn.feature_selection import RFE
# Recursive feature elimination driven by the logistic regression: remove one
# feature per iteration until ten remain, using the training split only.
rfe = RFE(estimator=logreg, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)
# fit() returns the selector itself, so this name refers to the same fitted
# object as `rfe`.
features_selected = rfe

In [42]:
# Boolean mask over the feature columns marking the ones RFE kept.
selected_mask = features_selected.support_
# Keep all rows but only the selected columns.
relevant_features = features.loc[:, selected_mask]
relevant_features.head()

Unnamed: 0,koi_period,koi_time0bk,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_model_snr,koi_steff_err1,ra
0,54.418383,162.51384,35.5,-35.5,2.83,0.32,-0.19,25.8,81,291.93423
1,19.89914,175.850252,171.0,-171.0,14.6,3.92,-1.31,76.3,158,297.00482
2,1.736952,170.307565,12.8,-12.8,33.46,8.5,-2.83,505.6,157,285.53461
3,2.525592,171.59555,16.9,-16.9,2.75,0.88,-0.35,40.9,169,288.75488
4,4.134435,172.97937,18.7,-18.7,2.77,0.9,-0.3,40.2,189,296.28613


In [50]:
# Put the target next to the selected features by inner-joining on the shared
# row index; the target becomes the first column.
target_frame = target.to_frame()
relevant_corr_df = target_frame.join(relevant_features, how='inner')
relevant_corr_df.head()

Unnamed: 0,koi_disposition,koi_period,koi_time0bk,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_model_snr,koi_steff_err1,ra
0,0,54.418383,162.51384,35.5,-35.5,2.83,0.32,-0.19,25.8,81,291.93423
1,1,19.89914,175.850252,171.0,-171.0,14.6,3.92,-1.31,76.3,158,297.00482
2,1,1.736952,170.307565,12.8,-12.8,33.46,8.5,-2.83,505.6,157,285.53461
3,0,2.525592,171.59555,16.9,-16.9,2.75,0.88,-0.35,40.9,169,288.75488
4,0,4.134435,172.97937,18.7,-18.7,2.77,0.9,-0.3,40.2,189,296.28613


In [51]:
# Check that the joined frame kept every row and that all columns are numeric
# before computing correlations.
relevant_corr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6991 entries, 0 to 6990
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   koi_disposition  6991 non-null   int64  
 1   koi_period       6991 non-null   float64
 2   koi_time0bk      6991 non-null   float64
 3   koi_depth_err1   6991 non-null   float64
 4   koi_depth_err2   6991 non-null   float64
 5   koi_prad         6991 non-null   float64
 6   koi_prad_err1    6991 non-null   float64
 7   koi_prad_err2    6991 non-null   float64
 8   koi_model_snr    6991 non-null   float64
 9   koi_steff_err1   6991 non-null   int64  
 10  ra               6991 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 600.9 KB


In [52]:
# Pearson correlation matrix of the target together with the selected features.
rel_correlation = relevant_corr_df.corr(method='pearson')
# Show how each selected feature correlates with the encoded disposition.
rel_correlation.loc[:, 'koi_disposition']

koi_disposition    1.000000
koi_period         0.124647
koi_time0bk        0.070445
koi_depth_err1     0.001797
koi_depth_err2    -0.001797
koi_prad           0.001485
koi_prad_err1      0.003135
koi_prad_err2     -0.000998
koi_model_snr     -0.016351
koi_steff_err1     0.173227
ra                 0.063848
Name: koi_disposition, dtype: float64

In [54]:
# Annotated heatmap of the full target + selected-feature correlation matrix.
rel_heatmap_fig = figfac.create_annotated_heatmap(
    rel_correlation.round(3).values,
    x=rel_correlation.columns.to_list(),
    y=rel_correlation.index.to_list(),
    # Cell labels keep one more decimal place than the values driving the colours.
    annotation_text=rel_correlation.round(4).values,
    colorscale='ylgnbu',
)
iplot(rel_heatmap_fig)