In [3]:
# Jupyter magic: run the shared dependencies script inside this kernel.
# The names used below (pd, sns, plt) are not imported in this cell, so they
# are presumably defined by that script -- TODO confirm.
%run ../util/dependencies.py

# Load the raw data as dataframe
file_path = '../data/ks_01_raw.csv'
ks = pd.read_csv(file_path)

# set visual style for plots during exploration
# NOTE(review): plt.style.use() is applied last, so any rcParams that the
# "dark_background" style defines take precedence over the seaborn settings
# made just before it -- confirm the 'rocket' palette and "darkgrid" look
# survive as intended.
sns.set_palette('rocket')
sns.set_style("darkgrid")
plt.style.use("dark_background")

### Kepler Star Target Observations

In [5]:
def _print_section(title):
    """Print a section header: a blank line, then `title` between two tilde rules."""
    rule = '~~~~~~~~~~~~~~~~~'
    print(f'\n{rule}')
    print(title)
    print(rule)


# Report banner. The title used to start with a stray backslash ('\K'), which
# is an invalid escape sequence: Python warns about it and the backslash was
# printed literally in the banner.
print('=====================================')
print('KEPLER STELLAR DB')
print('=====================================')

_print_section('Info')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
ks.info()

_print_section('Head')
print(ks.head())

_print_section('Description')
print(ks.describe())

_print_section('Missing Values')
# The mean of the boolean null-mask is the fraction of missing entries per
# column; scale it to a percentage.
missing_pct = ks.isnull().mean() * 100
for col, per_missing in missing_pct.items():
    print(f'{col}: {per_missing:.2f}% missing values')

_print_section('Value Counts')
# One frequency table per column; repeated kepid values reveal duplicate rows.
for col in ks.columns:
    print(ks[col].value_counts())

  print('\KEPLER STELLAR DB')


\KEPLER STELLAR DB

~~~~~~~~~~~~~~~~~
Info
~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990244 entries, 0 to 990243
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   kepid           990244 non-null  int64  
 1   tm_designation  988042 non-null  object 
 2   teff            990244 non-null  int64  
 3   feh             990244 non-null  float64
 4   radius          990244 non-null  float64
 5   mass            787129 non-null  float64
 6   dens            787129 non-null  float64
 7   nconfp          990244 non-null  int64  
 8   nkoi            990244 non-null  int64  
 9   ntce            990244 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 75.5+ MB
None

~~~~~~~~~~~~~~~~~
Head
~~~~~~~~~~~~~~~~~
      kepid           tm_designation  teff  feh  radius  mass  dens  nconfp  \
0  10002413  2MASS J19103477+4657082  5708 -0.2   0.956   NaN   NaN       0   
1  10002

A lot of duplicates are observed in the Kepler Stellar dataset (see 'Value Counts', above). More troubling, however, is that this particular dataset does not detail the stellar classification of each star, which is unfortunately a key aspect of the analysis I plan on performing. I may, again, need to rely upon approximate classifications based on temperature, for the KR star classification system.

In [6]:
# Drop rows that repeat an earlier row in every column (pandas keeps the first
# occurrence by default), then report how many rows remain.
ks = ks.drop_duplicates()
n_records = len(ks)

print("Duplicates removed. Number of records after deduplication:", n_records)

# Section header: blank line, rule, title, rule.
print('\n~~~~~~~~~~~~~~~~~', 'Value Counts', '~~~~~~~~~~~~~~~~~', sep='\n')

# Frequency table for every column of the deduplicated frame.
for _, column in ks.items():
    print(column.value_counts())

Duplicates removed. Number of records after deduplication: 677750

~~~~~~~~~~~~~~~~~
Value Counts
~~~~~~~~~~~~~~~~~
kepid
4169315    5
6966132    5
6965782    5
6965789    5
9953851    5
          ..
8235024    1
8299231    1
8296111    1
8301649    1
8547781    1
Name: count, Length: 200038, dtype: int64
tm_designation
2MASS J19281409+4355308    5
2MASS J19512531+4937587    5
2MASS J19512874+4936598    5
2MASS J19150907+4938233    5
2MASS J19561019+4358201    5
                          ..
2MASS J19290748+4049156    1
2MASS J19562168+4056217    1
2MASS J19513319+4049095    1
2MASS J19402911+4109069    1
2MASS J19320828+4108090    1
Name: count, Length: 199469, dtype: int64
teff
5780     15473
6343      2260
6063      1545
6108      1462
6167      1340
         ...  
9347         1
3240         1
8716         1
10594        1
7929         1
Name: count, Length: 6493, dtype: int64
feh
-0.200    173237
 0.070     32907
 0.000     26969
-0.100     23037
-0.120     19695
           ...  
-

Reviewing the 'kepid' value counts alone shows that further data cleaning is required: even after exact duplicate rows were dropped, some kepid values still appear up to five times, so the dataset does not yet hold one record per star.

In [7]:
# Print every row for a few repeated kepids to see which of the other
# features differ between the duplicate records.
for sample_kepid in (9995748, 6965789, 6965782):
    same_star = ks['kepid'] == sample_kepid
    print(ks[same_star])

          kepid           tm_designation  teff  feh  radius    mass   dens  \
194432  9995748  2MASS J18550787+4659221  6211 -0.2   1.027     NaN    NaN   
218932  9995748  2MASS J18550787+4659221  6205 -0.2   0.997  1.0527  1.498   
566381  9995748  2MASS J18550787+4659221  6205 -0.2   0.997  1.0530  1.498   
792459  9995748  2MASS J18550787+4659221  6214 -0.2   1.003  1.0490  1.462   

        nconfp  nkoi  ntce  
194432       0     0     0  
218932       0     0     0  
566381       0     0     0  
792459       0     0     0  
          kepid           tm_designation  teff   feh  radius    mass     dens  \
121933  6965789  2MASS J19454130+4229343  7880 -0.20   5.325     NaN      NaN   
303875  6965789  2MASS J19454130+4229343  7874 -2.00   4.573  1.7893  0.02636   
499358  6965789  2MASS J19454130+4229343  7874 -2.00   4.573  1.7890  0.02636   
718809  6965789  2MASS J19454130+4229343  7897 -1.96   4.266  1.7270  0.03134   
872175  6965789  2MASS J19454130+4229343  7897 -1.96   4.26

The process for addressing these duplicates should be as follows:
1. Keep top kepid (should be uniform)
2. Keep top tm_designation (should be uniform)
3. Average (as mean) all of the following features (adjusting for outliers, and not including NaN):
    - teff
    - feh
    - radius
    - mass
    - dens
4. Select the highest value of nconfp, nkoi, and ntce

In [None]:
def merge_duplicate_stars(stars):
    """Collapse all rows that share a kepid into a single record per star.

    Within each kepid group:
      * tm_designation        -> the most frequent non-null value
                                 (NaN when every value in the group is null)
      * teff, feh, radius,
        mass, dens            -> mean of the non-null values
      * nconfp, nkoi, ntce    -> maximum value

    Parameters
    ----------
    stars : pandas.DataFrame
        Frame containing the columns kepid, tm_designation, teff, feh,
        radius, mass, dens, nconfp, nkoi and ntce.

    Returns
    -------
    pandas.DataFrame
        One row per kepid, in order of first appearance, with the ten columns
        listed above. Averaged columns come back as floats, so an integer
        input column such as teff becomes float.

    Notes
    -----
    TODO: the averages are plain means; no outlier adjustment is applied yet.
    """
    mean_cols = ['teff', 'feh', 'radius', 'mass', 'dens']
    max_cols = ['nconfp', 'nkoi', 'ntce']

    aggregations = {col: 'mean' for col in mean_cols}
    # Each count column takes the max of its own values (the draft derived
    # nconfp from the teff list by mistake).
    aggregations.update({col: 'max' for col in max_cols})

    # groupby mean/max skip NaN, so missing mass/dens values do not poison the
    # result; a group with no valid value at all stays NaN.
    merged = stars.groupby('kepid', sort=False).agg(aggregations)

    # Count each (kepid, tm_designation) pair, then keep the most frequent
    # pair per kepid. groupby drops NaN keys by default, so null designations
    # are never counted.
    designation = (
        stars.groupby(['kepid', 'tm_designation'], sort=False)
        .size()
        .reset_index(name='n_rows')
        .sort_values('n_rows', ascending=False, kind='stable')
        .drop_duplicates(subset='kepid')
        .set_index('kepid')['tm_designation']
    )

    # Left join on the kepid index: stars without any designation get NaN.
    merged = merged.join(designation).reset_index()
    return merged[['kepid', 'tm_designation', *mean_cols, *max_cols]]


# Build the one-record-per-star frame under a new name so `ks` stays available
# for comparison and the cell can be re-run safely.
ks_merged = merge_duplicate_stars(ks)

# Sanity check: every star must now appear exactly once.
assert ks_merged['kepid'].is_unique
print("Records after merging duplicate kepids:", len(ks_merged))