In [1]:
# Jupyter magic
%run ../util/dependencies.py

# Load the raw data as dataframe
file_path = '../data/ks_01_raw.csv'
ks = pd.read_csv(file_path)

# set visual style for plots during exploration
sns.set_palette('rocket')
sns.set_style("darkgrid")
plt.style.use("dark_background")

### Kepler Star Target Observations

In [None]:
def _print_section(title):
    """Print ``title`` framed by tilde rules, preceded by a blank line."""
    rule = '~~~~~~~~~~~~~~~~~'
    print(f'\n{rule}')
    print(title)
    print(rule)


# Top-level banner. The title literal used to start with a stray backslash
# ('\K' is not a valid escape sequence), which printed a literal '\' and
# raises a SyntaxWarning on recent Python versions.
print('=====================================')
print('KEPLER STELLAR DB')
print('=====================================')


_print_section('Info')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a spurious "None" line).
ks.info()

_print_section('Head')
print(ks.head())

_print_section('Description')
print(ks.describe())

_print_section('Missing Values')
# Share of null entries per column, as a percentage of all rows.
for col in ks.columns:
    per_missing = ks[col].isnull().sum() / len(ks) * 100
    print(f'{col}: {per_missing:.2f}% missing values')

_print_section('Value Counts')
# Frequency table of every column; repeated values here reveal duplicates.
for col in ks:
    print(ks[col].value_counts())

A lot of duplicates are observed in the Kepler Stellar dataset (see 'Value Counts', above). More troubling, however, is that this particular dataset does not detail the stellar classification of each star, which is unfortunately a key aspect of the analysis I plan on performing. I may, again, need to rely upon approximate classifications based on temperature, for the KR star classification system.

In [None]:
# Discard rows that are exact duplicates of an earlier row (all columns equal).
ks = ks.drop_duplicates()
remaining_records = len(ks)

print("Duplicates removed. Number of records after deduplication:", remaining_records)
print('\n~~~~~~~~~~~~~~~~~')
print('Value Counts')
print('~~~~~~~~~~~~~~~~~')
# Re-print the per-column frequency tables for the deduplicated frame.
for column_name, column_values in ks.items():
    print(column_values.value_counts())

Reviewing the 'kepid' feature alone shows that further data cleaning is required: the same kepid still appears in more than one row, so these duplicate entries must be resolved to ensure that we are working with a clean dataset.

In [None]:
# Show every row for a few kepids that have duplicate records, to see how
# those records differ in their other features.
sample_kepids = (9995748, 6965789, 6965782)
for kepid in sample_kepids:
    print(ks[ks['kepid'] == kepid])

The process for addressing these duplicates should be as follows:
1. Keep top kepid (should be uniform)
2. Keep top tm_designation (should be uniform)
3. Average (as mean) all of the following features (adjusting for outliers, and not including NaN):
    - teff
    - feh
    - radius
    - mass
    - dens
4. Select the highest value of each of nconfp, nkoi, and ntce