## Dependencies

In [None]:
# Manipulation
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Examine Data

In [None]:
# Read the exoplanet table from a path relative to the working directory.
exo2 = pd.read_csv('Data/exoplanet_data.csv')

In [None]:
# Show five randomly chosen rows as a quick sanity check of the load.
exo2.sample(n=5)

In [None]:
# (row count, column count) of the loaded table.
exo2.shape

In [None]:
# Column labels available for feature selection further down.
exo2.columns

In [None]:
# Count of rows per `koi_disposition` value.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, while `x=` works on both old and new releases.
ax = sns.countplot(x=exo2['koi_disposition'])

In [None]:
# Everything except the disposition label, used for the histograms below.
values = exo2.drop(columns=['koi_disposition'])

In [None]:
# Histogram of every column in `values`, laid out on a grid eight plots wide.
fig = plt.figure(figsize=(15, 12))

grid_cols = 8
# Keep the original six rows, but grow the grid if there are more columns
# than the 6 x 8 layout can hold (ceiling division).
grid_rows = max(6, -(-len(values.columns) // grid_cols))

# Subplot slots are 1-based while column positions are 0-based. Enumerating
# from 1 plots the first column as well; range(1, n) used to skip it.
for slot, (name, column) in enumerate(values.items(), start=1):
    ax = plt.subplot(grid_rows, grid_cols, slot)
    ax.get_yaxis().set_visible(False)

    # Low-cardinality columns get one bin per distinct value; everything
    # else keeps the 25 bins the plot has always been drawn with.
    distinct = column.nunique()
    bins = distinct if 0 < distinct < 10 else 25

    plt.hist(column, bins=bins, color='#3F5D7D')
    plt.xlabel(name)

plt.tight_layout()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column.
exo2.describe()

In [None]:
# Box plot of every column grouped by disposition, five plots per row.
fig = plt.figure(figsize=(15, 60))

# Pick the plotted columns by name instead of assuming the label sits in
# position 0 of the frame.
plot_columns = [col for col in exo2.columns if col != 'koi_disposition']

for slot, col in enumerate(plot_columns, start=1):
    plt.subplot(20, 5, slot)

    # x/y must be keywords: seaborn >= 0.12 no longer accepts them
    # positionally.
    ax = sns.boxplot(x=exo2['koi_disposition'], y=exo2[col])
    plt.xticks(rotation=45)

plt.tight_layout()

In [None]:
# Print dtypes and non-null counts for each column.
exo2.info()

## Feature Selection

In [None]:
# Columns that are NOT transformed: the disposition label and the four
# koi_fpflag_* columns. They are carried through unchanged as `response`.
skip_columns = [
    'koi_disposition',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
]

# Columns that go through the log / z-score transformation.
keep_columns = [
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration',
    'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol',
    'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg',
    'koi_srad', 'ra', 'dec', 'koi_kepmag',
]

# Split the table column-wise into the two groups.
response = exo2.loc[:, skip_columns]
features = exo2.loc[:, keep_columns]

## Data Transformation

In [None]:
# Log-transform every selected feature, then standardise each resulting
# column to z-scores. Output columns are named 'log_<original name>'.
log_offset = 0.001  # keeps log() finite for columns that contain zeros

trans_data = pd.DataFrame()

for col, data in features.items():

    # Guard in case a response column ever ends up in `features`.
    if col in skip_columns:
        continue

    log_label = 'log_' + str(col)

    if data.min() == 0:
        # Non-negative column with zeros: shift off zero before the log.
        trans_data[log_label] = np.log(data + log_offset)

    elif data.max() == 0:
        # Non-positive column: take the magnitude first, then shift. Adding
        # the offset before abs() would map -0.001 to log(0) = -inf.
        trans_data[log_label] = np.log(np.absolute(data) + log_offset)

    else:
        # NOTE(review): a column containing any negative value and a
        # non-zero maximum yields NaN from np.log here, and zscore's default
        # NaN handling then propagates it to the whole column. Confirm no
        # selected feature is like that.
        trans_data[log_label] = np.log(data)

trans_data = trans_data.apply(zscore)

In [None]:
# Show five random rows of the transformed features.
trans_data.sample(n=5)

In [None]:
# Summary statistics of the transformed columns.
trans_data.describe()

In [None]:
# Histogram of every transformed feature on a 9 x 9 grid.
fig = plt.figure(figsize=(15, 12))

# Subplot slots are 1-based while column positions are 0-based. Enumerating
# from 1 plots the first transformed column too; range(1, n) used to skip it.
for slot, (name, column) in enumerate(trans_data.items(), start=1):
    plt.subplot(9, 9, slot)

    plt.hist(column, bins=50, color='#3F5D7D')
    plt.xlabel(name)

plt.tight_layout()

In [None]:
# Box plot of every transformed feature grouped by disposition, five per row.
fig = plt.figure(figsize=(15, 60))

# Enumerate from slot 1 so the first transformed column is plotted as well;
# range(1, n) used to leave it out.
for slot, (name, column) in enumerate(trans_data.items(), start=1):
    plt.subplot(20, 5, slot)

    # x/y must be keywords: seaborn >= 0.12 no longer accepts them
    # positionally.
    ax = sns.boxplot(x=response['koi_disposition'], y=column)
    plt.xticks(rotation=45)

plt.tight_layout()

In [None]:
# Response columns and transformed features side by side, aligned on index.
export = pd.concat(objs=[response, trans_data], axis='columns')

In [None]:
# Show ten random rows of the combined table.
export.sample(n=10)

In [None]:
#export.to_csv('transformed_features.csv')

## Sparse Matrix

In [None]:
# Split each transformed feature into two equal-width bins labelled
# 'lower' / 'upper', then one-hot encode the resulting categories.
bin_labels = ['lower', 'upper']

binned = pd.DataFrame({
    'qual_' + str(name): pd.cut(column, 2, labels=bin_labels)
    for name, column in trans_data.items()
})

qual = pd.get_dummies(binned)

In [None]:
# Show ten random rows of the one-hot encoded bins.
qual.sample(n=10)

In [None]:
# Response columns and the one-hot encoded bins side by side, aligned on index.
sparse = pd.concat(objs=[response, qual], axis='columns')

In [None]:
# Show ten random rows of the combined indicator table.
sparse.sample(n=10)

In [None]:
#sparse.to_csv('sparse.csv')