# Exoplanet Convolutional Neural Network (CNN) Classifier
Here we will use a CNN to classify if a star observed by Kepler contains a exoplanet 

In [1]:
import tensorflow.keras.models
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
import pandas as pd
import matplotlib.pyplot as plt
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from keras.layers import Embedding
# from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


First lets load the data and preprocess it.

In [9]:
def load_data(path):
    data = pd.read_csv(path)

    # Convert columns to something more useful.
    for i, column in enumerate(data.columns):
        if 'FLUX' in column:
            data.columns.values[i] = int(column.split('.')[1])
    # Also change labels so 0 is non-exoplanet and 1 is exoplanet
    data.LABEL -= 1
    return data

def normalize_data(df):
    """
    Normalize a Kepler light curve DatFrame with a mean subtraction and 
    """
    # Calculate the mean of each light curve and subtract it
    df_norm = df.copy()
    mean = df_norm.iloc[:, 1:].mean(axis=1)
    df_norm.iloc[:, 1:] = df_norm.iloc[:, 1:].subtract(mean, axis=0)
    # Calculate the standard deviation of each light curve and divide.
    std = df_norm.iloc[:, 1:].std(axis=1)
    df_norm.iloc[:, 1:] = df_norm.iloc[:, 1:].divide(std, axis=0)
    return df_norm


train_data = load_data('./data/exoTrain.csv')
train_data = normalize_data(train_data)

test_data = load_data('./data/exoTest.csv')
test_data = normalize_data(test_data)

In [10]:
train_data.head()

Unnamed: 0,LABEL,1,2,3,4,5,6,7,8,9,...,3188,3189,3190,3191,3192,3193,3194,3195,3196,3197
0,1,0.521974,0.459509,0.063126,-0.22979,-0.308059,-0.837834,-0.902975,-0.66089,-0.558979,...,-0.547656,-0.697473,-0.697473,0.094421,0.240257,0.513824,0.182707,0.320205,-0.030324,-0.307935
1,1,-0.524105,-0.439954,-0.851711,-0.544268,-1.197814,-1.089501,-1.318459,-1.298129,-1.275467,...,0.06912,-0.412959,-0.412959,-0.290981,0.042791,0.13644,-0.071188,0.231423,0.390394,0.455882
2,1,1.86057,1.872206,1.793484,1.733848,1.590274,1.624154,1.618833,1.696491,1.519711,...,-0.283383,0.018167,0.018167,-0.135091,-0.103127,-0.011101,-0.070914,-0.131615,-0.277459,-0.372004
3,1,3.207775,3.409928,2.973657,2.932781,3.122729,3.07391,3.167189,3.060446,3.071198,...,0.100317,0.008878,0.008878,0.336081,0.239025,-0.077717,-0.039941,-0.122661,-0.123049,0.180422
4,1,-2.684628,-2.697843,-2.713465,-2.654883,-2.56265,-2.505984,-2.417214,-2.477073,-2.395673,...,-1.424955,-0.951607,-0.951607,-0.842499,-1.055016,-1.042194,-0.946817,-0.909826,-0.976489,-1.219045


In [11]:
test_data.head()

Unnamed: 0,LABEL,1,2,3,4,5,6,7,8,9,...,3188,3189,3190,3191,3192,3193,3194,3195,3196,3197
0,1,3.996087,3.336384,2.875229,1.608144,1.522286,1.296571,0.5983,0.209588,0.19785,...,0.462469,0.622447,0.459785,-0.078843,0.422558,1.501492,1.046374,1.175497,9.011772,1.911332
1,1,3.429247,3.406992,3.417435,3.402585,3.385016,3.365156,3.362257,3.327655,3.316087,...,-0.411742,-0.656225,-0.806254,-1.030908,-1.167125,-1.310978,-1.435481,-1.496398,-1.453037,-1.294899
2,1,7.264587,7.034055,6.629022,5.817077,5.223535,4.317549,3.81634,3.151648,1.821924,...,0.203756,-0.3897,-0.360916,-0.46085,-0.6497,-1.439267,-0.761849,-1.337966,-0.262689,0.313171
3,1,-1.936512,-1.939833,-1.987513,-1.961937,-1.73246,-1.831799,-1.848352,-1.734995,-1.641283,...,0.46735,0.393055,0.393055,0.331662,0.182108,0.209839,0.208571,-0.148989,-0.495625,-0.388782
4,1,-0.089813,0.043997,0.081954,0.097712,0.042585,-0.00215,0.128609,-0.121613,0.105225,...,-0.08021,-0.215658,-0.01966,0.01406,-0.056996,-0.348337,-0.326422,-0.216392,-0.257964,0.032926


To ballance the data set we create a batch function to return equal number of exoplanet and non-exoplanet light curves. There will be duplicate light curves.

In [37]:
def light_curve_batch(train_df, batch_size=32):
    """
    Returns equal number of random exoplanets and non-exoplanet detections.
    """
    exo_df = train_df[train_df.LABEL == 1].sample(n=batch_size//2, 
        random_state=123, replace=True)
    nonexo_df = train_df[train_df.LABEL == 0].sample(n=batch_size//2,
        random_state=123, replace=True)
    df = pd.concat([exo_df, nonexo_df])
    df = df.sample(frac=1).reset_index(drop=True)
    light_curves = df.iloc[:, 1:]
    labels = df.iloc[:, 0]
    return [light_curves.values], labels.values

In [38]:
model1 = tensorflow.keras.models.Sequential()
model1.add(Conv1D(filters=8, kernel_size=11, activation='relu', input_shape=(train_data.shape[1], 1)))
model1.add(MaxPooling1D(strides=4))
model1.add(Flatten())
model1.add(Dense(64, activation='relu'))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

In [39]:
model1.compile(optimizer=Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])

hist = model1.fit(light_curve_batch(train_data, 32), 
                           validation_data=(test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values), 
                           verbose=0, epochs=5,
                           steps_per_epoch=train_data.shape[1]//32)

ValueError: Error when checking input: expected conv1d_8_input to have shape (3198, 1) but got array with shape (32, 3197)

In [30]:
train_data.shape

(5087, 3198)