### Importing libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math
import time
from pathlib import Path

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from scipy import ndimage

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

### Load datasets

In [3]:
'''
Original Dataset: 
https://www.kaggle.com/datasets/keplersmachines/kepler-labelled-time-series-data
''' 
train_dataset_path  = "./exoTrain.csv"
test_dataset_path = "./exoTest.csv"

X_full = pd.read_csv(train_dataset_path, encoding = "ISO-8859-1")
X_test_full = pd.read_csv(test_dataset_path, encoding = "ISO-8859-1")

X_full.head()

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


1. Number of rows = number of stars 
2. Each star has a binary label (represented by column LABEL) of 2 or 1. '2' indicate that that the star is confirmed to have at least one exoplanet in orbit.
3. Remaining columns (FLUX.1 and so on) represent the recorded flux at given time.

In [4]:
# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['LABEL'], inplace=True)
y = X_full.LABEL
X_full.drop(['LABEL'], axis=1, inplace=True)


# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)
X_train_full.head()

Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
195,-0.24,2.32,0.68,-6.14,2.29,1.7,-11.51,-6.86,0.0,-6.71,...,6.51,9.97,6.07,12.36,3.64,14.68,2.44,18.76,10.61,4.84
514,-0.71,-5.74,6.91,-15.88,-3.77,-2.17,-2.75,-2.85,4.11,5.25,...,-3.7,-12.51,-13.14,-7.66,1.84,8.72,18.24,22.17,28.28,33.19
4010,357.43,318.59,313.65,283.61,291.92,278.99,265.67,257.13,234.81,204.4,...,7.97,-6.12,-22.03,-80.83,-87.85,-92.64,87.22,87.86,74.92,42.62
1011,143.28,153.5,127.39,112.78,105.78,72.22,79.76,23.47,29.87,20.41,...,-7.5,11.83,56.41,129.12,174.59,-35.49,-35.49,-0.5,29.55,84.0
3955,-13.12,-17.54,-18.51,-8.13,-16.58,-10.05,-6.82,-19.21,-14.31,-6.18,...,-1.74,-1.8,0.68,11.38,-6.46,-2.53,1.24,3.69,2.38,-4.67


## Preprocess Data

In [5]:
# Preprocessing for numerical data
imputer= SimpleImputer(strategy='median')
imputer.fit(X_train_full)

X_train_full= pd.DataFrame(imputer.transform(X_train_full))
X_valid_full= pd.DataFrame(imputer.transform(X_valid_full))

print("X_train.shape: ", X_train_full.shape)
print("X_valid.shape: ", X_valid_full.shape)


X_train.shape:  (4069, 3197)
X_valid.shape:  (1018, 3197)


## Data Processor

In [10]:
class LightFluxProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(fft(X, n=X.size))

    def process(self, df_train_x, df_valid_x):
        
        # Normalize
        if self.normalize:
            print("Normalizing Dataset")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_valid_x = pd.DataFrame(normalize(df_valid_x))

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.gaussian_filter(df_train_x, sigma=10)
            df_valid_x = ndimage.gaussian_filter(df_valid_x, sigma=10)

        if self.standardize:
            # Standardize X data
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_valid_x = std_scaler.transform(df_valid_x)

        return df_train_x, df_valid_x


In [11]:
# Process dataset
LFP = LightFluxProcessor(
    fourier=False,
    normalize=True,
    gaussian=True,
    standardize=True)
X_train, X_valid = LFP.process(X_train_full, X_valid_full)
print("Size after processing")
print("X_train.shape: ", X_train.shape)
print("X_valid.shape: ", X_valid.shape)

Normalizing Dataset
Applying Gaussian Filter...
Standardizing...
Size after processing
X_train.shape:  (4069, 3197)
X_valid.shape:  (1018, 3197)


## Build Model, Train, and Predict

In [12]:
# Define model
model = RandomForestClassifier(n_estimators=100, random_state=0)


model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

## Calculate and Display Metrics

In [13]:
accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.9921414538310412
Precision: 0.9921414538310412
Recall: 1.0


## Hyperparamter Tuning

We use Scikit-Learn’s RandomizedSearchCV, which will randomly search parameters within a range per hyperparameter.

In [None]:
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier
model = RandomForestClassifier()

# Use random search to find the best hyperparameters
rand_search = RandomizedSearchCV(model, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5)

# Fit the random search object to the data
rand_search.fit(X_train, y_train)

RandomizedSearchCV will train many models (defined by n_iter_ and save each one as variables, the code below creates a variable for the best model and prints the hyperparameters.

In [None]:
# Create a variable for the best model
best_model = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

# Generate predictions with the best model
y_pred = best_model.predict(X_valid)
np.shape(y_pred)

In [None]:
accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)