### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math
import time
from pathlib import Path

from scipy.stats import randint
from scipy import ndimage

from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,RandomizedSearchCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

### Load datasets

In [2]:
'''
Original Dataset: 
https://www.kaggle.com/datasets/keplersmachines/kepler-labelled-time-series-data
''' 
train_dataset_path  = "./exoTrain.csv"
test_dataset_path = "./exoTest.csv"

X_train = pd.read_csv(train_dataset_path, encoding = "ISO-8859-1")
X_test = pd.read_csv(test_dataset_path, encoding = "ISO-8859-1")



# Remove rows with missing target, separate target from predictors
X_train.dropna(axis=0, subset=['LABEL'], inplace=True)
y_train = X_train.LABEL
y_test = X_test.LABEL

X_train.drop(['LABEL'], axis=1, inplace=True)
X_test.drop(['LABEL'], axis=1, inplace=True)

# Store original column names
original_columns = X_train.columns

print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)

X_train.head()


y_train.shape:  (5087,)
y_test.shape:  (570,)


Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,-160.17,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,-73.38,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,484.39,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,323.33,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,-970.88,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


1. Number of rows = number of stars 
2. Each star has a binary label (represented by column LABEL) of 2 or 1. '2' indicate that that the star is confirmed to have at least one exoplanet in orbit.
3. Remaining columns (FLUX.1 and so on) represent the recorded flux at given time.

## Preprocess Data

In [19]:
def preprocess_data(X_train,X_test,original_columns):
    
    # Preprocessing for numerical data
    imputer= SimpleImputer(strategy='median')
    imputer.fit(X_train)

    X_train_imputed= pd.DataFrame(imputer.transform(X_train))
    X_test_imputed = pd.DataFrame(imputer.transform(X_test))

    # Assign original column names back to the DataFrames
    X_train_imputed.columns = original_columns
    X_test_imputed.columns = original_columns

    print("X_train shape after imputation: ", X_train_imputed.shape)
    print("X_test.shape after imputation: ", X_test_imputed.shape)

    return X_train_imputed,X_test_imputed

In [4]:
X_train,X_test= preprocess_data(X_train,X_test,original_columns)

X_train shape after imputation:  (5087, 3197)
X_test.shape imputation:  (570, 3197)


## Data Processor

In [5]:
class ExoPlanetProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(fft(X, n=X.size))

    def process(self, df_train_x, df_valid_x):
        
        # Normalize
        if self.normalize:
            print("Normalizing Dataset")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_valid_x = pd.DataFrame(normalize(df_valid_x))

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.gaussian_filter(df_train_x, sigma=10)
            df_valid_x = ndimage.gaussian_filter(df_valid_x, sigma=10)

        if self.standardize:
            # Standardize X data
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_valid_x = std_scaler.transform(df_valid_x)

        df_train_x= pd.DataFrame(df_train_x)
        df_valid_x= pd.DataFrame(df_valid_x)
        return df_train_x, df_valid_x


In [6]:
# Process dataset
EPP = ExoPlanetProcessor(
    fourier=False,
    normalize=True,
    gaussian=True,
    standardize=True)

X_train, X_test = EPP.process(X_train, X_test)

# Assign original column names back to the DataFrames
X_train.columns = original_columns
X_test.columns = original_columns


print("Size after processing")
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)

X_train.head()

Normalizing Dataset
Applying Gaussian Filter...
Standardizing...
Size after processing
X_train.shape:  (5087, 3197)
X_test.shape:  (570, 3197)


Unnamed: 0,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,FLUX.10,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,0.156857,0.157092,0.157533,0.158134,0.158812,0.15946,0.159918,0.160006,0.159474,0.158029,...,-0.804519,-0.753474,-0.706362,-0.663767,-0.626225,-0.594201,-0.568086,-0.548237,-0.534874,-0.528142
1,0.146127,0.146319,0.146673,0.147148,0.147664,0.148116,0.148352,0.148197,0.147412,0.145711,...,-0.80065,-0.749602,-0.702473,-0.659852,-0.622281,-0.590226,-0.564082,-0.544211,-0.530831,-0.52409
2,0.125147,0.12525,0.125431,0.125649,0.125833,0.125888,0.125671,0.125022,0.123719,0.121493,...,-0.793121,-0.74207,-0.694912,-0.652243,-0.614614,-0.582501,-0.556304,-0.536387,-0.522977,-0.516219
3,0.094507,0.094476,0.094392,0.094222,0.093902,0.093352,0.092445,0.091043,0.088948,0.085919,...,-0.782423,-0.731373,-0.684174,-0.641437,-0.603726,-0.571527,-0.54525,-0.525269,-0.511812,-0.50503
4,0.055045,0.054833,0.05439,0.053692,0.05269,0.051318,0.049474,0.047047,0.043872,0.039745,...,-0.769246,-0.718204,-0.670956,-0.628134,-0.590318,-0.55801,-0.531631,-0.511565,-0.498048,-0.491235


# 1. RandomForestClassifier

In [20]:
def rfc(X_train,y_train):
    # Define model
    model_rfc = RandomForestClassifier(n_estimators=30, random_state=0)


    model_rfc.fit(X_train, y_train)
    y_pred_rfc = model_rfc.predict(X_test)

    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=1)
    # evaluate model
    scores = cross_val_score(model_rfc, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
    # summarize performance
    print('Mean Accuracy: %.3f' % np.mean(scores))
    
    accuracy = accuracy_score(y_test, y_pred_rfc)
    precision = precision_score(y_test, y_pred_rfc)
    recall = recall_score(y_test, y_pred_rfc)

    print(f"Accuracy using RandomForestClassifier: %.3f"%accuracy)
    print(f"Precision using RandomForestClassifier: %.3f"%precision)
    print("Recall using RandomForestClassifier: %.3f" %recall)

In [21]:
rfc(X_train,y_train)

Mean Accuracy: 0.999
Accuracy using RandomForestClassifier: 0.991
Precision using RandomForestClassifier: 0.991
Recall using RandomForestClassifier: 1.000


## 1.1 Random Forest With Bootstrap Class Weighting



In [22]:
def rfc_with_bootstrap(X_train,y_train):
    model_rfc_bcw = RandomForestClassifier(n_estimators=30,random_state=0,class_weight='balanced_subsample')
    model_rfc_bcw.fit(X_train, y_train)
    y_pred_rfc_bcw = model_rfc_bcw.predict(X_test)
    
    accuracy_rfc_bcw = accuracy_score(y_test, y_pred_rfc_bcw)
    precision_rfc_bcw = precision_score(y_test, y_pred_rfc_bcw)
    recall_rfc_bcw = recall_score(y_test, y_pred_rfc_bcw)

    print(f"Accuracy using Bootstrap Class Weighting: %.3f"%accuracy_rfc_bcw)
    print(f"Precision using Bootstrap Class Weighting: %.3f"%precision_rfc_bcw)
    print("Recall using Bootstrap Class Weighting: %.3f" %recall_rfc_bcw)

In [23]:
rfc_with_bootstrap(X_train,y_train)

Accuracy using Bootstrap Class Weighting: 0.991
Precision using Bootstrap Class Weighting: 0.991
Recall using Bootstrap Class Weighting: 1.000


# 2. Using Bagging Classifier

We are dealing with imbalanced classification problem so can't use usual ML models without accounting for dataset class imbalance. Here I use bagging classifier as first approach to deal with thhat problem.

https://machinelearningmastery.com/what-is-imbalanced-classification/

1. Bagging classifier is an ensemble classifier which is created using multiple estimators which can be trained using different sampling techniques including bagging or bootstrap aggregation (samples drawn with replacement). 

2. Bagging classifier helps reduce the variance of individual estimators by sampling technique and combining the predictions.


In [26]:
def bagging_classifier(X_train, y_train):
    
    # deafault base estimator is a DecisionTreeClassifier.

    bagging = BaggingClassifier(n_estimators=10, random_state=0)
    bagging.fit(X_train, y_train)

    y_pred_bagging = bagging.predict(X_test)

    accuracy_bag = accuracy_score(y_test, y_pred_bagging)
    precision_bag = precision_score(y_test, y_pred_bagging)
    recall_bag = recall_score(y_test, y_pred_bagging)
    
    print(f"Accuracy using Bagging Classifier: %.3f"%accuracy_bag)
    print(f"Precision using Bagging Classifier: %.3f"%precision_bag)
    print(f"Recall using Bagging Classifier: %.3f"%recall_bag)

    # Model scores on test and training data
    print('Model training Score: %.3f' %bagging.score(X_train, y_train))
    print('Model test Score: %.3f ' %bagging.score(X_test, y_test))

In [27]:
bagging_classifier(X_train, y_train)

Accuracy using Bagging Classifier: 0.991
Precision using Bagging Classifier: 0.991
Recall using Bagging Classifier: 1.000
Model training Score: 1.000
Model test Score: 0.991 


# 2.1 Bagging with Random Forest Classifier as Base Estimator


In [28]:
def bagging_classifier_rgb(X_train, y_train):
    rfc_bagging = BaggingClassifier(base_estimator=RandomForestClassifier(),n_estimators=10, random_state=0)
    rfc_bagging.fit(X_train, y_train)

    y_pred_rfc_bagging = rfc_bagging.predict(X_test)

    accuracy_bag_rfc = accuracy_score(y_test, y_pred_rfc)
    precision_bag_rfc = precision_score(y_test, y_pred_rfc)
    recall_bag_rfc= recall_score(y_test, y_pred_rfc)
    print(f"Accuracy using Bagging Classifier with RFC: %.3f"%accuracy_bag_rfc)
    print(f"Precision using Bagging Classifier with RFC: %.3f"%precision_bag_rfc)
    print(f"Recall using Bagging Classifier with RFC: %.3f"%recall_bag_rfc)

    # Model scores on test and training data
    print('Model training Score: %.3f' %rfc_bagging.score(X_train, y_train))
    print('Model test Score: %.3f ' %rfc_bagging.score(X_test, y_test))


In [29]:
bagging_classifier_rgb(X_train, y_train)

Accuracy using Bagging Classifier with RFC: 0.991
Precision using Bagging Classifier with RFC: 0.991
Recall using Bagging Classifier with RFC: 1.000
Model training Score: 1.000
Model test Score: 0.991 


# 2.2 Bagging with Support Vector Classifier as Base Estimator

In [32]:
def bagging_classifier_svc(X_train, y_train):
    
    #First just fit data using classic SVC for test purpose 
    model_svc = svm.SVC()

    model_svc.fit(X_train, y_train)

    y_pred_svc = model_svc.predict(X_test)

    accuracy_svc = accuracy_score(y_test, y_pred_svc)
    precision_svc = precision_score(y_test, y_pred_svc)
    recall_svc = recall_score(y_test, y_pred_svc)

    print(f"Accuracy using SVC: %.3f"%accuracy_svc)
    print(f"Precision using SVC: %.3f"%precision_svc)
    print(f"Recall using SVC: %.3f"%precision_svc)
    
    # Now using bagging classfier
    svc_bagging = BaggingClassifier(base_estimator=svm.SVC(),n_estimators=10, random_state=0)
    svc_bagging.fit(X_train, y_train)

    y_pred_svc_bagging = svc_bagging.predict(X_test)

    accuracy_bag_svc = accuracy_score(y_test, y_pred_svc_bagging)
    precision_bag_svc = precision_score(y_test, y_pred_svc_bagging)
    recall_bag_svc = recall_score(y_test, y_pred_svc_bagging)

    print(f"Accuracy using Bagging Classifier with SVC: %.3f"%accuracy)
    print(f"Precision using Bagging Classifier with SVC: %.3f"%precision)
    print(f"Recall using Bagging Classifier with SVC: %.3f"%precision)
    
    print('Model training Score: %.3f' %bagging.score(X_train, y_train))
    print('Model test Score: %.3f ' %bagging.score(X_test, y_test))

In [34]:
bagging_classifier_svc(X_train, y_train)

Accuracy using SVC: 0.991
Precision using SVC: 0.991
Recall using SVC: 0.991
Accuracy using Bagging Classifier with SVC: 0.991
Precision using Bagging Classifier with SVC: 0.991
Recall using Bagging Classifier with SVC: 0.991
Model training Score: 1.000
Model test Score: 0.991 
