In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import torch
import torch.nn as nn
from torch.autograd import Variable
from tqdm import trange

# Data processing

In [None]:
# Load the training table from the local `data/` directory (path is relative
# to the notebook's working directory).
train_data = pd.read_csv('data/exoTrain.csv')

In [None]:
# Preview the first three rows of the training table.
train_data.head(3)

# Training with classical classifiers

We keep only the first 5000 stars (rows) of the training table.

In [None]:
# Keep at most the first 5000 rows, with all columns.
train_data = train_data.iloc[:5000,:]

In [None]:
# Feature matrix: every column except LABEL. Target: the LABEL column as a 1-D array.
X = np.array(train_data.drop(columns=['LABEL']))
y = np.array(train_data['LABEL'])

We define a small wrapper class so that different ML models can be trained and scored through the same interface.

In [None]:
class Classifier:
    """Wrap an estimator with feature standardisation and two detection metrics.

    The wrapped ``model`` only needs to provide ``fit(X, y)``, ``score(X, y)``
    and ``predict(X)``; these are the only methods called on it.
    """

    def __init__(self, model):
        """Store ``model`` and create the (still unfitted) feature scaler."""
        self.model = model
        self.scaler = StandardScaler()

    def train(self,X,y):
        """Fit the scaler and the model on ``(X, y)``.

        Returns:
            Whatever ``model.score`` returns when evaluated on the same
            (scaled) data that was used for fitting.
        """
        self.scaler.fit(X)
        X = self.scaler.transform(X)
        self.model.fit(X, y)
        return self.model.score(X, y)

    def predict_rescaled(self, X):
        """Scale ``X`` with the fitted scaler and return the model's predictions."""
        X = self.scaler.transform(X)
        return self.model.predict(X)

    def _detection_counts(self, X, y, target, among_target):
        """Count samples predicted as ``target`` within one group of samples.

        The group is the samples whose true label equals ``target`` when
        ``among_target`` is true, otherwise the samples whose true label
        differs from ``target``.

        Returns:
            ``(hits, total)`` as floats: the number of group members predicted
            as ``target`` and the size of the group.
        """
        predicted = np.asarray(self.predict_rescaled(X)).reshape(-1)
        # Flatten so that a column vector of labels does not broadcast
        # against the 1-D predictions.
        actual = np.asarray(y).reshape(-1)[:len(predicted)]
        in_group = (actual == target) if among_target else (actual != target)
        hits = float(np.count_nonzero(in_group & (predicted == target)))
        total = float(np.count_nonzero(in_group))
        return hits, total

    @staticmethod
    def _ratio(hits, total):
        """Return ``hits / total``, or NaN when the group is empty."""
        return hits / total if total else float('nan')

    def good_detection_score(self, X, y, target = 2):
        """Print the share of ``target``-labelled samples predicted as ``target``.

        Prints the number of correct guesses, the number of samples labelled
        ``target`` and their ratio. The ratio is reported as ``nan`` (instead
        of raising ``ZeroDivisionError``) when no sample is labelled ``target``.
        """
        correct_guesses, total = self._detection_counts(X, y, target, True)
        print('correct guesses : ' + str(correct_guesses))
        print('total : ' + str(total))
        print('score : '+str(self._ratio(correct_guesses, total)))

    def fake_detection_score(self, X, y, target = 2):
        """Print the share of non-``target`` samples wrongly predicted as ``target``.

        Prints the number of fake guesses, the number of samples whose label
        is not ``target`` and their ratio. The ratio is reported as ``nan``
        (instead of raising ``ZeroDivisionError``) when every sample is
        labelled ``target``.
        """
        fake_guesses, total = self._detection_counts(X, y, target, False)
        print('fake guesses : ' + str(fake_guesses))
        print('total : ' + str(total))
        print('score : '+str(self._ratio(fake_guesses, total)))

            

Training on the training set

In [None]:
# Seed the random forest so that a fresh "Restart & Run All" reproduces the
# same fitted model and therefore the same scores below.
SEED = 0
model1 = Classifier(RandomForestClassifier(random_state=SEED))
model2 = Classifier(SVC())
# `train` returns the fitted model's `score` on the training data itself.
score1 = model1.train(X,y)
score2 = model2.train(X,y)

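Good detection score on the training set: among the stars whose label is the target class (label 2 by default), the fraction that the model also predicts as that class.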

In [None]:
# Report the good-detection score of each classical model on the training data.
for fitted in (model1, model2):
    fitted.good_detection_score(X, y)

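Fake detection score on the training set: among the stars whose label is not the target class, the fraction that the model nevertheless predicts as the target class.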

In [None]:
# Report the fake-detection score of each classical model on the training data.
for fitted in (model1, model2):
    fitted.fake_detection_score(X, y)

# Testing the classical classifiers

In [None]:
# Load the held-out table and split it the same way as the training table:
# every column except LABEL as features, LABEL as a 1-D target.
test_data = pd.read_csv('data/exoTest.csv')
X_test = np.array(test_data.drop(columns=['LABEL']))
y_test = np.array(test_data['LABEL'])

In [None]:
# First model on the test data: good-detection report, then fake-detection report.
for report in (model1.good_detection_score, model1.fake_detection_score):
    report(X_test, y_test)

In [None]:
# Second model on the test data: good-detection report, then fake-detection report.
for report in (model2.good_detection_score, model2.fake_detection_score):
    report(X_test, y_test)

# A convolutional neural network

In [None]:
class CNN_nn(nn.Module):
    """Two-layer 1-D convolutional network with built-in pre-processing and scoring.

    Pre-processing (``process_features``): standardise with the fitted
    ``StandardScaler``, smooth each row with a moving average over
    ``average_size`` samples, then reshape to ``(-1, 1, sequence_length)``.

    Network (``forward``): Conv1d -> ReLU -> MaxPool1d -> Conv1d -> ReLU ->
    MaxPool1d -> flatten -> Linear -> Sigmoid.

    Note: ``process_features`` always produces a single input channel, so the
    pre-processing helpers only match a network built with ``input_size=1``.
    """

    def __init__(self,input_size, kernel_number, sequence_length, kernel_size, stride = 1, average_size = 20, output_size = 1):
        """Build the layers and the (still unfitted) scalers.

        Args:
            input_size: number of input channels of the first convolution.
            kernel_number: number of output channels of both convolutions.
            sequence_length: length of one raw (unsmoothed) input sequence.
            kernel_size: kernel size of both convolutions and both poolings.
            stride: stride of both convolutions.
            average_size: window length of the moving average.
            output_size: number of outputs of the final linear layer.
        """
        super(CNN_nn, self).__init__()
        #Attributes
        self.input_size = input_size #input size
        self.output_size = output_size #output size
        self.kernel_number = kernel_number #number of kernels
        self.kernel_size = kernel_size
        self.stride = stride
        self.average_size = average_size
        # Stored length is the one AFTER smoothing: a window of `average_size`
        # samples shortens every sequence by `average_size - 1`.
        self.sequence_length = sequence_length-self.average_size+1
        self.scaler = StandardScaler()
        self.scaler_target = MinMaxScaler()

        self.cnn1 = nn.Conv1d(self.input_size,self.kernel_number,self.kernel_size,self.stride) #Conv 1d
        self.max_pool1 = nn.MaxPool1d(self.kernel_size) #Max pooling
        self.cnn2 = nn.Conv1d(self.kernel_number,self.kernel_number,self.kernel_size,self.stride) #Conv 1d
        self.max_pool2 = nn.MaxPool1d(self.kernel_size) #Max pooling

        # Push one dummy batch through the convolutional stack to discover the
        # length of its output; no gradients are needed for this probe.
        with torch.no_grad():
            example = torch.randn(1,self.input_size,self.sequence_length)
            example = self.max_pool1(self.cnn1(example))
            example = self.max_pool2(self.cnn2(example))
        self.Lout = example.size(2) #Length of the output sequence

        self.fc = nn.Linear(self.kernel_number*self.Lout, self.output_size) #fully connected linear

        self.relu = nn.ReLU()
        self.sig = nn.Sigmoid()

    def smoothening(self,X):
        """Return the row-wise moving average of the 2-D array ``X``.

        Each output row has ``X.shape[1] - average_size + 1`` entries; entry
        ``i`` is the mean of ``X[row, i:i + average_size]``.
        """
        X = np.asarray(X)
        window = self.average_size
        rows = X.shape[0]
        out_columns = X.shape[1]-window+1
        res = np.zeros((rows,out_columns))
        # Add the `window` shifted copies of X one after the other. Every
        # output entry is accumulated left to right, exactly like a
        # per-window running sum, but with `window` numpy operations instead
        # of one Python-level sum per output entry.
        for offset in range(window):
            res += X[:,offset:offset+out_columns]
        return res/window

    def process_features(self, X):
        """Standardise, smooth and reshape raw features into a network input tensor."""
        #Standardisation
        X_scaled = self.scaler.transform(X)

        #Smoothening with moving average
        X_smooth = self.smoothening(X_scaled)

        #Reshaping to (batch, 1 channel, smoothed length)
        X_torch = torch.tensor(X_smooth, dtype=torch.get_default_dtype())
        return X_torch.reshape(-1,1,self.sequence_length)

    def process_target(self, y):
        """Rescale targets with the fitted ``MinMaxScaler`` and return them as a tensor."""
        y_scaled = self.scaler_target.transform(y)
        return torch.tensor(y_scaled, dtype=torch.get_default_dtype())

    def forward(self,X):
        """Run the network on an already pre-processed input tensor."""
        #First block: convolution, relu activation, max pooling
        out = self.max_pool1(self.relu(self.cnn1(X)))

        #Second block: convolution, relu activation, max pooling
        out = self.max_pool2(self.relu(self.cnn2(out)))

        # Flatten the output for fully connected layer
        out = out.flatten(1,2)

        # Fully connected linear layer followed by sigmoid activation
        return self.sig(self.fc(out))

    def forward_with_processing(self,X):
        """Pre-process raw features with ``process_features`` and run ``forward``."""
        X_torch = self.process_features(X)
        return self.forward(X_torch)

    def train(self, num_epochs=True, learning_rate=None, criterion=None, X_train=None, y_train=None, X_test=None, y_test=None, mode=None):
        """Train the network, or switch training mode like ``nn.Module.train``.

        This name shadows ``nn.Module.train(mode)``, which ``nn.Module.eval()``
        relies on. To keep both usages working:

        * ``train(num_epochs, learning_rate, criterion, X_train, y_train,
          X_test, y_test)`` runs the training loop (see ``fit``) and returns
          ``None``;
        * ``train()``, ``train(True/False)`` or ``train(mode=...)`` only sets
          the module's training flag and returns ``self``.
        """
        if mode is not None:
            return super(CNN_nn, self).train(mode)
        # A bool first argument can only be a training-mode flag: an epoch
        # count is an int (bool is checked explicitly, 0/1 stay epoch counts).
        if isinstance(num_epochs, bool):
            return super(CNN_nn, self).train(num_epochs)
        if learning_rate is None or criterion is None or X_train is None or y_train is None or X_test is None or y_test is None:
            raise TypeError("train() needs num_epochs, learning_rate, criterion, X_train, y_train, X_test and y_test to run the training loop")
        return self.fit(num_epochs, learning_rate, criterion, X_train, y_train, X_test, y_test)

    def fit(self, num_epochs, learning_rate, criterion, X_train, y_train, X_test, y_test):
        """Fit the scalers on the training data and optimise the network with Adam.

        Every iteration uses the full training set as one batch. The test set
        is only used to report a loss in the progress bar; it never
        contributes to the gradients.

        Args:
            num_epochs: the loop runs ``num_epochs + 1`` iterations (0..num_epochs).
            learning_rate: learning rate passed to ``torch.optim.Adam``.
            criterion: loss callable applied as ``criterion(outputs, targets)``.
            X_train, y_train: raw training features and targets.
            X_test, y_test: raw test features and targets.
        """
        self.scaler.fit(X_train)
        self.scaler_target.fit(y_train)

        X_train = self.process_features(X_train)
        y_train = self.process_target(y_train)

        X_test = self.process_features(X_test)
        y_test = self.process_target(y_test)

        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        t = trange(num_epochs+1)
        for epoch in t:
            #Reset gradients to zero
            optimizer.zero_grad()

            #Pass the training set through the neural network
            train_loss = criterion(self.forward(X_train), y_train)

            #Test loss with the same (pre-update) weights; no graph is needed
            with torch.no_grad():
                test_loss = criterion(self.forward(X_test), y_test)

            #Backpropagation step
            train_loss.backward()

            #Update weights and bias of the network
            optimizer.step()

            #Print train and test loss
            t.set_description("Epoch: %d, Train loss: %1.5f, Test loss: %1.5f" % (epoch, train_loss.item(),test_loss.item()))

    def _detection_counts(self, X, y, target, epsilon, among_target):
        """Count predictions within ``epsilon`` of ``target`` inside one group.

        The group is the samples whose rescaled reference value is closer
        than ``epsilon`` to ``target`` when ``among_target`` is true, and the
        samples whose reference value is further than ``epsilon`` from
        ``target`` otherwise.

        Returns:
            ``(hits, total)`` as floats.
        """
        with torch.no_grad():
            y_predicted = self.forward_with_processing(X).reshape(-1)
        y_ref = self.process_target(y).reshape(-1)[:len(y_predicted)]
        ref_gap = (y_ref-target).abs()
        in_group = (ref_gap < epsilon) if among_target else (ref_gap > epsilon)
        flagged = (y_predicted-target).abs() < epsilon
        hits = float((in_group & flagged).sum().item())
        total = float(in_group.sum().item())
        return hits, total

    def good_detection_score(self, X, y, target = 1.0, epsilon = 0.01):
        """Print how many samples of the ``target`` class are predicted as ``target``.

        A value counts as ``target`` when it lies within ``epsilon`` of it.
        The score is printed as ``nan`` (instead of raising
        ``ZeroDivisionError``) when no sample belongs to the ``target`` class.
        """
        correct_guesses, total = self._detection_counts(X, y, target, epsilon, True)
        print('correct guesses : ' + str(correct_guesses))
        print('total : ' + str(total))
        print('score : '+str(correct_guesses/total if total else float('nan')))

    def fake_detection_score(self, X, y, target = 1.0, epsilon = 0.01):
        """Print how many samples outside the ``target`` class are predicted as ``target``.

        A prediction counts as ``target`` when it lies within ``epsilon`` of
        it. The score is printed as ``nan`` (instead of raising
        ``ZeroDivisionError``) when every sample belongs to the ``target`` class.
        """
        fake_guesses, total = self._detection_counts(X, y, target, epsilon, False)
        print('fake guesses : ' + str(fake_guesses))
        print('total : ' + str(total))
        print('score : '+str(fake_guesses/total if total else float('nan')))

    def fourier_transform(self,X):
        """Return the magnitude of the FFT of ``X`` taken along axis 1."""
        return np.abs(np.fft.fft(X,axis = 1))

    def plot_features(self, X, cut_low= 100, cut_high = 100):
        """Plot one sequence, its moving average and part of its FFT magnitude.

        Args:
            X: a single sequence (array that can be reshaped to one row).
            cut_low: number of leading FFT bins left out of the third plot.
            cut_high: number of trailing FFT bins left out of the third plot.
        """
        X_smooth = self.smoothening(X.reshape(1,-1))
        FFT_norm = self.fourier_transform(X_smooth)

        # Use the number of FFT bins (axis 1) for the upper bound. The row
        # count is always 1 here, and a stop index of 0 (cut_high = 0) would
        # select nothing.
        n_bins = FFT_norm.shape[1]
        stop = max(cut_low, n_bins-cut_high)

        plt.clf()
        plt.figure(1)
        plt.plot(X)

        plt.figure(2)
        plt.plot(X_smooth[0])

        plt.figure(3)
        plt.plot(FFT_norm[0,cut_low:stop])
        plt.show()


In [None]:
# Small training subset for the CNN: the first `number_of_stars` rows and the
# first `sequence_length + 1` columns of the training table.
sequence_length = 600
number_of_stars = 100
train_data = pd.read_csv('data/exoTrain.csv').iloc[:number_of_stars, :sequence_length+1]
# Features: every kept column except LABEL. Target: LABEL as a column vector.
X_train = np.array(train_data.drop(columns=['LABEL']))
y_train = np.array(train_data['LABEL']).reshape(-1,1)

In [None]:
# Test subset for the CNN: at most `number_of_stars` rows, with the same
# column cut as the training subset.
number_of_stars = 2000
test_data = pd.read_csv('data/exoTest.csv').iloc[:number_of_stars, :sequence_length+1]
# Features: every kept column except LABEL. Target: LABEL as a column vector.
X_test = np.array(test_data.drop(columns=['LABEL']))
y_test = np.array(test_data['LABEL']).reshape(-1,1)

# CNN training and testing

In [None]:
# Seed torch before the layers are created so that the initial weights, and
# hence the training run below, are the same on every fresh run.
torch.manual_seed(0)
model3 = CNN_nn(input_size=1, kernel_number = 16, sequence_length=sequence_length, kernel_size=8)

In [None]:
# Training hyper-parameters and loss for the CNN.
num_epochs = 500
learning_rate = 0.002
criterion = nn.BCELoss()

# The test set is passed along with the training set; both go to the model's
# own training routine.
model3.train(
    num_epochs=num_epochs,
    learning_rate=learning_rate,
    criterion=criterion,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# CNN reports on the training subset: good detections first, then fake ones.
print('Training')
for report in (model3.good_detection_score, model3.fake_detection_score):
    report(X_train, y_train)

In [None]:
# CNN reports on the test subset: good detections first, then fake ones.
print('Testing')
for report in (model3.good_detection_score, model3.fake_detection_score):
    report(X_test, y_test)

In [None]:
# Plot the sequence stored at row 70 of the training features: raw values,
# moving average, and FFT magnitude with 100 bins trimmed at each end.
star_flux = X_train[70]
model3.plot_features(star_flux, cut_low=100, cut_high=100)