In [49]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Training

In [50]:
# Read the training table from disk into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer='data/exoTrain.csv')

In [78]:
# Preview the first three rows of the training table.
train_data.head(n=3)

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67


We keep only the first 5000 stars (rows) of the training set.

In [53]:
# Keep the first 5000 rows, all columns.
train_data = train_data.head(5000)

In [54]:
# Feature matrix: every column except the label.
X = np.array(train_data.drop(columns=['LABEL']))
# Label vector: selecting the single column as a Series already yields a 1-D array.
y = np.array(train_data['LABEL'])

We create a class for testing different ML models

In [67]:
class Classifier:
    """Pair an estimator with a ``StandardScaler`` and report per-class rates.

    The wrapped ``model`` must provide ``fit(X, y)``, ``score(X, y)`` and
    ``predict(X)``. Features are standardised with a scaler fitted in
    ``train`` before being handed to the model, both at training and at
    prediction time.
    """

    def __init__(self, model):
        """Store ``model`` and create a fresh, unfitted ``StandardScaler``."""
        self.model = model
        self.scaler = StandardScaler()

    def train(self, X, y):
        """Fit the scaler and the model on ``X``/``y``.

        Returns:
            ``self.model.score`` evaluated on the same (scaled) training data,
            i.e. a training score, not a held-out score.
        """
        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)
        return self.model.score(X, y)

    def predict_rescaled(self, X):
        """Scale ``X`` with the already-fitted scaler and return model predictions."""
        return self.model.predict(self.scaler.transform(X))

    def _report_target_rate(self, X, y, target, among_target, hits_label):
        """Print how often ``target`` is predicted within one slice of the data.

        The slice is the samples whose true label equals ``target`` when
        ``among_target`` is true, otherwise the samples whose true label
        differs from ``target``. Three lines are printed: the hit count
        (prefixed with ``hits_label``), the slice size, and their ratio.
        """
        # Flatten both so a column-vector ``y`` cannot broadcast against the
        # predictions into an (n, n) mask.
        predicted = np.asarray(self.predict_rescaled(X)).ravel()
        actual = np.asarray(y).ravel()

        population = (actual == target) if among_target else (actual != target)
        # Counts are kept as Python floats so the printed text reads "37.0".
        hits = float(np.count_nonzero(population & (predicted == target)))
        total = float(np.count_nonzero(population))
        # An empty slice has no defined rate; report nan instead of raising
        # ZeroDivisionError.
        score = hits / total if total else float('nan')

        print(hits_label + ' : ' + str(hits))
        print('total : ' + str(total))
        print('score : ' + str(score))

    def good_detection_score(self, X, y, target = 2):
        """Print the fraction of true ``target`` samples predicted as ``target``.

        ``y`` must hold one label per row of ``X``. Prints the number of
        correct guesses, the number of samples labelled ``target``, and their
        ratio (``nan`` when no sample is labelled ``target``). Returns ``None``.
        """
        self._report_target_rate(X, y, target, True, 'correct guesses')

    def fake_detection_score(self, X, y, target = 2):
        """Print the fraction of non-``target`` samples predicted as ``target``.

        ``y`` must hold one label per row of ``X``. Prints the number of false
        detections, the number of samples not labelled ``target``, and their
        ratio (``nan`` when every sample is labelled ``target``). Returns
        ``None``.
        """
        self._report_target_rate(X, y, target, False, 'fake guesses')

            

Training on the training data set

In [68]:
# Random forest with a fixed seed so that re-running the notebook rebuilds
# the same forest and reproduces the reported scores.
model1 = Classifier(RandomForestClassifier(random_state=0))
# The support-vector classifier is imported at the top as `SVC`;
# `SupportVectorClassifier` is not defined anywhere and raised NameError.
model2 = Classifier(SVC())
# `train` returns the model's score on the training data itself.
score1 = model1.train(X,y)
score2 = model2.train(X,y)

Good detection score (fraction of stars labelled 2 that the model also predicts as 2, i.e. recall)

In [71]:
# Report the good-detection rate on the training data for each model in turn.
for fitted in (model1, model2):
    fitted.good_detection_score(X, y)

correct guesses : 37.0
total : 37.0
score : 1.0
correct guesses : 1.0
total : 37.0
score : 0.02702702702702703


Fake detection score (fraction of stars not labelled 2 that the model wrongly predicts as 2, i.e. false-positive rate)

In [70]:
# Report the fake-detection rate on the training data for each model in turn.
for fitted in (model1, model2):
    fitted.fake_detection_score(X, y)

fake guesses : 0.0
total : 4963.0
score : 0.0
fake guesses : 0.0
total : 4963.0
score : 0.0


# Testing

In [72]:
# Read the held-out test table from disk into a DataFrame.
test_data = pd.read_csv(filepath_or_buffer='data/exoTest.csv')

In [80]:
# Preview the first three rows of the test table.
test_data.head(n=3)

Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,119.88,100.21,86.46,48.68,46.12,39.39,18.57,6.98,6.63,...,14.52,19.29,14.44,-1.62,13.33,45.5,31.93,35.78,269.43,57.72
1,2,5736.59,5699.98,5717.16,5692.73,5663.83,5631.16,5626.39,5569.47,5550.44,...,-581.91,-984.09,-1230.89,-1600.45,-1824.53,-2061.17,-2265.98,-2366.19,-2294.86,-2034.72
2,2,844.48,817.49,770.07,675.01,605.52,499.45,440.77,362.95,207.27,...,17.82,-51.66,-48.29,-59.99,-82.1,-174.54,-95.23,-162.68,-36.79,30.63


In [75]:
# Feature matrix: every column except the label.
X_test = np.array(test_data.drop(columns=['LABEL']))
# Label vector: selecting the single column as a Series already yields a 1-D array.
y_test = np.array(test_data['LABEL'])

In [76]:
# Evaluate the first model on the test data: good detections, then fake detections.
for report in (model1.good_detection_score, model1.fake_detection_score):
    report(X_test, y_test)

correct guesses : 0.0
total : 5.0
score : 0.0
fake guesses : 0.0
total : 565.0
score : 0.0


In [77]:
# Evaluate the second model on the test data: good detections, then fake detections.
for report in (model2.good_detection_score, model2.fake_detection_score):
    report(X_test, y_test)

correct guesses : 0.0
total : 5.0
score : 0.0
fake guesses : 0.0
total : 565.0
score : 0.0
