In [9]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.spatial.distance import cdist
from scipy import stats

#To get better visual of the confusion matrix:
def plot_confusion_matrix(cm, classes,
             normalize=False,
             title='Confusion matrix',
             cmap=plt.cm.Blues):
    #Add Normalization Option
    '''prints pretty confusion metric with normalization option '''
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix\\\\")
    #else:
        #print('Confusion matrix, without normalization\\\\')
    
#     print(cm)
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
    
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#carichiamo i dati 
df = pd.read_csv("exoplanet.csv")
#df.info()
df = df.replace('CONFIRMED', 0)
df = df.replace('CANDIDATE', 1)
df = df.replace('FALSE POSITIVE', 2)
df = df.fillna(0)

df.sort_values('koi_disposition', inplace=True, ascending=False)
df = df.iloc[2500:]

shuffled = df.sample(frac=1, random_state=42)

shuffled.to_csv("Exoplanet_mod.csv")

In [10]:
#normalize feature with MinMaxScaler after it we need to fit the data
MinMaxScaler = preprocessing.MinMaxScaler()

y_data = pd.read_csv('Exoplanet_mod.csv', usecols= ['koi_disposition'])#.values.transpose()[0]
x_data = pd.read_csv('Exoplanet_mod.csv', usecols= ['koi_fpflag_nt','koi_fpflag_ss','koi_fpflag_co','koi_fpflag_ec',
                                                    'koi_period','koi_period_err1','koi_period_err2','koi_time0bk','koi_time0bk_err1',
                                                    'koi_time0bk_err2','koi_impact','koi_impact_err1','koi_impact_err2','koi_duration',
                                                    'koi_duration_err1','koi_duration_err2','koi_depth','koi_depth_err1','koi_depth_err2',
                                                    'koi_prad','koi_prad_err1','koi_prad_err2','koi_teq','koi_teq_err1','koi_teq_err2',
                                                    'koi_insol','koi_insol_err1','koi_insol_err2','koi_model_snr','koi_tce_plnt_num',
                                                    'koi_steff','koi_steff_err1','koi_steff_err2','koi_slogg','koi_slogg_err1',
                                                    'koi_slogg_err2','koi_srad','koi_srad_err1','koi_srad_err2','ra','dec','koi_kepmag'])

#take the 70% of data for train and 30% for test
x_train = x_data[:5000]#5651
y_train = y_data[:5000]

x_test = x_data[5000:]
y_test = y_data[5000:]

x_train = x_train.values
x_test = x_test.values

print("Classes for training: ",y_train.shape)
print("Feature for training: ",x_train.shape)

print("Classes for training: ",y_test.shape)
print("Feature for training: ",x_test.shape)

x_train = MinMaxScaler.fit_transform(x_train)
x_test = MinMaxScaler.fit_transform(x_test)

Classes for training:  (5000, 1)
Feature for training:  (5000, 42)
Classes for training:  (2064, 1)
Feature for training:  (2064, 42)


In [11]:
#Calculate the mean of each feature and center the data
m = np.mean(x_train, axis=0)
Xc = x_train - m

m1 = np.mean(x_test, axis=0)
Xc_t = x_test - m1

# Calculate the covariance matrix of the centred data
C = np.cov(Xc, rowvar=False)
C1 = np.cov(Xc_t, rowvar=False)

# Calculate eigenvalues and eigenvectors of the covariance matrix
lambdas, U = np.linalg.eigh(C)
lambdas1, U1 = np.linalg.eigh(C1)

# Order the eigenvalues from largest to smallest

best_eig_idxs = np.argsort(lambdas)[::-1]
best_eig = lambdas[best_eig_idxs]
best_U = U[:, best_eig_idxs]

best_eig_idxs1 = np.argsort(lambdas1)[::-1]
best_eig1 = lambdas1[best_eig_idxs1]
best_U1 = U1[:, best_eig_idxs1]

# I construct the transformation matrix T: 
T = best_U[:, :4]
T1 = best_U1[:, :4]

# Apply the transformation to the train data, make a scatter plot of the transformed data
# The transformation is performed via the inner product of the transformation matrix T
# and the (centered) data matrix
XT_train = np.dot(Xc, T)
XT_test = np.dot(Xc_t, T1)


In [12]:
#definition of kernels

def gamma(x, ktype):
  if ktype == 'rect':
        return 1 if abs(x[0]) <= 0.5 and abs(x[1]) <= 0.5 and abs(x[2]) <= 0.5 and abs(x[3]) <= 0.5 else 0
  elif ktype == 'gaussian':
        return ((2*np.pi)**(-1/2)) * np.exp(-((x[0]+x[1]+x[2]+x[3])**2/2))
  elif ktype == 'exponential':
        return 1/2 * np.exp(-abs(x[0]+x[1]+x[2]+x[3]))
  else:
        raise ValueError('Kernel type not recognized. Possible options are: "rect", "gaussian", "exponential".')

#Set hyperparameters
hs = [0.01,0.1,1]

#from dataframe to numpy array
temp = y_train.to_numpy()

#counting number of elements of clusses for storage purpose
bin1 = np.count_nonzero(temp == 1)
bin0 = np.count_nonzero(temp == 0)
bin2 = np.count_nonzero(temp == 2)

#array for classes storage
c1 = np.zeros((bin1, 4))
c2 = np.zeros((bin2, 4))
c0 = np.zeros((bin0, 4))

f = len(temp)
j,m,n = 0,0,0

#dividing dataset with respect the 3 classes

for i in range(f):
    if temp[i]==0 and j < bin0:
        c0[j] = XT_train[i]
        j+=1
    elif temp[i]==2 and m < bin2:
        c2[m] = XT_train[i]
        m = m+1
    elif temp[i]==1 and n < bin1:
        c1[n] = XT_train[i]
        n=n+1
    else:
        pass

kernels = ["rect", "gaussian", "exponential"]

# Estimate the likelihood through the Parzen Windows method
# For each test item, calculate the likelihood for each of the 3 classes. Append to predicted the class with the higher likelihood

for kernel in kernels:
    for h in hs:
        predicted = []
        for x_te in XT_test:
            lik1, lik2, lik3= [], [], []
            for x_tr in c1: 
                lik1.append(gamma((np.subtract(x_te, x_tr)/h), kernel)) # Calculate Parzen window likelihood for C1 based on the kernel
            for x_tr in c2:
                lik2.append(gamma((np.subtract(x_te, x_tr)/h), kernel)) # Calculate Parzen window likelihood for C2 based on the kernel
            for x_tr in c0:
                lik3.append(gamma((np.subtract(x_te, x_tr)/h), kernel)) # Calculate Parzen window likelihood for C0 based on the kernel
            
            l1, l2, l3 = 1/h * np.mean(lik1), 1/h * np.mean(lik2), 1/h * np.mean(lik3), # Calculate final probability of belonging to either class
            predicted.append(np.argmax([l1,l2,l3])) # Assign the class with higher probability

        y_t = y_test.values.tolist()

        g = len(y_t)
        # Calculate accuracy
        accuracy = 0
        for i in range(g):
            if predicted[i] == list(y_t)[i]:
                accuracy+=1
        accuracy = accuracy/n
        print(f'Accuracy with h={h} and kernel function "{kernel}": {accuracy*100}%')
    print("\n")

Accuracy with h=0.01 and kernel function "rect": 42.43002544529262%
Accuracy with h=0.1 and kernel function "rect": 21.119592875318066%
Accuracy with h=1 and kernel function "rect": 27.035623409669213%


Accuracy with h=0.01 and kernel function "gaussian": 27.544529262086513%
Accuracy with h=0.1 and kernel function "gaussian": 34.223918575063614%
Accuracy with h=1 and kernel function "gaussian": 42.30279898218829%


Accuracy with h=0.01 and kernel function "exponential": 32.31552162849873%
Accuracy with h=0.1 and kernel function "exponential": 34.478371501272264%
Accuracy with h=1 and kernel function "exponential": 32.12468193384224%


