In [1]:
# import torch
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from read_yse_ztf_snana_dir import read_YSE_ZTF_snana_dir

snid_list, meta_list, yse_ztf_fp_df_list = read_YSE_ZTF_snana_dir('spec_yse_dr1')
real_data = pd.DataFrame(pd.read_csv('Real_Data.csv'))

# extracting the IDs that have sufficient information 
truth_ids = list(real_data['ObjectID'])

columns = ['object_id', 'transient_spec_class']

specs = []
for dic in meta_list:
    spec = dic['transient_spec_class']
    specs.append(spec)

p = dict(zip(snid_list, specs))
ground_truth_dict = dict(zip(snid_list, specs))
ground_truth = pd.DataFrame(zip(snid_list, specs), columns=columns)
    
# extracting the spec labels corresponding to the truth ids
labels = []
for item in ground_truth_dict.keys():
    if item in truth_ids:
        labels.append(ground_truth_dict[item])

real_truths = zip(truth_ids, labels)
df_real_truths = pd.DataFrame(real_truths, columns=['id', 'class'])

# remove weird classes
# df_real_truths = df_real_truths.drop('SNIa-CSM', axis=0)
df_real_truths = df_real_truths[~df_real_truths['class'].isin(['SNIa-CSM', 'SNIIP', 'SNIIn', \
            'SNIa-91T-like', 'SNIa-SC', 'SNIax[02cx-like]', 'SLSN-II', 'SNIIb', 'SLSN-I', 'SNIb/c', 'SNIc-BL'])]

b = [x for x in range(df_real_truths.index[0], df_real_truths.index[-1] + 1)]
a = set(df_real_truths.index)
weird = list(df_real_truths.index ^ set(b))
print(np.array(weird).dtype, real_data.index.dtype)

# remove rows in real_data.csv that correspond to the removed weird classes
real_data_rem = real_data[~real_data.index.isin(weird)]

# remove the object ID column from real_data
df_real_data = real_data_rem.drop('ObjectID', axis=1)
labels = df_real_truths['class']

print(df_real_truths.shape, df_real_data.shape)

Reading YSE+ZTF SNANA-style data files from directory:  spec_yse_dr1
int64 int64
(247, 2) (247, 16)


In [2]:
'''SciKit Learn MLP Classifier on all dimensions'''
from sklearn.metrics import confusion_matrix

# hyperparameters
max_iter = 500
stopping = False
alpha = 0.0001
random_state = 42


X_train, X_test, y_train, y_test = train_test_split(df_real_data, labels, stratify=labels)

clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)

# print(clf.predict_proba(X_test).shape)

pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels, normalize='all')
# print(conf)

print('training accuracy, unaltered data: ', clf.score(X_train, y_train))
print('testing accuracy, unaltered data: ', clf.score(X_test, y_test))

training accuracy, unaltered data:  0.5837837837837838
testing accuracy, unaltered data:  0.46774193548387094


In [3]:
'''SciKit Learn MLP Classifier on reduced dimensions'''

from PPCA import applyppca
from sklearn.preprocessing import StandardScaler

fakes_df = pd.read_csv("./SNIa_sims.csv", index_col=0)

pca_num = 8

ss = StandardScaler()
smaller = ss.fit_transform(df_real_data)
real_small = pd.DataFrame(smaller)

ss = StandardScaler()
smaller = ss.fit_transform(fakes_df)
fake_small = pd.DataFrame(smaller)

W, var, pcs2_X_real, pcs2_X_fake, score2, coeff2 = applyppca(dataset_real=real_small, dataset_fakes=fake_small, pca_num=pca_num)

# must transpose output of applyppca
X_train, X_test, y_train, y_test = train_test_split(pcs2_X_real.T, labels, stratify=labels)

clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)

pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels)

# compare the predicted and test labels
compare = zip(np.array(y_test), pred_labels)

df_compare = pd.DataFrame(compare, columns=['truths', 'predicted'])
# print(df_compare)

print('training accuracy, ppca: ', clf.score(X_train, y_train))
print('testing accuracy, ppca: ', clf.score(X_test, y_test))



training accuracy, ppca:  0.8324324324324325
testing accuracy, ppca:  0.7903225806451613




In [4]:
'''SciKit Learn MLP test on original PCA'''

from PPCA import applypca

# apply the pca
pca, pcs_X_real, pcs_X_fakes, score, coeff = applypca(dataset_real = df_real_data, dataset_fakes=fakes_df, pca_num=pca_num)

X_train, X_test, y_train, y_test = train_test_split(pcs_X_real, labels, stratify=labels)

clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)

pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels)

# compare the predicted and test labels
compare = zip(np.array(y_test), pred_labels)

df_compare = pd.DataFrame(compare, columns=['truths', 'predicted'])
# print(df_compare)

print('training accuracy, original pca: ', clf.score(X_train, y_train))
print('testing accuracy, original pca: ', clf.score(X_test, y_test))

training accuracy, original pca:  0.8648648648648649
testing accuracy, original pca:  0.8064516129032258




In [5]:
'''Comparisons across different pca_nums and original, pca, ppca'''

# hyperparameters
max_iter = 500
stopping = False
alpha = 0.0001
random_state = 42

columns = ['no pca or ppca', 'pca', 'ppca']
rows = ['training accuracy', 'testing accuracy']

training_accuracies_no = [] # 2d array, columns are training and testing accuracy, rows are the pca_nums
training_accuracies_pca = []
training_accuracies_ppca = []

testing_accuracies_no = [] # 2d array, columns are training and testing accuracy, rows are the pca_nums
testing_accuracies_pca = []
testing_accuracies_ppca = []

pca_nums = [6,7,8,9,10,11,12,13]

for pca_num in pca_nums:
    
    # no application of either pca or ppca
    X_train, X_test, y_train, y_test = train_test_split(df_real_data, labels, stratify=labels)
    clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)
    training_accuracies_no.append(clf.score(X_train, y_train))
    testing_accuracies_no.append(clf.score(X_test, y_test))
    
    # pca 
    W, var, pcs2_X_real, pcs2_X_fake, score2, coeff2 = applyppca(dataset_real=real_small, dataset_fakes=fake_small, pca_num=pca_num)
    X_train, X_test, y_train, y_test = train_test_split(pcs2_X_real.T, labels, stratify=labels)
    clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)
    training_accuracies_pca.append(clf.score(X_train, y_train))
    testing_accuracies_pca.append(clf.score(X_test, y_test))
    
    # ppca
    pca, pcs_X_real, pcs_X_fakes, score, coeff = applypca(dataset_real = df_real_data, dataset_fakes=fakes_df, pca_num=pca_num)
    X_train, X_test, y_train, y_test = train_test_split(pcs_X_real, labels, stratify=labels)
    clf = MLPClassifier(random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)
    training_accuracies_ppca.append(clf.score(X_train, y_train))
    testing_accuracies_ppca.append(clf.score(X_test, y_test))
    
massive_dic_no = {'run':np.arange(1,9), 'training accuracy':training_accuracies_no, 'testing accuracy':testing_accuracies_no}
massive_dic_pca = {'pca_nums':pca_nums, 'training accuracy':training_accuracies_pca, 'testing accuracy':testing_accuracies_pca}
massive_dic_ppca = {'pca_nums':pca_nums, 'training accuracy':training_accuracies_ppca, 'testing accuracy':testing_accuracies_ppca}

print('----------------No PCA or PPCA----------------')
print(pd.DataFrame(massive_dic_no).to_string(index=False), '\n')
print('-----------------------PCA--------------------')
print(pd.DataFrame(massive_dic_pca).to_string(index=False), '\n')
print('----------------------PPCA--------------------')
print(pd.DataFrame(massive_dic_ppca).to_string(index=False))



----------------No PCA or PPCA----------------
 run  training accuracy  testing accuracy
   1           0.751351          0.758065
   2           0.756757          0.709677
   3           0.535135          0.451613
   4           0.751351          0.758065
   5           0.659459          0.677419
   6           0.443243          0.435484
   7           0.551351          0.467742
   8           0.551351          0.467742 

-----------------------PCA--------------------
 pca_nums  training accuracy  testing accuracy
        6           0.848649          0.822581
        7           0.837838          0.758065
        8           0.832432          0.790323
        9           0.859459          0.774194
       10           0.848649          0.822581
       11           0.864865          0.774194
       12           0.881081          0.822581
       13           0.875676          0.806452 

----------------------PPCA--------------------
 pca_nums  training accuracy  testing accuracy
       

