In [175]:
# import torch
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from read_yse_ztf_snana_dir import read_YSE_ZTF_snana_dir

snid_list, meta_list, yse_ztf_fp_df_list = read_YSE_ZTF_snana_dir('spec_yse_dr1')
real_data = pd.DataFrame(pd.read_csv('Real_Data.csv'))

# hyperparameters
max_iter = 500
stopping = False
alpha = 0.0001
random_state = 42

# extracting the IDs that have sufficient information 
truth_ids = list(real_data['ObjectID'])

columns = ['object_id', 'transient_spec_class']

specs = []
for dic in meta_list:
    spec = dic['transient_spec_class']
    specs.append(spec)

p = dict(zip(snid_list, specs))
ground_truth_dict = dict(zip(snid_list, specs))
ground_truth = pd.DataFrame(zip(snid_list, specs), columns=columns)
    
# extracting the spec labels corresponding to the truth ids
labels = []
for item in ground_truth_dict.keys():
    if item in truth_ids:
        labels.append(ground_truth_dict[item])

real_truths = zip(truth_ids, labels)
df_real_truths = pd.DataFrame(real_truths, columns=['id', 'class'])

# remove weird classes
# df_real_truths = df_real_truths.drop('SNIa-CSM', axis=0)
df_real_truths = df_real_truths[~df_real_truths['class'].isin(['SNIa-CSM', 'SNIIP', 'SNIIn', \
            'SNIa-91T-like', 'SNIa-SC', 'SNIax[02cx-like]', 'SLSN-II', 'SNIIb', 'SLSN-I', 'SNIb/c', 'SNIc-BL'])]

b = [x for x in range(df_real_truths.index[0], df_real_truths.index[-1] + 1)]
a = set(df_real_truths.index)
weird = list(df_real_truths.index ^ set(b))
print(np.array(weird).dtype, real_data.index.dtype)

# remove rows in real_data.csv that correspond to the removed weird classes
real_data_rem = real_data[~real_data.index.isin(weird)]

# remove the object ID column from real_data
df_real_data = real_data_rem.drop('ObjectID', axis=1)

# print(labels.to_string(index=False))

for i in range(len(df_real_truths)):
    d = df_real_truths.iloc[i]
    if d['class'] == 'SNIb' or d['class'] == 'SNIc':
        d['class'] = 'SNIbc'

labels = df_real_truths['class']

print(df_real_truths.shape, df_real_data.shape)

Reading YSE+ZTF SNANA-style data files from directory:  spec_yse_dr1
int32 int64
(247, 2) (247, 16)


  weird = list(df_real_truths.index ^ set(b))


In [176]:
snIa = pd.DataFrame(pd.read_csv('SNIa_sims.csv'))
snIbc = pd.DataFrame(pd.read_csv('SNIbc_sims.csv'))
snII = pd.DataFrame(pd.read_csv('SNII_sims.csv'))

train_data = pd.concat([snIa, snIbc, snII])

# extracting the IDs that have sufficient information 
train_ids = list(train_data['ObjectID'])

columns = ['object_id', 'transient_spec_class']

specs = []
for dic in meta_list:
    spec = dic['transient_spec_class']
    specs.append(spec)

p = dict(zip(snid_list, specs))
ground_truth_dict_train = dict(zip(snid_list, specs))
ground_truth_train = pd.DataFrame(zip(snid_list, specs), columns=columns)

train_labels = ['SNIa']*len(snIa) + ['SNII']*len(snIbc) + ['SNIbc']*len(snII)

train_truths = zip(train_ids, train_labels)
df_train_truths = pd.DataFrame(train_truths, columns=['id', 'class'])

# remove weird classes
# df_real_truths = df_real_truths.drop('SNIa-CSM', axis=0)
'''
df_train_truths = df_train_truths[~df_train_truths['class'].isin(['SNIa-CSM', 'SNIIP', 'SNIIn', \
            'SNIa-91T-like', 'SNIa-SC', 'SNIax[02cx-like]', 'SLSN-II', 'SNIIb', 'SLSN-I', 'SNIb/c', 'SNIc-BL'])]

b = [x for x in range(df_train_truths.index[0], df_train_truths.index[-1] + 1)]
a = set(df_train_truths.index)
weird = list(df_train_truths.index ^ set(b))
print(np.array(weird).dtype, train_data.index.dtype)
'''

# remove rows in real_data.csv that correspond to the removed weird classes
#train_data_rem = real_data[~train_data.index.isin(weird)]
train_data_rem = train_data

# remove the object ID column from real_data
df_train_data = train_data_rem.drop('ObjectID', axis=1)
train_labels = df_train_truths['class']

print(df_train_truths.shape, df_train_data.shape)

(51154, 2) (51154, 16)


In [177]:
'''SciKit Learn MLP Classifier on all dimensions'''
from sklearn.metrics import confusion_matrix

#X_train, X_test, y_train, y_test = train_test_split(df_real_data, labels, stratify=labels)

X_test = df_real_data
y_test = labels

X_train = df_train_data
y_train = train_labels

clf = MLPClassifier(hidden_layer_sizes=(100,100,100), random_state=1, max_iter=300, early_stopping=True).fit(X_train, y_train)

# print(clf.predict_proba(X_test).shape)

pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels, normalize='all')
# print(conf)

print('training accuracy, no ppca: ', clf.score(X_train, y_train))
print('testing accuracy, no ppca: ', clf.score(X_test, y_test))

training accuracy, no ppca:  0.5262149587520037
testing accuracy, no ppca:  0.5060728744939271


In [178]:
'''SciKit Learn MLP Classifier on reduced dimensions'''

from PPCA import applyppca
from sklearn.preprocessing import StandardScaler
from collections import Counter

# fakes_df = pd.read_csv("./SNIa_sims.csv", index_col=0)

pca_num = 8

ss = StandardScaler()
smaller = ss.fit_transform(df_real_data)
real_small = pd.DataFrame(smaller)

ss = StandardScaler()
smaller = ss.fit_transform(df_train_data)
fake_small = pd.DataFrame(smaller)

W, var, pcs2_X_real, pcs2_X_fake, score2, coeff2 = applyppca(dataset_real=real_small, dataset_fakes=fake_small, pca_num=pca_num)

# must transpose output of applyppca
# X_train, X_test, y_train, y_test = train_test_split(pcs2_X_real.T, labels)

X_test = pcs2_X_real.T
y_test = labels

X_train = pcs2_X_fake.T
y_train = train_labels

# clf = MLPClassifier(hidden_layer_sizes=(100,100), random_state=1, max_iter=500, early_stopping=True).fit(X_train, y_train)
clf = MLPClassifier(hidden_layer_sizes=(100,100), random_state=random_state, max_iter=max_iter, early_stopping=stopping, alpha=alpha).fit(X_train, y_train)

# print("Y_train", y_train.values)
# print("probas", clf.predict_proba(X_test))
# print("counter", Counter(y_train))
pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels)

# compare the predicted and test labels
compare = zip(np.array(y_test), pred_labels)

df_compare = pd.DataFrame(compare, columns=['truths', 'predicted'])
#print(df_compare)

print('training accuracy, ppca: ', clf.score(X_train, y_train))
print('testing accuracy, ppca: ', clf.score(X_test, y_test))

# y_pred = model.predict(x_test)
print("acc", accuracy_score(y_true=labels, y_pred=pred_labels))

print("y_train", Counter(y_train))
print("pred_labels", Counter(pred_labels))
print("true_labels", Counter(labels))

training accuracy, ppca:  0.7257496969933925
testing accuracy, ppca:  0.44534412955465585
y_train Counter({'SNIa': 19004, 'SNII': 16874, 'SNIbc': 15276})
pred_labels Counter({'SNIa': 140, 'SNII': 78, 'SNIbc': 29})
true_labels Counter({'SNIa': 183, 'SNII': 53, 'SNIbc': 11})


In [186]:
from sklearn.metrics import accuracy_score

print("acc", accuracy_score(y_true=labels, y_pred=pred_labels))

acc 0.08502024291497975


In [181]:
'''SciKit Learn MLP test on original PCA'''

from PPCA import applypca

# apply the pca
# pca_num=6



pca, pcs_X_real, pcs_X_fakes, score, coeff = applypca(dataset_real = df_real_data, dataset_fakes=df_train_data, pca_num=pca_num)

# X_train, X_test, y_train, y_test = train_test_split(pcs_X_real, labels)

X_test = pcs_X_real
y_test = labels

X_train = pcs_X_fakes
y_train = train_labels

# clf = MLPClassifier(hidden_layer_sizes=(100,100,100,100), random_state=1, max_iter=500, early_stopping=True).fit(X_train, y_train)

# print("proba", clf.predict_proba(X_test))

pred_labels = clf.predict(X_test)

# print the confusion matrix (should be k by k, where k = 11)
conf = confusion_matrix(y_test, pred_labels)

# compare the predicted and test labels
compare = zip(np.array(y_test), pred_labels)

df_compare = pd.DataFrame(compare, columns=['truths', 'predicted'])
# print(df_compare)


print('training accuracy, original pca: ', clf.score(X_train, y_train))
print('testing accuracy, original pca: ', clf.score(X_test, y_test))

training accuracy, original pca:  0.7367556789302889
testing accuracy, original pca:  0.08502024291497975


In [182]:
print("counter", Counter(y_train))
print("counter", Counter(pred_labels))
print("counter", Counter(labels))
print(df_compare)


counter Counter({'SNIa': 19004, 'SNII': 16874, 'SNIbc': 15276})
counter Counter({'SNII': 140, 'SNIbc': 106, 'SNIa': 1})
counter Counter({'SNIa': 183, 'SNII': 53, 'SNIbc': 11})
    truths predicted
0     SNIa     SNIbc
1     SNII     SNIbc
2     SNIa      SNII
3     SNIa     SNIbc
4     SNII      SNII
..     ...       ...
242   SNII     SNIbc
243   SNIa      SNII
244   SNIa      SNII
245   SNIa     SNIbc
246   SNIa     SNIbc

[247 rows x 2 columns]
