In [None]:
!pip3 install umap
!pip3 install hdbscan

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3541 sha256=ee2df55ca92dd50ab44239f0b171696b31f62ee9b4653b946965b0eb149fda9a
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1
Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyp

In [None]:
import warnings
# Silence every warning raised through the `warnings` module from here on.
warnings.simplefilter(action='ignore')

import numpy as np
import pandas as pd

import pickle
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt

import umap
import hdbscan
import keras

# Colab-only helper used to mount Google Drive.
from google.colab import drive

# Make the Drive contents available under /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Load every pickled light-curve file found in the "lightcurves" directory.
# full_data[k] holds the object unpickled from file_names[k].
full_data = []
file_names = []

# os.listdir returns entries in arbitrary order; sorting keeps file_names /
# full_data (and everything derived from them) in a stable order between runs.
for filename in sorted(os.listdir("lightcurves")):
  file_names.append(filename)

  # NOTE: unpickling can execute arbitrary code -- only load trusted files.
  # The context manager closes the file handle as soon as the load finishes.
  with open(f"lightcurves/{filename}", "rb") as f:
    data = pickle.load(f)
  full_data.append(data)

In [None]:
# Accumulators filled by the preprocessing loop (one entry per kept light curve):
#   x_data           -> for each time step: time, median passband wavelength, flux, flux error
#   host_galaxy_info -> host galaxy features
#   target           -> target class
x_data, host_galaxy_info, target = [], [], []

In [None]:
import os

# Value each passband label is replaced with before a light curve is stored.
# Original note: median wavelength (in Angstroms) scaled over 10000 for ZTF.
# NOTE(review): ZTF g is usually quoted as the bluer band (~4800 A) and r as the
# redder one (~6200 A), so these two values look swapped. They are left unchanged
# because previously preprocessed data / trained models may depend on them -- confirm.
band_medians = {'r' : 0.4827, 'g' : 0.6223}

# Observation window kept around the trigger, in the units of the 'time' column.
before = 30
after = 70

# The 'time' column is treated as already being relative to the trigger.
trigger_mjd = 0

for filename, data in zip(file_names, full_data):

  # `obj_id` (not `id`) so the builtin is not shadowed for the rest of the notebook.
  for obj_id in list(data.keys()):

    light_curve = data[obj_id]

    cur_meta = [light_curve.meta['redshift'], light_curve.meta['mwebv']] # host gal info, redshift and extinction
    df = pd.DataFrame(np.array(light_curve))

    # Iterative 3-sigma clipping on the flux uncertainty (5 passes).
    for _ in range(5):
      mean = np.mean(df['fluxErr'])
      sigma = np.std(df['fluxErr'])

      # Named lower/upper rather than min/max to keep the builtins usable.
      lower = mean - 3*sigma
      upper = mean + 3*sigma

      df = df[(df['fluxErr'] <= upper) & (df['fluxErr'] >= lower)]

    # Nothing left to scale.
    if df.empty:
      continue

    # Independent copy so the column assignments below do not write into a slice.
    df = df.copy()

    # Peak flux in each passband, unscaled
    peak_r = np.max(df[df['passband'] == 'r']['flux'])
    peak_g = np.max(df[df['passband'] == 'g']['flux'])
    cur_meta.extend([peak_r, peak_g])

    # A constant (or undefined) flux range would turn the scaled errors into inf/NaN.
    flux_range = np.max(df['flux']) - np.min(df['flux'])
    if not flux_range > 0:
      continue

    # Scale the flux errors by the flux range, and the flux itself to [0, 1].
    df['fluxErr'] = df['fluxErr'] / flux_range

    scaler = MinMaxScaler()
    df['flux'] = scaler.fit_transform(np.array(df['flux']).reshape(-1, 1)).flatten()

    # Keep only observations inside the (trigger - before, trigger + after) window.
    in_window = (df['time'] > trigger_mjd - before) & (df['time'] < trigger_mjd + after)
    df = df[in_window]

    # Require at least two observations before the trigger.
    if (df['time'] < trigger_mjd).sum() < 2:
      continue

    df = df.sort_values("time").drop(columns='photflag')

    # Scale time so the window maps onto (0, 1).
    df['time'] = (df['time'] - (trigger_mjd - before)) / (after + before)

    df['passband'] = df['passband'].map(band_medians)

    x_data.append(np.array(df))
    target.append(filename)  # the source file name is used as the class label
    host_galaxy_info.append(cur_meta)

In [None]:
def load(file_name):
    """Unpickle and return the object stored in the file at ``file_name``.

    The file is opened in binary mode and closed before returning.
    Only use this on trusted files: unpickling can run arbitrary code.
    """
    with open(file_name, mode='rb') as handle:
        obj = pickle.load(handle)
    return obj

# Read the preprocessed labels, light curves and host-galaxy features from Drive.
# These assignments replace any `target` / `host_galaxy_info` already in memory.
target, x, host_galaxy_info = (
    load(pickle_path)
    for pickle_path in (
        "/content/drive/My Drive/plasticc_train_data/preprocesseddata/target",
        "/content/drive/My Drive/plasticc_train_data/preprocesseddata/x",
        "/content/drive/My Drive/plasticc_train_data/preprocesseddata/host_galaxy_info",
    )
)


In [None]:
# Sorted unique class labels found in `target`.
classes = np.unique(target)

# Indices (into `classes`) of the anomalous classes; print their labels.
anom_inds = [1, 5, 12, 13, 16]
for anom_class in classes[anom_inds]:
    print(anom_class)

In [None]:
# Record the length of every light curve and flag those with fewer than 10 points.
# `delete` holds the flagged positions in ascending order; removal happens in the next cell.

lengths = [len(val) for val in x]
delete = [ind for ind, n_points in enumerate(lengths) if n_points < 10]

In [None]:
# Remove the flagged entries from all three parallel lists. Walking the indices
# from highest to lowest keeps the remaining indices valid after each deletion.
for idx in reversed(delete):
    del x[idx]
    del target[idx]
    del host_galaxy_info[idx]

In [None]:
# Log of Peak Flux: replace columns 3 and 2 (the per-band peaks) with their natural log.

host_galaxy_info = np.array(host_galaxy_info)

for peak_col in (3, 2):
    host_galaxy_info[:, peak_col] = np.log(host_galaxy_info[:, peak_col])

In [None]:
# Flag objects whose log peak flux (columns 2 and 3) is unusable.
# np.log yields NaN for a negative peak and -inf for a peak of exactly 0; both are
# flagged here, since a non-finite value must not reach the network inputs.

peak_columns = host_galaxy_info[:, 2:4]

# Row indices, in ascending order, where at least one of the two peaks is not finite.
delete = np.flatnonzero(~np.isfinite(peak_columns).all(axis=1)).tolist()

In [None]:
# A list is needed so that rows can be removed with `del`.
host_galaxy_info = list(host_galaxy_info)

# Delete from the highest flagged index downwards so earlier indices stay valid.
for idx in reversed(delete):
    del x[idx]
    del target[idx]
    del host_galaxy_info[idx]

In [None]:
# Divide peaks by 10 for further scaling (columns 2 and 3 hold the log peak fluxes)
host_galaxy_info = np.array(host_galaxy_info)

for peak_col in (2, 3):
    host_galaxy_info[:, peak_col] = host_galaxy_info[:, peak_col] / 10

In [None]:
# Pad for TF masking layer: append all-zero rows so every light curve has
# `ntimesteps` time steps.

ntimesteps = np.max(lengths)

for ind, light_curve in enumerate(x):
  n_missing = ntimesteps - len(light_curve)
  x[ind] = np.pad(light_curve, ((0, n_missing), (0, 0)))

In [None]:
# Split data into the anomalous classes and all remaining classes.

y_data_anom, y_data = [], []
x_data, x_data_anom = [], []
host_gal_anom, host_gal = [], []

anom_classes = [classes[i] for i in anom_inds]

for i, label in enumerate(target):
    # ignore AGNS
    if label == 'lc_classnum_AGN_old.pickle':
        continue

    # Pick the trio of lists this object belongs to.
    if label in anom_classes:
        x_bucket, y_bucket, host_bucket = x_data_anom, y_data_anom, host_gal_anom
    else:
        x_bucket, y_bucket, host_bucket = x_data, y_data, host_gal

    x_bucket.append(x[i])
    y_bucket.append(label)
    host_bucket.append(host_galaxy_info[i])



In [None]:
# One-hot Encoding of the class labels of the non-anomalous set

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

# The encoder output is densified with .toarray(), which gives a plain ndarray;
# .todense() would instead produce the legacy np.matrix type.
y_data = enc.fit_transform(np.array(y_data).reshape(-1, 1)).toarray()

In [None]:
# Train-test split: hold out 10% for testing, then 12.5% of what remains for validation.

X_train, X_test, host_gal_train, host_gal_test, y_train, y_test = train_test_split(
    x_data, host_gal, y_data,
    test_size = 0.1,
    random_state = 40,
)

X_train, X_val, host_gal_train, host_gal_val, y_train, y_val = train_test_split(
    X_train, host_gal_train, y_train,
    test_size = 0.125,
    random_state = 40,
)

In [None]:
# Inverse-frequency class weights:
#   class_weights[c] = (number of training samples) / (number of training samples of class c)
n_train = len(y_train)

# Class index of every one-hot row, then how often each class occurs.
train_labels = np.argmax(np.asarray(y_train), axis=1)
class_counts = np.bincount(train_labels, minlength=y_train.shape[1])

# A class with no training samples gets weight 0.0 instead of raising
# ZeroDivisionError; it never contributes to the training loss anyway.
class_weights = {
    class_idx: (n_train / int(count) if count > 0 else 0.0)
    for class_idx, count in enumerate(class_counts)
}


In [None]:
# Model
# Two-input classifier: a recurrent branch over the padded light curves and a dense
# branch over the host-galaxy features, concatenated before the softmax output.

import keras
from keras.layers import Input, LSTM, TimeDistributed, Dense, Masking, concatenate, GRU


num_classes = len(class_weights)  # one class_weights entry per class
n_features = 4  # size of the last axis of the light-curve input

input_1 = Input((ntimesteps, n_features))  # X.shape = (Nobjects, Ntimesteps, 4)

# Masks on the value 0., which is what the padding cell fills the extra rows with.
masking_input1 = Masking(mask_value=0.)(input_1)

# NOTE: despite the `lstm*` names, both recurrent layers are GRUs.
# The first returns the full sequence, the second only its final output.
lstm1 = GRU(100, return_sequences=True, activation='tanh', recurrent_activation='hard_sigmoid')(masking_input1)
lstm2 = GRU(100, return_sequences=False, activation='relu', recurrent_activation='hard_sigmoid')(lstm1)

dense1 = Dense(100, activation='relu')(lstm2)

# Second input: one vector per object, as wide as a row of host_galaxy_info.
input_2 = Input(shape = (len(host_galaxy_info[0]), ))

# No activation is passed here, unlike the other hidden Dense layers.
dense2 = Dense(30)(input_2)

merge1 = concatenate([dense1, dense2])

dense3 = Dense(100, activation='relu')(merge1)

# 9-unit layer right before the output.
# NOTE(review): presumably intended as a low-dimensional latent representation -- confirm.
dense4 = Dense(9, activation='relu')(dense3)

output = Dense(num_classes, activation='softmax')(dense4)

model = keras.Model(inputs=[input_1, input_2], outputs=output)
model.summary()

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(loss = "categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

In [None]:
# Convert the training and validation splits to NumPy arrays before fitting.
X_train, X_val = np.array(X_train), np.array(X_val)
y_train, y_val = np.array(y_train), np.array(y_val)
host_gal_train, host_gal_val = np.array(host_gal_train), np.array(host_gal_val)

In [None]:
# Train the classifier. Interrupting the kernel (KeyboardInterrupt) stops training
# early without discarding the weights learned so far.
save_path = "trained_model"

try:
    model.fit(
        x = [X_train, host_gal_train],
        y = y_train,
        epochs=100,
        batch_size = 128,
        class_weight = class_weights,
        validation_data=([X_val, host_gal_val], y_val),
    )
except KeyboardInterrupt:
    print("Training interrupted; saving the model trained so far.")

# Save after a completed run as well as after an interrupt: previously this cell only
# wrote the model to disk when training was interrupted. Any other exception raised by
# fit() still propagates and skips the save.
model.save(save_path)
print('Output saved to: "{}/*"'.format(save_path))