In [1]:
import pandas as pd
import numpy as np
import torch
import json
from normalizer import counter
from torch.utils.data import DataLoader


In [2]:
# The JSON records identify each field by name, while the data tensor is
# indexed by position. magDict maps every field name to its position; the
# order of the names below defines that index (0..32).
magDict = {
    name: index
    for index, name in enumerate((
        'TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ', 'ABSNJZH', 'SAVNCPP',
        'USFLUX', 'TOTFZ', 'MEANPOT', 'EPSZ', 'SHRGT45', 'MEANSHR',
        'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'MEANJZH', 'TOTFY',
        'MEANJZD', 'MEANALP', 'TOTFX', 'EPSY', 'EPSX', 'R_VALUE',
        'RBZ_VALUE', 'RBT_VALUE', 'RBP_VALUE', 'FDIM', 'BZ_FDIM', 'BT_FDIM',
        'BP_FDIM', 'PIL_LEN', 'XR_MAX',
    ))
}

In [None]:
# Get the data from the JSON file, then return it as a tensor of input data and a list of labels
def getDataFromJSON(path="data/train_partition1_data.json", device='cpu', earlyStop=-1):
    """Load flare events from a JSON-lines file into a dense tensor.

    Every line of the file is parsed as a JSON object that maps an event ID to
    a record holding a ``'label'`` (one of ``X``, ``M``, ``C``, ``B``, ``Q``)
    and a ``'values'`` mapping of ``field name -> {time step: measurement}``.
    Field names are translated to tensor positions through ``magDict``.

    Args:
        path: Path of the JSON-lines file to read.
        device: Torch device on which the data tensor is allocated.
        earlyStop: Maximum number of lines to load. A negative value loads
            every line that ``counter`` reported.

    Returns:
        A ``(tnsr, labels, weights)`` tuple:
        ``tnsr`` is a float tensor of shape ``(n, 33, 60)`` (event, field,
        observation). Slots that never appear in the file are 0.
        ``labels`` is a list of integer class indices (X=0, M=1, C=2, B=3, Q=4).
        ``weights`` is the value returned by ``counter(path, earlyStop)``,
        unchanged (presumably per-class event counts -- confirm in normalizer).

    Raises:
        KeyError: On a label or field name that is not in the lookup tables.
        IndexError: If a time step falls outside ``0..59``.
    """
    # This dataset is heavily skewed, so we need the number of each type of flare.
    # Summing those counts also gives the number of lines available.
    weights = counter(path, earlyStop)
    lines = int(np.sum(weights))
    # Number of rows to allocate: the whole file, or the requested prefix.
    length = lines if earlyStop < 0 else min(earlyStop, lines)

    # Dimensions: 0: events, 1: the 33 fields, 2: the 60 observations per field.
    # Zero-initialised so that missing observations are deterministic instead
    # of whatever happened to be in the freshly allocated memory.
    tnsr = torch.zeros((length, 33, 60), device=device)
    labels = []
    flares = {'X': 0, 'M': 1, 'C': 2, 'B': 3, 'Q': 4}

    loaded = 0
    # The context manager closes the file on every exit path, early stop included.
    with open(path) as file:
        for row, line in enumerate(file):
            # Stop at the allocated size. This covers earlyStop (0 included)
            # and a file holding more lines than counter() reported.
            if row >= length:
                break
            if row % 100 == 0:
                print(f'Now loading event {row}/{length}')
            # Each line is a dictionary of {event ID: record}; the ID is not needed.
            # NOTE(review): rows are counted per line, so this assumes exactly
            # one event per line -- confirm against the data files.
            for v in json.loads(line).values():
                labels.append(flares[v['label']])
                # key is a field name from magDict, timeDict its time series.
                for key, timeDict in v['values'].items():
                    location = magDict[key]
                    for timeStamp, measurement in timeDict.items():
                        tnsr[row, location, int(timeStamp)] = measurement
            loaded = row + 1
    # Report the number of lines read (not the index of the last one).
    print(f'{loaded} lines loaded.')
    return tnsr, labels, weights



In [None]:
# This file has 77270 data points. 
# Load at most the first 10000 lines of training partition 1 (earlyStop=10000);
# the %time magic reports how long the load takes.
%time train1, labels1, weights1 = getDataFromJSON(path="data/train_partition1_data.json", earlyStop=10000)

In [5]:
# This file has 93767 data points. 
# Load at most the first 10000 lines of training partition 2 (earlyStop=10000);
# the %time magic reports how long the load takes.
%time train2, labels2, weights2 = getDataFromJSON(path="data/train_partition2_data.json", earlyStop=10000)

Now loading event 0/10000
Now loading event 100/10000
Now loading event 200/10000
Now loading event 300/10000
Now loading event 400/10000
Now loading event 500/10000
Now loading event 600/10000
Now loading event 700/10000
Now loading event 800/10000
Now loading event 900/10000
Now loading event 1000/10000
Now loading event 1100/10000
Now loading event 1200/10000
Now loading event 1300/10000
Now loading event 1400/10000
Now loading event 1500/10000
Now loading event 1600/10000
Now loading event 1700/10000
Now loading event 1800/10000
Now loading event 1900/10000
Now loading event 2000/10000
Now loading event 2100/10000
Now loading event 2200/10000
Now loading event 2300/10000
Now loading event 2400/10000
Now loading event 2500/10000
Now loading event 2600/10000
Now loading event 2700/10000
Now loading event 2800/10000
Now loading event 2900/10000
Now loading event 3000/10000
Now loading event 3100/10000
Now loading event 3200/10000
Now loading event 3300/10000
Now loading event 3400/100

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
# This file has 42986 data points. 
# Load at most the first 10000 lines of training partition 3 (earlyStop=10000);
# the %time magic reports how long the load takes.
%time train3, labels3, weights3 = getDataFromJSON(path="data/train_partition3_data.json", earlyStop=10000)

Now loading event 0/10000
Now loading event 100/10000
Now loading event 200/10000
Now loading event 300/10000
Now loading event 400/10000
Now loading event 500/10000
Now loading event 600/10000
Now loading event 700/10000
Now loading event 800/10000
Now loading event 900/10000
Now loading event 1000/10000
Now loading event 1100/10000
Now loading event 1200/10000
Now loading event 1300/10000
Now loading event 1400/10000
Now loading event 1500/10000
Now loading event 1600/10000
Now loading event 1700/10000
Now loading event 1800/10000
Now loading event 1900/10000
Now loading event 2000/10000
Now loading event 2100/10000
Now loading event 2200/10000
Now loading event 2300/10000
Now loading event 2400/10000
Now loading event 2500/10000
Now loading event 2600/10000
Now loading event 2700/10000
Now loading event 2800/10000
Now loading event 2900/10000
Now loading event 3000/10000
Now loading event 3100/10000
Now loading event 3200/10000
Now loading event 3300/10000
Now loading event 3400/100

In [None]:
# Sanity check: show the series stored for the first event of partition 2 at
# field index 0 ('TOTUSJH' in magDict).
train2[0,0]

In [None]:
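# Define the network. forward() should return raw logits: nn.CrossEntropyLoss (used in train) applies log-softmax itself, so apply nn.Softmax only when probabilities are needed.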
import torch.nn as nn
!pip install skorch
from skorch import NeuralNet

class MyModule(nn.Module):
  """Fully connected classifier with two hidden layers.

  The input is flattened to ``input_size`` features per sample, passed through
  two ReLU hidden layers (with dropout on the input and after the first hidden
  layer) and projected to ``num_classes`` outputs.

  ``forward`` returns raw logits, which is what ``nn.CrossEntropyLoss``
  expects. ``self.smax`` is kept so callers can turn the logits into
  probabilities, e.g. ``model.smax(model(x))``.

  Args:
    hidden_size1: Width of the first hidden layer.
    hidden_size2: Width of the second hidden layer.
    num_classes: Number of output classes.
    drop1: Dropout probability.
    input_size: Number of features per sample after flattening
      (the default 1980 equals 33 * 60).
  """

  def __init__(self, hidden_size1, hidden_size2, num_classes=5, drop1=.5, input_size=1980):
    super().__init__()
    # Remembered so forward() flattens to the configured width, not a literal.
    self.input_size = input_size
    self.layer1 = nn.Linear(input_size, hidden_size1)
    self.layer2 = nn.Linear(hidden_size1, hidden_size2)
    # Must be an attribute: a plain local is never registered as a submodule,
    # so its weights would not be trained and forward() could not reach it.
    self.layerout = nn.Linear(hidden_size2, num_classes)
    #Define a RELU Activation unit
    self.relu = nn.ReLU()
    self.smax = nn.Softmax(dim=1)
    self.drop = nn.Dropout(p=drop1)

  def forward(self, x):
    """Return logits of shape ``(batch, num_classes)`` for the input ``x``."""
    # Flatten every sample into one feature vector.
    y = self.drop(x.reshape(-1, self.input_size))
    y = self.drop(self.relu(self.layer1(y)))
    # ReLU (not softmax) as the hidden activation: squashing the hidden layer
    # onto a probability simplex would discard most of its information.
    y = self.relu(self.layer2(y))
    return self.layerout(y)





In [None]:
def _balanced_accuracy(predictions, targets, num_classes):
    """Return the mean per-class recall over the classes present in ``targets``."""
    recalls = []
    for cls in range(num_classes):
        mask = targets == cls
        # Classes without any sample are skipped so they do not skew the mean.
        if mask.any():
            recalls.append((predictions[mask] == cls).float().mean().item())
    return float(np.mean(recalls)) if recalls else float('nan')


def train(model, inputs, labels, weight, valSets, valLabels, valweight,
          epochs=10, batch=256, lr=0.01, num_workers=4):
    """Train ``model`` with class-weighted cross entropy and validate each epoch.

    Args:
        model: Network returning one row of raw class scores (logits) per sample.
        inputs: Training samples; iterating yields one sample per step.
        labels: Integer class index for every training sample.
        weight: Number of training samples per class. The loss weights each
            class by the inverse of its count so rare classes count more.
        valSets: Validation samples.
        valLabels: Integer class index for every validation sample.
        valweight: Accepted for interface compatibility; currently unused (the
            validation loss uses the training criterion).
        epochs: Number of passes over the training data.
        batch: Training batch size.
        lr: Adam learning rate.
        num_workers: Worker processes used by both DataLoaders.

    Returns:
        dict with one entry per epoch in each of the lists ``'train_loss'``,
        ``'val_loss'``, ``'val_accuracy'`` and ``'val_balanced_accuracy'``.
        Validation entries are NaN when there is no validation data.
    """
    device = next(model.parameters()).device

    # Inverse-frequency class weights. Counts are clamped to 1 so a class with
    # zero samples gets a finite weight instead of inf.
    counts = torch.as_tensor(weight, dtype=torch.float32)
    num_classes = counts.numel()
    lfc = nn.CrossEntropyLoss(weight=(1.0 / counts.clamp(min=1.0)).to(device))
    #ideas
    # 1-(weight/np.sum(weight))
    # .2/weight - this one normalizes so that each class is responsible for 20% of the loss
    # 1/weight - this is a bit naive, but the classes with fewer items are weighted more.

    # Start the dataloader objects
    data = list(zip(inputs, labels))
    val = list(zip(valSets, valLabels))
    loader = DataLoader(data, batch_size=batch, num_workers=num_workers)
    # Validate in roughly four batches.
    # NOTE(review): the original used an undefined `n`; assumed to be len(val).
    valLoader = DataLoader(val, batch_size=max(1, len(val) // 4), num_workers=num_workers)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_balanced_accuracy': []}

    for epoch in range(epochs):
        model.train()
        batchLoss = []
        for xtrain, ytrain in loader:
            xtrain, ytrain = xtrain.to(device), ytrain.to(device)
            output = model(xtrain)
            loss = lfc(output, ytrain)
            opt.zero_grad()
            loss.backward()
            opt.step()
            batchLoss.append(loss.item())
        trainLoss = float(np.mean(batchLoss)) if batchLoss else float('nan')
        print(f'The training loss for epoch {epoch+1}/{epochs} was {trainLoss}')

        model.eval()
        # Validation losses go in their own list so they never mix with the
        # training losses; no_grad avoids building the autograd graph.
        valLoss, predictions, targets = [], [], []
        with torch.no_grad():
            for xval, yval in valLoader:
                xval, yval = xval.to(device), yval.to(device)
                output = model(xval)
                valLoss.append(lfc(output, yval).item())
                # argmax over the class dimension gives one prediction per sample.
                predictions.append(torch.argmax(output, dim=1).cpu())
                targets.append(yval.cpu())

        if targets:
            predictions = torch.cat(predictions)
            targets = torch.cat(targets)
            valLossMean = float(np.mean(valLoss))
            unbalancedAccuracy = (predictions == targets).float().mean().item()
            balancedAccuracy = _balanced_accuracy(predictions, targets, num_classes)
        else:
            valLossMean = unbalancedAccuracy = balancedAccuracy = float('nan')

        print(f'The validation loss for epoch {epoch+1}/{epochs} was {valLossMean}')
        print(f'Validation accuracy: {unbalancedAccuracy:.4f} (unbalanced), '
              f'{balancedAccuracy:.4f} (balanced)')

        history['train_loss'].append(trainLoss)
        history['val_loss'].append(valLossMean)
        history['val_accuracy'].append(unbalancedAccuracy)
        history['val_balanced_accuracy'].append(balancedAccuracy)

    return history
