In [6]:
# library preparation
import pandas as pd
import numpy as np
import time
import os

In [44]:
# Custom I/O functions
def read_labels(filename):
    """
    Load the label table that records whether each sample contains
    SAS, and return the labels ordered by sample index.

    Input:
        filename: file path to labels.csv; the file must contain an
        "indices" column, which is used for ordering and then discarded
    Output:
        a 2-D numpy array of the remaining column(s) (e.g. "Yes" vs "No"),
        with rows sorted by "indices"
    """

    label_frame = pd.read_csv(filename)

    # put the rows in sample order before throwing the index column away
    ordered = label_frame.sort_values(by="indices")
    labels_only = ordered.drop(columns="indices")

    return labels_only.to_numpy()

def read_samples(files, filePath):
    """
    Read in the output files from SA.wrapper.r and stack them into a
    single numpy array (one sample per file). Nothing is written to
    disk here; saving the result is left to the caller.
    
    Input:
        files: an iterable of names that indicate which output
        files to read; it must yield at least one name
        
        filePath: path to the folder containing all output files
        
    Output:
        a numpy ndarray of shape (# of files, rows, columns), where
        (rows, columns) is the shape of each individual output file

    Raises:
        ValueError: if `files` is empty, or if the output files do not
        all have the same shape
    """

    iterator = iter(files)

    # grab the first output file separately so that an empty `files`
    # produces a clear error instead of a bare StopIteration
    try:
        firstFile = next(iterator)
    except StopIteration:
        raise ValueError("read_samples needs at least one file name") from None
    fileStart = os.path.join(filePath, firstFile)
    samples = [pd.read_table(fileStart, header=None).to_numpy()]

    # Collect every sample in a list and stack once at the end; calling
    # np.concatenate inside the loop would re-copy the growing array for
    # every file (quadratic in the number of files).
    # Note: THIS COULD TAKE A WHILE! (only run once)
    start = time.time()
    for fileName in iterator:
        fullPath = os.path.join(filePath, fileName)
        samples.append(pd.read_table(fullPath, header=None).to_numpy())

    # np.stack adds the leading "sample" axis and raises ValueError if a
    # file has a different shape, rather than silently misaligning rows
    X = np.stack(samples, axis=0)
    end = time.time()
    print('Time taken to concatenate files:', end - start)

    return X

def clean_data(X, y, threshold):
    """
    Ensure no element is nan or infinity, otherwise scikit-learn 
    will not like that.
    
    If the number of samples containing nan or infinity is below a
    threshold fraction of the total samples in X, those samples are
    dropped from both X and y. Otherwise nothing is dropped and the
    inputs are returned unchanged.
    
    Input: 
        X: a three dimensional numpy array
            + Assume first dimension = # of samples
            + Assume second dimension = # of rows in each sample
            + Assume third dimension = # of columns in each sample
        y: the corresponding label values for X (same first dimension)
        threshold: fraction of the total number of samples (e.g. 0.01
            for 1%); cleaning only happens when the number of bad
            samples is strictly less than threshold * # of samples
        
    Output:
        X_clean: X cleaned if missing data is below threshold, otherwise X
        y_clean: y cleaned if missing data is below threshold, otherwise y
        samples_clean: number of samples left after cleaning (max is X.shape[0])
    """

    assert len(X.shape) == 3
    assert y.shape[0] == X.shape[0]
    samples = X.shape[0]
    print("Dected", samples, "samples total")

    # Compute each mask once and reduce it along the other axes to find
    # the affected samples (axis 0) and attributes/columns (axis 2).
    # .tolist() turns the indices into plain Python ints so the printed
    # sets/lists look the same regardless of the numpy version.
    nan_mask = np.isnan(X)
    samples_with_nans = set()
    if nan_mask.any():
        print("We got NaNs, identifying location of NaNs...")
        samples_with_nans = set(np.flatnonzero(nan_mask.any(axis=(1, 2))).tolist())
        columns_with_nans = set(np.flatnonzero(nan_mask.any(axis=(0, 1))).tolist())
        print(f"Attributes with NaNs are: {columns_with_nans}")
        print("There are ", len(samples_with_nans), " Samples with NANs")

    inf_mask = np.isinf(X)
    samples_with_Infs = set()
    if inf_mask.any():
        print("We got Infinities, identifying location of Infs...")
        samples_with_Infs = set(np.flatnonzero(inf_mask.any(axis=(1, 2))).tolist())
        columns_with_Infs = set(np.flatnonzero(inf_mask.any(axis=(0, 1))).tolist())
        print(f"Attributes with Infs are: {columns_with_Infs}")
        print("There are ", len(samples_with_Infs), " Samples with Infs")

    # Attempting to clean the dataset from NaNs and Infs
    bug_sample_l = sorted(samples_with_nans | samples_with_Infs)
    print(f"There are a total of {len(bug_sample_l)} containing Infs or NaNs")
    print("Deleting samples ", bug_sample_l)

    if len(bug_sample_l) < threshold*samples:
        print(f"Less than {threshold*100}% of Samples, dropping...")
        X_clean = np.delete(X, bug_sample_l, axis = 0)
        y_clean = np.delete(y, bug_sample_l, axis = 0)
        samples_clean = samples - len(bug_sample_l)
        print(f"After dropping, we have {samples_clean} samples left")
    else:
        print("Too many Samples to drop, can't clean!")
        # Hand back the inputs untouched, as the docstring promises;
        # without these assignments the return below would raise
        # UnboundLocalError.
        X_clean, y_clean, samples_clean = X, y, samples
        
    return X_clean, y_clean, samples_clean

## Training Data

We first read in the training samples using an iterable of filenames and 'compress' them down to one dimension using `reshape` so that RandomForest can evaluate them correctly.

The training samples consist of a 50-50 split between neutral samples and SAS samples of shape (160, 2) per sample. 
I chose the 50-50 ratio because **class imbalance** is often a big headache in model fitting, and for the sake of training, a 50-50 split will best enable the classifier to capture the signals of SAS. 

The attribute key is as follows:

1. Maximum fst in a 2.5 rho window
2. Mean-squared error for the highest fst peak in a window

`y` will contain the label information on whether sample i has SAS or not. It has shape (# of samples, 1), with one attribute:

1. SAS (Yes or No)

In [4]:
# Load the training labels; read_labels orders the rows by the "indices"
# column of the CSV and then drops that column.
y_train = read_labels("./GenInput/input/label.csv")
print(y_train[:5])  # peek at the first five labels
print(y_train.shape)  # (# of samples, # of label columns)

[['Yes']
 ['Yes']
 ['Yes']
 ['No']
 ['No']]
(10000, 1)


In [41]:
# Lazily generate the names output_1 ... output_10000; a generator costs
# O(1) to create because nothing is materialised until read_samples
# iterates over it.
filePath = "./GenInput/output"
files = (f"output_{i}" for i in range(1, 10001))

X_train = read_samples(files, filePath)
print(X_train.shape)
X_train[0, :10]  # check first 10 rows of first sample

Time taken to concatenate files: 18.96846079826355
(10000, 160, 2)


array([[0.39257336, 0.14608226],
       [0.23921614, 0.0362196 ],
       [0.39257336, 0.1809441 ],
       [0.35295858, 0.15712493],
       [0.35295858, 0.19305147],
       [0.24498803, 0.05261772],
       [0.24498803, 0.0496599 ],
       [0.24498803, 0.0707043 ],
       [0.24498803, 0.05170502],
       [0.24498803, 0.06001001]])

In [46]:
# Drop training samples that contain NaN/Inf, but only when they make up
# less than 1% (threshold=0.01) of all samples
X_clean, y_clean, samples_clean = clean_data(X_train, y_train, threshold=0.01)

Dected 10000 samples total
There are a total of 0 containing Infs or NaNs
Deleting samples  []
Less than 1.0% of Samples, dropping...
After dropping, we have 10000 samples left


In [48]:
# store the result into a compressed numpy archive
# (savez_compressed will not create a missing folder, so make sure
# ./data exists first; exist_ok makes the cell safe to re-run)
os.makedirs('./data', exist_ok=True)
np.savez_compressed('./data/train.npz', 
                    X_train=X_clean, y_train=y_clean)

## Small Test Data Batch

I then asked the simulation for a 1000-sample test set with a 50-50 split that is not used in the model-building process at all, to get an honest estimate of the out-of-sample prediction error.

In [49]:
# Load the test labels; read_labels orders the rows by the "indices"
# column of the CSV and then drops that column.
y_test = read_labels("./GenInputTest/input/label.csv")
print(y_test[:5])  # peek at the first five labels
print(y_test.shape)  # (# of samples, # of label columns)

[['Yes']
 ['No']
 ['Yes']
 ['Yes']
 ['Yes']]
(1000, 1)


In [51]:
# Lazily generate the names output_1 ... output_1000; a generator costs
# O(1) to create because nothing is materialised until read_samples
# iterates over it.
filePath = "./GenInputTest/output"
files = (f"output_{i}" for i in range(1, 1001))

X_test = read_samples(files, filePath)
print(X_test.shape)
X_test[0, :10]  # check first 10 rows of first sample

Time taken to concatenate files: 0.725487470626831
(1000, 160, 2)


array([[0.43241981, 0.26219987],
       [0.43241981, 0.32465513],
       [0.35295858, 0.24039368],
       [0.43241981, 0.31260821],
       [0.43241981, 0.3246583 ],
       [0.43241981, 0.39181164],
       [0.43241981, 0.40586182],
       [0.43241981, 0.39830596],
       [0.43241981, 0.43595325],
       [0.43241981, 0.37849837]])

In [54]:
# Drop test samples that contain NaN/Inf, but only when they make up
# less than 1% (threshold=0.01) of all test samples
X_test_clean, y_test_clean, samples_test = clean_data(
    X_test, y_test, threshold=0.01)
print("\n")
# X and y should still have the same number of samples after cleaning
print(X_test_clean.shape)
print(y_test_clean.shape)

Dected 1000 samples total
There are a total of 0 containing Infs or NaNs
Deleting samples  []
Less than 1.0% of Samples, dropping...
After dropping, we have 1000 samples left


(1000, 160, 2)
(1000, 1)


In [55]:
# store the result into a compressed numpy archive
# (savez_compressed will not create a missing folder, so make sure
# ./data exists first; exist_ok makes the cell safe to re-run)
os.makedirs('./data', exist_ok=True)
np.savez_compressed('./data/test.npz', X_test=X_test_clean, y_test=y_test_clean)