# In [None]:
import csv
from numpy import genfromtxt
import numpy as np
import pandas as pd
from random import random
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import math
import sklearn.linear_model

# Function to find NaNs in the dataset and replace them with 0
def dataChecker(arr):
    """Replace every NaN in ``arr`` with 0, in place, and return ``arr``.

    Args:
        arr: numeric numpy array (any shape); it is modified in place.

    Returns:
        The same array object that was passed in, with NaNs set to 0.
    """
    # One vectorised mask assignment instead of a Python loop per element;
    # the arrays passed by this script hold on the order of 10^8 cells.
    arr[np.isnan(arr)] = 0
    return arr

# Find max value in the dataset and its index
def maxVal(arr, skipCols=(0, 4, 5, 6, 7, 8)):
    """Find the largest value in ``arr`` outside the skipped columns.

    NaN entries are ignored.  When several cells share the maximum, the
    first one in row-major order is reported.

    Args:
        arr: 2-D numeric array.
        skipCols: column indices that are excluded from the search.

    Returns:
        A tuple ``(indexes, value)``.  ``indexes`` is a float array of
        length 2 holding the row and column of the maximum, and ``value``
        is that element of ``arr``.  If no eligible finite-or-+inf value
        exists (empty array, every column skipped, only NaN),
        ``indexes`` is ``[nan, nan]`` and ``value`` is ``-inf``.
    """
    arr = np.asarray(arr)
    # Start from NaN rather than uninitialised memory so that a "nothing
    # found" result is recognisable.
    indexes = np.full(2, np.nan)

    keepCols = [c for c in range(arr.shape[1]) if c not in skipCols]
    # Indexing with a list copies the data, so masking NaNs below never
    # modifies the caller's array.
    candidates = arr[:, keepCols].astype(float, copy=False)
    if candidates.size == 0:
        return indexes, -np.inf

    # NaN never compares greater than anything; map it to -inf so argmax
    # skips it.  Using -inf (not a fixed number) as the floor keeps the
    # search correct for data of any magnitude.
    candidates[np.isnan(candidates)] = -np.inf
    rowIdx, keptIdx = divmod(int(np.argmax(candidates)), candidates.shape[1])
    if candidates[rowIdx, keptIdx] == -np.inf:
        return indexes, -np.inf

    colIdx = keepCols[keptIdx]
    indexes[0] = rowIdx
    indexes[1] = colIdx
    return indexes, arr[rowIdx, colIdx]

# Find min value in the dataset and its index
def minVal(arr, skipCols=(0, 4, 5, 6, 7, 8)):
    """Find the smallest value in ``arr`` outside the skipped columns.

    NaN entries are ignored.  When several cells share the minimum, the
    first one in row-major order is reported.

    Args:
        arr: 2-D numeric array.
        skipCols: column indices that are excluded from the search.

    Returns:
        A tuple ``(indexes, value)``.  ``indexes`` is a float array of
        length 2 holding the row and column of the minimum, and ``value``
        is that element of ``arr``.  If no eligible finite-or--inf value
        exists (empty array, every column skipped, only NaN),
        ``indexes`` is ``[nan, nan]`` and ``value`` is ``+inf``.
    """
    arr = np.asarray(arr)
    # Start from NaN rather than uninitialised memory so that a "nothing
    # found" result is recognisable.
    indexes = np.full(2, np.nan)

    keepCols = [c for c in range(arr.shape[1]) if c not in skipCols]
    # Indexing with a list copies the data, so masking NaNs below never
    # modifies the caller's array.
    candidates = arr[:, keepCols].astype(float, copy=False)
    if candidates.size == 0:
        return indexes, np.inf

    # NaN never compares less than anything; map it to +inf so argmin
    # skips it.  Using +inf (not a fixed number) as the ceiling keeps the
    # search correct for data of any magnitude.
    candidates[np.isnan(candidates)] = np.inf
    rowIdx, keptIdx = divmod(int(np.argmin(candidates)), candidates.shape[1])
    if candidates[rowIdx, keptIdx] == np.inf:
        return indexes, np.inf

    colIdx = keepCols[keptIdx]
    indexes[0] = rowIdx
    indexes[1] = colIdx
    return indexes, arr[rowIdx, colIdx]

# Scale all values in the array that are the waveform or waveform-dependent to a range
def scaleVals(arrIn, arrOut, minAllowed, maxAllowed, minValue, maxValue,
              skipCols=(0, 4, 5, 6, 7, 8)):
    """Min-max scale the non-skipped columns of ``arrIn`` into ``arrOut``.

    Each value ``x`` in a scaled column becomes
    ``(maxAllowed - minAllowed) * (x - minValue) / (maxValue - minValue)
    + minAllowed``.  Columns listed in ``skipCols`` are copied unchanged.

    Args:
        arrIn: 2-D numeric array to read from (not modified).
        arrOut: 2-D array at least as large as ``arrIn``; written in place.
            Cells outside ``arrIn``'s shape are left untouched.
        minAllowed: lower end of the target range.
        maxAllowed: upper end of the target range.
        minValue: input value that maps to ``minAllowed``.
        maxValue: input value that maps to ``maxAllowed``.
        skipCols: column indices copied through without scaling.

    Returns:
        ``arrOut`` (the same object that was passed in).

    Raises:
        ValueError: if ``maxValue == minValue``, which would otherwise
            divide by zero and fill the output with inf/NaN.
    """
    valueRange = maxValue - minValue
    if valueRange == 0:
        raise ValueError("maxValue and minValue must differ to scale values")

    arrIn = np.asarray(arrIn)
    nRows, nCols = arrIn.shape
    skipMask = np.array([c in skipCols for c in range(nCols)], dtype=bool)

    # A basic-slice view: writes through it land in arrOut itself.
    target = arrOut[:nRows, :nCols]
    target[:, skipMask] = arrIn[:, skipMask]
    # Same operation order as the scalar formula, so results are
    # bit-identical to computing it element by element.
    target[:, ~skipMask] = (
        ((maxAllowed - minAllowed) * (arrIn[:, ~skipMask] - minValue))
        / valueRange
    ) + minAllowed
    return arrOut

# Perform Recursive Feature Elimination to identify the 3 top features
def RFE(arr):
    """Rank features with Recursive Feature Elimination, keeping the top 3.

    Columns 1-8 of ``arr`` are used as the feature matrix and column 0 as
    the target.  A logistic-regression estimator drives the elimination.

    Args:
        arr: 2-D array with the target in column 0 and features in
            columns 1 through 8.

    Returns:
        The fitted selector's ``ranking_`` array, one entry per feature
        column.
    """
    # Imported locally: the file only imports ``sklearn.linear_model``, which
    # does not guarantee that ``sklearn.feature_selection`` is loaded.  The
    # alias keeps scikit-learn's class distinct from this function's own name.
    from sklearn.feature_selection import RFE as _SklearnRFE
    from sklearn.linear_model import LogisticRegression

    # data = X, target = Y
    X = arr[:, 1:9]
    Y = arr[:, 0]

    # Feature extraction.  ``n_features_to_select`` must be passed by keyword
    # in current scikit-learn releases.
    model = LogisticRegression()
    rfeFeatures = _SklearnRFE(model, n_features_to_select=3)
    fit = rfeFeatures.fit(X, Y)
    return fit.ranking_

#  Waveform counts for the three neuron cell types (FS, PT, IT)
valsFS, valsPT, valsIT = 1438775, 319484, 126460

#  Row counts of the per-type arrays (same numbers as the waveform counts)
rows_FS, rows_PT, rows_IT = valsFS, valsPT, valsIT

#  Row index separating training from testing data for each type:
#  the first two thirds (rounded down) train, the remainder tests
sep_FS, sep_PT, sep_IT = [
    (2 * nRows) // 3 for nRows in (rows_FS, rows_PT, rows_IT)
]

#  Create training sets.  All three are allocated with sep_FS rows; for PT
#  and IT only the first sep_PT / sep_IT rows are filled here, and the rest
#  stay uninitialised (np.empty) until the oversampling step fills them.
col = 38
trainArrSize = sep_FS
train_set_FS = np.empty((trainArrSize, col))
train_set_PT_attr = np.empty((trainArrSize, col))
train_set_IT_attr = np.empty((trainArrSize, col))

#  Fill the training sets with the leading two thirds of each class
#  (prior to oversampling).  Slice assignment copies all rows in one step
#  instead of looping over them in Python.
train_set_FS[:sep_FS, :] = FS[:sep_FS, :]
train_set_PT_attr[:sep_PT, :] = PT[:sep_PT, :]
train_set_IT_attr[:sep_IT, :] = IT[:sep_IT, :]

#  Test sets: the remaining rows of each class, copied as float arrays
test_size_FS = valsFS - sep_FS
test_size_PT = valsPT - sep_PT
test_size_IT = valsIT - sep_IT

test_set_FS = np.array(FS[sep_FS:valsFS, :], dtype=float)
test_set_PT = np.array(PT[sep_PT:valsPT, :], dtype=float)
test_set_IT = np.array(IT[sep_IT:valsIT, :], dtype=float)

#  Oversampling the minority classes (PT, IT) with replacement so that each
#  has as many training rows as FS

#  Determine how much to add to PT/IT and size of pre-oversampling array
numAdd_PT = sep_FS - sep_PT
numAdd_IT = sep_FS - sep_IT
trainPTArrSize = sep_PT
trainITArrSize = sep_IT


def _oversampleRows(trainSet, numOrig, numAdd, numAttr=9):
    """Fill rows numOrig .. numOrig+numAdd-1 of trainSet with resampled data.

    Each of the first ``numAttr`` columns is drawn independently from a
    random original row (attribute-wise randomisation); the remaining
    columns (the waveform) are copied together from one random original row.
    trainSet is modified in place.
    """
    if numAdd <= 0:
        return
    if numOrig <= 0:
        raise ValueError("cannot oversample from an empty set of original rows")

    newRows = slice(numOrig, numOrig + numAdd)
    # Source indices are drawn from [0, numOrig) so only original rows are
    # sampled; an inclusive upper bound would allow reading row numOrig,
    # which is not an original sample.
    for attr in range(numAttr):
        srcRows = np.random.randint(0, numOrig, size=numAdd)
        trainSet[newRows, attr] = trainSet[srcRows, attr]
    srcRows = np.random.randint(0, numOrig, size=numAdd)
    trainSet[newRows, numAttr:] = trainSet[srcRows, numAttr:]


# Randomize attribute-wise (_attr) for all features but the waveform,
#           which will be randomized as single unit
_oversampleRows(train_set_PT_attr, trainPTArrSize, numAdd_PT)
_oversampleRows(train_set_IT_attr, trainITArrSize, numAdd_IT)

#  Randomly combine individual training and testing sets into master training and testing sets
def _interleaveRows(parts, limits, width):
    """Merge three row arrays into one by repeatedly picking a source at random.

    ``parts`` holds the three source arrays and ``limits`` how many leading
    rows to take from each.  On every draw one source is chosen uniformly;
    if it still has rows left, its next row (in original order) is appended,
    otherwise the draw is discarded and repeated.  Returns the merged array
    and the list of rows consumed per source.
    """
    total = limits[0] + limits[1] + limits[2]
    merged = np.empty((total, width))
    taken = [0, 0, 0]
    filled = 0
    while filled < total:
        pick = int(random() * 3 + 1) - 1
        if taken[pick] < limits[pick]:
            merged[filled, :] = parts[pick][taken[pick], :]
            taken[pick] += 1
            filled += 1
    return merged, taken


#  Master training set: every class contributes trainArrSize rows
train_set_attr, _trainTaken = _interleaveRows(
    (train_set_FS, train_set_PT_attr, train_set_IT_attr),
    (trainArrSize, trainArrSize, trainArrSize),
    col,
)
indTrain = train_set_attr.shape[0]

#  Master test set: every class contributes all of its held-out rows
test_set, _testTaken = _interleaveRows(
    (test_set_FS, test_set_PT, test_set_IT),
    (test_size_FS, test_size_PT, test_size_IT),
    col,
)
countFS, countPT, countIT = _testTaken
indTest = test_set.shape[0]

# Replace NaNs in each array with 0
train_set_attr = dataChecker(train_set_attr)
test_set = dataChecker(test_set)

# Scaling inputs to 0-1

# Output buffers take their shape from the arrays being scaled, so they stay
# correct if the class sizes or column count ever change.
train_set_attr_scld = np.empty(train_set_attr.shape)
test_set_scld = np.empty(test_set.shape)

# Input range mapped onto [0, 1].
# NOTE(review): presumably the extrema reported by minVal/maxVal on this
# dataset -- confirm before reusing with other data.
minValue = -0.00098502
maxValue = 0.0011485

train_set_attr_scld = scaleVals(train_set_attr, train_set_attr_scld, 0, 1, minValue, maxValue)
test_set_scld = scaleVals(test_set, test_set_scld, 0, 1, minValue, maxValue)

# Save files as a .csv
np.savetxt('train_set_attr_scld.csv', train_set_attr_scld, delimiter = ",")
np.savetxt('test_set_scld.csv', test_set_scld, delimiter = ",")