# In [1]:
#!/usr/bin/python3
import random

# Global constants defining the two class labels - can be changed to SNP/PD etc.
SNP = 'snp'
PD = 'pd'


def print_data(inData, classData, usedLines):
    """ Input:  inData    array of input data (each entry is a list of
                          string fields)
                classData array of classes assigned, parallel to inData
                usedLines array of flags, parallel to inData; a row is
                          printed only when its flag is truthy

    Prints the selected lines, each as the row's fields joined by
    commas followed by a comma and the row's class label
    """
    for rowNum, fields in enumerate(inData):
        if not usedLines[rowNum]:
            continue
        print(",".join(fields) + "," + classData[rowNum])


def balance(inData, classData, minorityClass, minoritySize):
    """ Input:    inData        array of input data
                  classData     array of classes assigned
                  minorityClass class label for the minority class
                  minoritySize  size of the minority class
         Returns: array of flags, one per entry of inData, that is
                  True for every line selected for the balanced
                  dataset

    Performs the actual balancing: every line of the minority class is
    selected, plus up to minoritySize lines drawn at random (without
    replacement) from the remaining lines. If fewer than minoritySize
    other lines exist, all of them are selected.
    """
    # Start by selecting every line that belongs to the minority class.
    usedLines = [classData[idx] == minorityClass
                 for idx in range(len(inData))]

    # Everything not yet selected is a candidate for the random draw.
    candidates = [idx for idx, used in enumerate(usedLines) if not used]

    # Clamp the draw so it can never ask for more lines than exist;
    # an unbounded retry loop would otherwise spin forever in that case.
    nPicks = max(0, min(minoritySize, len(candidates)))
    for idx in random.sample(candidates, nPicks):
        usedLines[idx] = True

    return usedLines


def split_line(line):
    """ Input:   line   Line of input/output data
        Returns:        A two-element list holding the text before the
                        last comma (the features) and the text after
                        it (the class label); a one-element list
                        holding the whole line when it has no comma
    Separates a data-file line into feature data and class label at
    the final comma - can be tweaked as needed
    """
    features, comma, label = line.rpartition(',')
    if not comma:
        # No comma present: the whole line comes back as a single item.
        return [line]
    return [features, label]


def read_csv(file):
    """
    Input:    file       The filename to read
    Returns:  inData     An array with one entry per data line, each a
                         list of that line's fields except the last
              classData  An array of the class labels (the last field
                         of each data line)
              header     The comma-separated fields of the last line
                         starting with '#' (leading '#' removed), or
                         an empty array if there is no such line

    Very simple CSV reader that splits only into feature data and label.
    Blank lines are skipped.
    """
    inData = []
    classData = []
    header = []
    with open(file, "r") as inputFile:
        for rawLine in inputFile:
            line = rawLine.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) hold no data;
                # indexing into them would raise IndexError.
                continue
            if line.startswith('#'):
                # A later '#' line replaces any earlier header.
                header = line[1:].split(",")
            else:
                # Everything before the final comma is feature data,
                # the final field is the class label.
                *input_data, label = line.split(",")
                inData.append(input_data)
                classData.append(label)
    return (inData, classData, header)


def find_minority_class(classData):
    """ Input:    classData  Array of class labels
        Returns:  minClass   The label for the minority class
                  minSize    The number of items in the minority class
                  maxSize    The number of items in the majority class
    Finds information about the imbalance in class sizes. Labels other
    than SNP and PD are not counted; when both classes are the same
    size, SNP is reported as the minority class.
    """
    nPd = 0
    nSnp = 0
    for label in classData:
        if label == PD:
            nPd += 1
        elif label == SNP:
            nSnp += 1

    # SNP outnumbers PD, so PD is the minority class.
    if nSnp > nPd:
        return (PD, nPd, nSnp)

    # Otherwise (including a tie) SNP is treated as the minority class.
    return (SNP, nSnp, nPd)


if __name__ == "__main__":
    # Load the feature rows, their class labels and the header fields.
    inData, classData, header = read_csv(file="E2.csv")

    minorityClass, minoritySize, majoritySize = find_minority_class(classData)

    # With no minority samples there is nothing to balance against, and
    # the fold count below would divide by zero.
    if minoritySize == 0:
        raise SystemExit("Cannot balance: at least one class has no samples")

    # One fold, plus three for every whole multiple by which the
    # majority class outnumbers the minority class.
    nFolds = 1 + 3 * (majoritySize // minoritySize)

    for fold in range(nFolds):
        print("\nFold ", fold)
        # Each header field is printed on its own line.
        for h in header:
            print(h)
        # Every fold draws a fresh random selection of the other lines.
        usedLines = balance(inData, classData, minorityClass, minoritySize)
        print_data(inData, classData, usedLines)


# IOPub data rate exceeded.
# The notebook server will temporarily stop sending output
# to the client in order to avoid crashing it.
# To change this limit, set the config variable
# `--NotebookApp.iopub_data_rate_limit`.
#
# Current values:
# NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
# NotebookApp.rate_limit_window=3.0 (secs)

