In [1]:
import sys
import numpy as np
import pickle
import os

In [2]:
# Switch the working directory. The cached .npy files are saved and loaded with
# relative names (Prefix is the empty string), so they resolve against this directory.
# NOTE(review): hard-coded absolute, user-specific path -- the notebook only runs
# unchanged on a machine where this directory exists.
os.chdir("/home/thlamp/tcga/python_results")

In [3]:
def progress(s):
    """
    Writes the string form of the given value to standard output, without a trailing newline,
    and flushes the stream immediately.

    :param s: The value to show; it is converted with str().
    """
    out = sys.stdout
    out.write(str(s))
    out.flush()


def message(s):
    """
    Writes the string form of the given value to standard output, followed by a newline,
    and flushes the stream immediately.

    :param s: The value to show; it is converted with str().
    """
    out = sys.stdout
    out.write(str(s) + "\n")
    out.flush()

In [4]:
# Module-level configuration. A `global` statement has no effect at module scope
# (names assigned here are already module globals), so plain assignments are used.

# Path of the space-delimited text file that holds the integrated feature matrix.
# NOTE(review): hard-coded absolute, user-specific path.
FEATURE_VECTOR_FILENAME = "/home/thlamp/tcga/bladder_results/raw_data_integrated_matrix.txt"

# String prepended to the file names of the cached .npy files.
Prefix = ""

In [54]:
def _readIdAndColumn(sFilename, iColumn):
    """
    Reads the sample id column (column 0) and one extra column from the space-delimited text file.

    :param sFilename: Path of the text file; its first line is skipped as a header.
    :param iColumn: Zero-based index of the extra column to read.
    :return: A two-column string matrix (id, value), with double quotes removed from the ids.
    """
    # The with-block closes the file even if parsing fails.
    with open(sFilename, "r") as fIn:
        mPairs = np.genfromtxt(fIn, skip_header=1, usecols=(0, iColumn),
                               missing_values=['NA', "na", '-', '--', 'n/a'],
                               dtype=np.dtype("object"), delimiter=' ').astype(str)
    # Remove the double quotes surrounding the ids in the first column.
    mPairs[:, 0] = np.char.replace(mPairs[:, 0], '"', '')
    return mPairs


def initializeFeatureMatrices(bResetFiles=False, bPostProcessing=True, bNormalize=True,
                              bNormalizeLog2Scale=True):
    """
    Initializes the case/instance feature matrices, also creating intermediate files for faster startup.

    The data, label and clinical matrices are first looked up in the cached .npy files. If any of them
    cannot be loaded (or a reset is requested), everything is re-read from FEATURE_VECTOR_FILENAME and
    the cache files are rewritten.

    :param bResetFiles: If True, then reset/recalculate intermediate files. Default: False.
    :param bPostProcessing: If True, then apply post-processing to remove NaNs, etc. Default: True.
    :param bNormalize: If True, then apply normalization to the initial data. Default: True.
    :param bNormalizeLog2Scale: If True, then apply log2 scaling after normalization to the initial data. Default: True.
    :return: A tuple of the form (feature matrix, class labels, sample ids).
    """
    # Column positions inside the text file: column 0 holds the sample id.
    # NOTE(review): the two indices below are hard-coded for the current input file;
    # presumably the first is the class label and the second the tumor stage -- confirm.
    iLabelColumn = 100472
    iClinicalColumn = 100473
    # Cache file for the clinical matrix; without it a cache hit would leave clinicalfile undefined.
    sClinicalCache = Prefix + "patientAndControlDataClinical.mat.npy"

    message("Opening files...")

    try:
        if bResetFiles:
            raise Exception("User requested file reset...")
        message("Trying to load saved data...")

        # allow_pickle=True is passed per call instead of temporarily replacing np.load,
        # so nothing global is left modified if a load fails.
        # NOTE(security): pickled arrays can execute code on load; only load cache files you created.
        datafile = np.load(Prefix + "patientAndControlData.mat.npy", allow_pickle=True)
        labelfile = np.load(Prefix + "patientAndControlDataLabels.mat.npy", allow_pickle=True)
        # A cache written before the clinical file was stored lacks this file; the resulting
        # exception sends us to the text-file branch below, which recreates all cache files.
        clinicalfile = np.load(sClinicalCache, allow_pickle=True)

        message("Trying to load saved data... Done.")
    except Exception as eCur:
        message("Trying to load saved data... Failed:\n%s" % (str(eCur)))
        message("Trying to load saved data from txt...")
        message("Loading labels and ids...")
        labelfile = _readIdAndColumn(FEATURE_VECTOR_FILENAME, iLabelColumn)

        message("This is the label file...")
        message(labelfile)

        message("Splitting features, this is the size of labelfile")
        message(np.shape(labelfile))

        message("Loading labels and ids... Done.")

        # The quotes are stripped from the clinical ids themselves (not copied from labelfile).
        clinicalfile = _readIdAndColumn(FEATURE_VECTOR_FILENAME, iClinicalColumn)

        message("This is the clinical file...")
        message(clinicalfile)

        message("These are the dimensions of the clinical file")
        message(np.shape(clinicalfile))

        datafile = loadPatientAndControlData()
        message("Trying to load saved data from txt... Done.")
        print(datafile[:5,:5])
        # Save the intermediate matrices for quick loading next time.
        saveLoadedData(datafile, labelfile)
        np.save(sClinicalCache, clinicalfile)

    message("Opening files... Done.")

    # Split feature set to features/target field
    mFeatures, vClass, sampleIDs = splitFeatures(clinicalfile, datafile, labelfile)

    # Subset of the feature matrix restricted to the control cases/instances.
    mControlFeatureMatrix = getControlFeatureMatrix(mFeatures, vClass)
    message("1 .This is the shape of the control matrix:")
    message(np.shape(mControlFeatureMatrix))

    if bPostProcessing:
        mFeatures = postProcessFeatures(mFeatures, mControlFeatureMatrix)

    # Update control matrix, taking into account postprocessed data
    # (if post-processing ran, it should no longer contain NaNs).
    mControlFeatureMatrix = getControlFeatureMatrix(mFeatures, vClass)

    message("2 .This is the shape of the control matrix:")
    message(np.shape(mControlFeatureMatrix))

    if bNormalize:
        mFeatures = normalizeDataByControl(mFeatures, mControlFeatureMatrix, bNormalizeLog2Scale)
    return mFeatures, vClass, sampleIDs

In [108]:
# Rebuild the matrices from the raw text file (bResetFiles=True), with post-processing
# and normalization switched off.
# NOTE(review): this cell needs the helper functions (loadPatientAndControlData,
# saveLoadedData, splitFeatures, getControlFeatureMatrix, ...) to be defined first;
# the recorded output below ends with a NameError for getControlFeatureMatrix.
mFeatures, vClass, sampleIDs = initializeFeatureMatrices(bResetFiles=True, bPostProcessing=False, bNormalize=False, bNormalizeLog2Scale=False)

Opening files...
Trying to load saved data... Failed:
User requested file reset...
Trying to load saved data from txt...
Loading labels and ids...
This is the label file...
[['TCGA-2F-A9KO-01A' '1']
 ['TCGA-2F-A9KP-01A' '1']
 ['TCGA-2F-A9KQ-01A' '1']
 ['TCGA-2F-A9KR-01A' '1']
 ['TCGA-2F-A9KT-01A' '1']
 ['TCGA-2F-A9KW-01A' '1']
 ['TCGA-4Z-AA7M-01A' '1']
 ['TCGA-4Z-AA7N-01A' '1']
 ['TCGA-4Z-AA7O-01A' '1']
 ['TCGA-4Z-AA7Q-01A' '1']
 ['TCGA-4Z-AA7R-01A' '1']
 ['TCGA-4Z-AA7S-01A' '1']
 ['TCGA-4Z-AA7W-01A' '1']
 ['TCGA-4Z-AA7Y-01A' '1']
 ['TCGA-4Z-AA80-01A' '1']
 ['TCGA-4Z-AA81-01A' '1']
 ['TCGA-4Z-AA82-01A' '1']
 ['TCGA-4Z-AA83-01A' '1']
 ['TCGA-4Z-AA84-01A' '1']
 ['TCGA-4Z-AA86-01A' '1']
 ['TCGA-4Z-AA87-01A' '1']
 ['TCGA-4Z-AA89-01A' '1']
 ['TCGA-5N-A9KI-01A' '1']
 ['TCGA-5N-A9KM-01A' '1']
 ['TCGA-BL-A0C8-01A' '1']
 ['TCGA-BL-A0C8-01A-1' '1']
 ['TCGA-BL-A0C8-01B' '1']
 ['TCGA-BL-A13I-01A' '1']
 ['TCGA-BL-A13I-01A-1' '1']
 ['TCGA-BL-A13I-01B' '1']
 ['TCGA-BL-A13J-01A' '1']
 ['TCGA-BL-A13J-0

NameError: name 'getControlFeatureMatrix' is not defined

In [43]:
def loadPatientAndControlData():
    """
    Loads and returns the serialized patient and control feature data file as a matrix.

    The features are read from FEATURE_VECTOR_FILENAME (space-delimited, first line skipped as a
    header), taking columns 1 up to and including 100471 as floats.

    :return: the patient and control feature data file as a matrix
    """
    message("Loading features...")
    # The with-block guarantees the file is closed even if parsing raises.
    with open(FEATURE_VECTOR_FILENAME, "r") as fControl:
        # NOTE(review): the column range is hard-coded for the current input file.
        datafile = np.genfromtxt(fControl, skip_header=1, usecols=range(1, 100472),
                                 missing_values=['NA', "na", '-', '--', 'n/a'], delimiter=" ",
                                 dtype=np.dtype("float")
                                 )

    message("This is the datafile...")
    message(datafile)
    message("Loading features... Done.")
    return datafile

In [7]:
def saveLoadedData(datafile, labelfile):
    """
    Stores the feature and label matrices as .npy files in the current working directory,
    so that later runs can load them quickly.

    :param datafile: The matrix containing the feature data.
    :param labelfile: The matrix containing the label data.
    """
    message("Saving data in dir..." + os.getcwd())
    # (file name, matrix) pairs, written in this order; Prefix is prepended to each name.
    lTargets = (
        ("patientAndControlData.mat.npy", datafile),
        ("patientAndControlDataLabels.mat.npy", labelfile),
    )
    for sName, mContent in lTargets:
        np.save(Prefix + sName, mContent)
    message("Saving data... Done.")

## np.nan

In [107]:
def splitFeatures(clinicalfile, datafile, labelfile):
    """
    Extracts class and instance info, returning them as separate matrices, where rows correspond to the same
    case/instance. The tumor stage from the clinical matrix is appended as the last feature column,
    joined on the sample id.

    :param clinicalfile: Two-column matrix (sample id, tumor stage as a numeric string). A stage of 0
        is treated as missing and becomes NaN.
    :param datafile: The 2-D matrix containing the full feature data; row i belongs to row i of labelfile.
    :param labelfile: Two-column matrix (sample id, class label).
    :return: A tuple of the form (matrix of features, matrix of labels, sample ids). The feature matrix
        has one column more than datafile; the extra (last) column is the tumor stage, NaN when the
        stage is missing or the sample id does not occur in clinicalfile.
    :raises ValueError: If labelfile and datafile do not have the same number of rows.
    """
    message("Splitting features...")
    message("Number of features: %d"%(np.size(datafile, 1)))
    message("This is the label file:")
    message(labelfile)
    message("This is the shape of the labelfile: %s" % (str(np.shape(labelfile))))

    iRowCount, iDataCols = np.shape(datafile)
    iLabelRows = np.shape(labelfile)[0]
    # DEBUG LINES
    message("Label file rows: %d\tFeature file rows: %d"%(iLabelRows, iRowCount))
    #############
    if iLabelRows != iRowCount:
        # Rows are matched by position, so a mismatch would silently misalign labels and features.
        raise ValueError("Label file has %d rows but feature file has %d rows"
                         % (iLabelRows, iRowCount))

    # Feature matrix with one extra column (to add tumor stage)
    iFeatCount = iDataCols + 1
    mFeatures = np.zeros((iRowCount, iFeatCount))
    mFeatures[:, :-1] = datafile

    # Tumor stages as floats; 0 marks a missing value and is replaced with NaN.
    vStages = clinicalfile[:, 1].astype(float)
    vStages = np.where(vStages == 0, np.nan, vStages)
    print("Indices for nan")
    print(np.argwhere(np.isnan(vStages)))

    # Join on sample id with a single dictionary lookup per row, instead of one np.select
    # call (with one condition per clinical row) per label row. As before, the first
    # clinical row with a given id wins.
    dStageById = {}
    for sId, fStage in zip(clinicalfile[:, 0], vStages):
        dStageById.setdefault(sId, fStage)
    # Ids missing from the clinical matrix get NaN (np.select used to return its default, 0,
    # which is the very value treated as "missing" above).
    mFeatures[:, -1] = [dStageById.get(sId, np.nan) for sId in labelfile[:, 0]]

    vClass = labelfile[:, 1]
    sampleIDs = labelfile[:, 0]
    print("This is the vClass: ")
    print(vClass)
    # DEBUG LINES
    message("Found classes:\n%s" % (str(vClass)))
    message("Found sample IDs:\n%s" % (str(sampleIDs)))
    #############
    message("Splitfeatures: This is the mFeatures...")
    message(mFeatures)
    message("Splitting features... Done.")

    return mFeatures, vClass, sampleIDs