Load Dataset

In [1]:
from utils import load_dataset

# Locations of the two input files (relative paths, resolved against the
# notebook's working directory).
normalFilePath = '../data/BC-TCGA-Normal.txt'
tumorFilePath = '../data/BC-TCGA-Tumor.txt'

# load_dataset returns three values: the combined sample collection and two
# index collections. NOTE(review): presumably the index collections give the
# positions inside `samples` that belong to the normal and tumor classes —
# confirm against utils.load_dataset.
samples, normalSampleIndexes, tumorSampleIndexes = load_dataset(normalFilePath, tumorFilePath)

Data preprocessing

In [2]:
from utils import zScoreNormalization

# The return value is not captured, so this presumably normalizes `samples`
# in place — TODO confirm in utils.zScoreNormalization. If so, re-running this
# cell without re-running the load cell normalizes already-normalized data.
zScoreNormalization(samples)

Prepare training and testing datasets

In [3]:
import pandas as pd
from utils import getTraningAndTestingSamples, getClassificationErrorSamples

# Requested number of samples per class for each split.
numberOfNormalTrainingSamples = 30
numberOfTumorTrainingSamples = 30

numberOfNormalTestingSamples = 30
numberOfTumorTestingSamples = 30

# Build the training / testing subsets for the normal class.
normalTrainingSamples, normalTestingSamples = getTraningAndTestingSamples(samples, normalSampleIndexes,
                                                                          numberOfNormalTrainingSamples,
                                                                          numberOfNormalTestingSamples)
# Build the training / testing subsets for the tumor class.
tumorTrainingSamples, tumorTestingSamples = getTraningAndTestingSamples(samples, tumorSampleIndexes,
                                                                        numberOfTumorTrainingSamples,
                                                                        numberOfTumorTestingSamples)
# Concatenate the two classes: normal samples first, tumor samples second.
# The label vectors below rely on exactly this ordering.
trainingSamples = normalTrainingSamples + tumorTrainingSamples
testingSamples = normalTestingSamples + tumorTestingSamples
# Labels: 0 = normal, 1 = tumor. They are sized from the lists that were
# actually returned rather than from the requested counts, so labels and
# samples stay aligned even if the helper returns fewer samples than asked for.
trainingSampleLabels = [0] * len(normalTrainingSamples) + [1] * len(tumorTrainingSamples)
testingSampleLabels = [0] * len(normalTestingSamples) + [1] * len(tumorTestingSamples)

# DataFrame views of the two splits (not referenced by the cells visible here).
trainingSamplesDF = pd.DataFrame(data=trainingSamples)
testingSamplesDF = pd.DataFrame(data=testingSamples)


1. No feature selection

In [4]:
# Baseline: classify using every original feature, with no reduction step.
# getClassificationErrorSamples returns the training error count followed by
# the four confusion-matrix counts for the testing set.
trainingErrorSamples, tp, fp, fn, tn = getClassificationErrorSamples(
    trainingSamples, trainingSampleLabels,
    testingSamples, testingSampleLabels)

# Each derived value is computed immediately before it is printed so the
# report is emitted in the same order as before.
baselineFeatureCount = len(testingSamples[0])
print('[Result] No of Features: {}'.format(repr(baselineFeatureCount)))
print('[Result] Training Error Samples: {}'.format(repr(trainingErrorSamples)))
baselineTestingErrors = fp + fn
print('[Result] Testing Error Samples: {}'.format(repr(baselineTestingErrors)))
print('[Result] TP = {}, FP = {}, FN = {}, TN = {}'.format(*map(repr, (tp, fp, fn, tn))))
baselineAccuracy = (tp + tn) / (tp + tn + fp + fn)
print('[Result] Accuracy = {}'.format(repr(baselineAccuracy)))

[Result] No of Features: 17814
[Result] Training Error Samples: 0
[Result] Testing Error Samples: 1
[Result] TP = 29, FP = 0, FN = 1, TN = 30
[Result] Accuracy = 0.9833333333333333


2. Random Projection + Principal Component Analysis

In [5]:
from sklearn import random_projection
from sklearn.decomposition import PCA

targetDimensionality = 50
# SparseRandomProjection draws its projection matrix at random when it is
# fitted. Without a fixed seed every run of this cell produces different
# features and therefore different metrics, so the seed is pinned here.
randomSeed = 0

# Stage 1: random projection from the original feature space down to
# targetDimensionality columns.
rpTransformer = random_projection.SparseRandomProjection(n_components=targetDimensionality,
                                                         random_state=randomSeed)
# Stage 2: PCA on the projected data. It is asked for the same number of
# components that stage 1 outputs, so it does not reduce dimensionality any
# further. random_state only matters if a randomized SVD solver is selected.
pcaTransformer = PCA(n_components=targetDimensionality, random_state=randomSeed)

# Fit both transformers on the training data only.
trainingSamples1 = rpTransformer.fit_transform(trainingSamples)
trainingSamples1 = pcaTransformer.fit_transform(trainingSamples1)
# Apply the already-fitted transformers to the testing data (transform, not
# fit_transform, so nothing is learned from the testing set).
testingSamples1 = rpTransformer.transform(testingSamples)
testingSamples1 = pcaTransformer.transform(testingSamples1)

trainingErrorSamples, tp, fp, fn, tn = getClassificationErrorSamples(trainingSamples1, trainingSampleLabels,
                                                                     testingSamples1, testingSampleLabels)

print('[Result] No of Features: {}'.format(repr(len(testingSamples1[0]))))
print('[Result] Training Error Samples: {}'.format(repr(trainingErrorSamples)))
print('[Result] Testing Error Samples: {}'.format(repr(fp + fn)))
print('[Result] TP = {}, FP = {}, FN = {}, TN = {}'.format(repr(tp), repr(fp), repr(fn), repr(tn)))
print('[Result] Accuracy = {}'.format(repr((tp+tn)/(tp+tn+fp+fn))))

[Result] No of Features: 50
[Result] Training Error Samples: 0
[Result] Testing Error Samples: 1
[Result] TP = 29, FP = 0, FN = 1, TN = 30
[Result] Accuracy = 0.9833333333333333
