In [1]:
import numpy as np
import pandas as pd
from scipy.stats import qmc

In [2]:
### Load the CSV file holding the 965-gene dataset used by the AD classifier
myDF = pd.read_csv("data/AD_sort_by_AD_over_NCI_v3_pop.csv")

### Basic simulation parameters:
###   numVars - number of genes (one "Coef" entry per gene)
###   numRuns - number of scores to generate for each group in the simulation
numRuns = 1_000_000
numVars = myDF["Coef"].shape[0]

In [3]:
### Mean TPM count of each gene in the AD and NCI groups.
### Each row of the "AD average TPM" / "NCI average TPM" columns holds the average TPM count of one gene
### for the group with Alzheimer's and for the control group, respectively.
ADmeans = myDF.loc[:, "AD average TPM"]
Ctrlmeans = myDF.loc[:, "NCI average TPM"]

### Relative uncertainty in the average TPM count of every gene, shared by both groups so they cannot drift apart.
### Use 0.1 for 10% uncertainty and 0.2 for 20% uncertainty.
tpmUncertainty = 0.1

### Standard deviation of the TPM count of each gene in both groups (a fixed fraction of the gene's mean)
ADstds = [tpmUncertainty * val for val in ADmeans]
Ctrlstds = [tpmUncertainty * val for val in Ctrlmeans]

### NumPy array of the coefficients that define the score function (a linear combination)
coefficients = np.array(myDF.loc[:, "Coef"])

In [None]:
### The Simulation function generates a score from linear combination function based on mean and standard deviation for each gene
### To improve speed, the score generation function was changed from what it was in previous demos.
### It now works on NumPy arrays and vectorized sampling instead of Python lists with for loops, because NumPy arrays
### tend to be more memory-efficient and faster. Using this technique greatly improved runtime.

### qmc.LatinHypercube(d=1).random(n=numVars) draws numVars random numbers in [0, 1) by Latin hypercube sampling, one per gene.
### Each number is then rescaled to the interval [mean - std, mean + std] given by the mean-standard deviation pair of its gene,
### so every gene gets one simulated TPM value.

### Then this numpy array is multiplied by the coefficient numpy array and all of the elements in the resulting array 
### are added together to generate the final score.
def Simulation(means, stds, coefficients):
    """Generate one simulated score as a linear combination of sampled gene values.

    One value per gene is drawn by Latin hypercube sampling and rescaled to the
    interval [mean - std, mean + std] of that gene. The score is the sum of
    coefficient * sampled value over all genes.

    Parameters
    ----------
    means : array-like, shape (n,)
        Mean value of each gene (pandas Series, list or NumPy array).
    stds : array-like, shape (n,)
        Half-width of the sampling interval of each gene.
    coefficients : array-like, shape (n,)
        Coefficients of the linear score function.

    Returns
    -------
    numpy.float64
        The simulated score (0.0 when there are no genes).

    Raises
    ------
    ValueError
        If the three inputs are not one-dimensional with the same length.
    """
    means = np.asarray(means, dtype=float)
    stds = np.asarray(stds, dtype=float)
    coefficients = np.asarray(coefficients, dtype=float)

    if means.ndim != 1 or not (means.shape == stds.shape == coefficients.shape):
        raise ValueError(
            "means, stds and coefficients must be 1-D and of equal length; got shapes "
            f"{means.shape}, {stds.shape} and {coefficients.shape}"
        )

    numGenes = coefficients.size
    if numGenes == 0:
        return np.float64(0.0)

    # One stratified draw in [0, 1) per gene. random() returns shape (n, 1), so it
    # is flattened to (n,) to line up element-wise with the per-gene bounds.
    sampler = qmc.LatinHypercube(d=1)
    unitSample = sampler.random(n=numGenes).ravel()

    # qmc.scale cannot be used here: it expects one bound per sample *dimension*
    # (d == 1), not one per gene, and it rejects zero-width intervals (std == 0).
    # The affine rescaling is therefore done directly, gene by gene.
    l_bounds = means - stds
    u_bounds = means + stds
    sims = l_bounds + unitSample * (u_bounds - l_bounds)

    return np.sum(np.multiply(coefficients, sims))