In [1]:
# select() picks the best `number_of_genes` genes from `data`, ranked by chi-square score,
# then splits the samples into training and testing sets based on test_percentage (default 0.1)
# using a random seed value (default 43)

# it also one-hot encodes the output labels
# returns 4 arrays in the order x_train, y_train, x_test, y_test (each transposed so samples run along the second axis)

#changelog
# v.2.0

# changed from TensorFlow one-hot encoding to sklearn's OneHotEncoder since TensorFlow isn't compatible with all CPUs

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [1]:
def select(data, number_of_genes, test_percentage=0.1, seed=43):
    """Select the top chi-square-scored genes and return a train/test split.

    Parameters
    ----------
    data : table-like (expected to be a pandas DataFrame)
        Must provide a ``Sample_characteristics_ch1`` column whose values are
        'non-infectious illness', 'bacterial' or 'viral'. Every column whose
        name contains neither 'Sample' nor 'ID_REF' is treated as a gene
        expression column and must be convertible to float.
    number_of_genes : int
        How many of the best-scoring genes to keep (passed as ``k`` to
        ``SelectKBest``).
    test_percentage : float, default 0.1
        Passed as ``test_size`` to ``train_test_split``.
    seed : int, default 43
        Passed as ``random_state`` to ``train_test_split``.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        All four arrays are transposed so that samples run along the second
        axis: x arrays are (number_of_genes, samples), y arrays are
        (classes, samples) with one-hot encoded labels.

    Raises
    ------
    KeyError
        If a sample carries a characteristic that is not one of the three
        known infection labels.

    Notes
    -----
    The one-hot columns are inferred from the labels actually present, so a
    class that never occurs in ``data`` gets no column. The chi-square score
    function rejects negative feature values.
    """
    # Map the textual diagnosis onto integer class ids.
    infection = {'non-infectious illness': 0, 'bacterial': 1, 'viral': 2}
    labels = np.array(
        [infection[item] for item in data.Sample_characteristics_ch1],
        dtype=float,
    )

    # Gene columns are everything except the sample metadata / ID columns
    # (the remaining keys are the Affymetrix probe names).
    genes = [gene for gene in data.keys() if not ('Sample' in gene or 'ID_REF' in gene)]
    expression = np.array(data[genes], dtype=float)

    # Drop samples that carry no finite measurement at all. The mask is
    # applied to the labels as well so features and targets stay aligned
    # row-for-row; filtering only the expression matrix would leave the two
    # arrays with different lengths and make the fit below fail.
    has_measurement = np.isfinite(expression).any(axis=1)
    expression = expression[has_measurement]
    labels = labels[has_measurement]

    # Labels have to be one-hot encoded for categorical crossentropy to work.
    ohe = OneHotEncoder()
    characteristics = ohe.fit_transform(labels.reshape(-1, 1)).toarray()

    # Choose the X genes that contribute most to the diagnosis.
    selector = SelectKBest(score_func=chi2, k=number_of_genes)
    fit = selector.fit(expression, characteristics)
    features = fit.transform(expression)

    # Report WHICH genes were picked, best score first.
    featureScores = pd.DataFrame({'Gene': genes, 'Score': fit.scores_})
    print(featureScores.nlargest(number_of_genes, 'Score'))  # print the selected best features

    x_train, x_test, y_train, y_test = train_test_split(
        features,
        characteristics,
        test_size=test_percentage,
        random_state=seed,
    )

    # Transpose to (nx, m): nx = features (or classes), m = samples.
    return x_train.T, y_train.T, x_test.T, y_test.T
