In [1]:
import numpy as np
import pandas as pd
from scipy import stats
# Notebook-wide pandas display settings.
# Show at most 8 rows (enough for the count, mean, std ... max summary rows).
pd.set_option('display.max_rows', 8)
# Show at most 9 columns before pandas elides the middle ones.
pd.set_option('display.max_columns', 9)
# Render floats with six decimal places.
pd.set_option('display.float_format', '{:.6f}'.format)
# Silence the chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the labelled dataset; every column except 'class' is used as a feature.
df = pd.read_csv("Dataset_Github_Labeled.csv")

# Collapse the raw labels into three classes by prefix: High-grade, Low-grade,
# Normal. Rows whose label matches none of the prefixes are left unchanged.
# Whole-column boolean masks with .loc are used instead of a fixed
# range(0, 324) loop with chained assignment (df['class'][i] = ...): this works
# for any number of rows and still writes into df under copy-on-write.
CLASS_LABELS = ('High-grade', 'Low-grade', 'Normal')
for label in CLASS_LABELS:
    df.loc[df['class'].str.startswith(label, na=False), 'class'] = label

# Encode class label y as an integer code (0, 1, or 2 for the three classes).
# The encoder is fitted once on the full label column so that train,
# validation and test share a single mapping; fitting separately per split
# would renumber the classes whenever a split was missing one of them.
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()
lbl_encoder.fit(df['class'])

# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
# Shuffle the rows, then cut 60% train / 20% validation / 20% test.
# The seed is fixed so that a re-run reproduces the same split.
RANDOM_SEED = 42
shuffled = df.sample(frac=1, random_state=RANDOM_SEED)
train_end = int(.6 * len(df))
validation_end = int(.8 * len(df))
training = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]

# x represents attributes, y represents the (encoded) class label
x_train = training.drop(['class'], axis=1)
y_train = lbl_encoder.transform(training['class'])
x_validation = validation.drop(['class'], axis=1)
y_validation = lbl_encoder.transform(validation['class'])
x_test = test.drop(['class'], axis=1)
y_test = lbl_encoder.transform(test['class'])

del training, validation, test, shuffled, df # clear memory of variables not needed

In [3]:
# Preview the training feature matrix (class column already dropped).
x_train

Unnamed: 0,0,1,2,3,...,1363,1364,1365,1366
240,0.185261,0.191619,0.188896,0.186032,...,0.004904,0.003729,0.003020,0.003592
292,0.176679,0.169499,0.158715,0.148734,...,0.088525,0.082285,0.076230,0.072467
297,0.095070,0.094398,0.092879,0.094071,...,0.106292,0.107712,0.078201,0.060033
109,0.087070,0.087987,0.088354,0.089176,...,0.025964,0.028441,0.035309,0.041561
...,...,...,...,...,...,...,...,...,...
51,0.181511,0.178195,0.171685,0.164226,...,0.069583,0.073984,0.079793,0.085796
105,0.158372,0.149979,0.134043,0.112871,...,0.106559,0.112164,0.119199,0.125960
321,0.303267,0.423320,0.521487,0.512847,...,0.062437,0.047654,0.042201,0.038945
67,0.118027,0.141175,0.165823,0.164317,...,0.016706,0.021750,0.028117,0.033643


In [4]:
# Preview the integer-encoded training labels.
y_train

array([2, 2, 2, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 2, 0, 0, 0, 1, 2, 1, 0, 2,
       0, 2, 1, 1, 1, 1, 1, 2, 2, 1, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 2, 0,
       0, 1, 0, 0, 1, 1, 0, 2, 1, 1, 2, 1, 1, 2, 0, 2, 1, 0, 0, 1, 2, 1,
       0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 2, 0, 0, 0,
       1, 2, 0, 0, 1, 1, 1, 2, 0, 0, 2, 1, 2, 2, 0, 2, 0, 2, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 2, 1, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0, 0, 2, 0, 0,
       1, 2, 2, 2, 1, 0, 0, 1, 2, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 2, 2, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0,
       0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 1, 1, 0, 2, 0, 0, 2, 0])

In [5]:
# Single-column example: Welch t-test (equal_var=False) of the encoded labels against feature '0'.
# NOTE(review): this compares the mean of the label codes with the mean of the
# feature values, not the feature's values between classes — confirm intent.
stats.ttest_ind(y_train, x_train['0'], equal_var=False)

Ttest_indResult(statistic=11.541007125051365, pvalue=3.784058400678618e-24)

In [6]:
# Screen every feature column with a Welch t-test (equal_var=False) against the
# encoded training labels and record the columns that pass both thresholds.
# NOTE(review): the test compares the mean of the label codes with the mean of
# the feature values, not the feature's values between classes — confirm that
# this is the intended selection statistic.
P_VALUE_MAX = 0.05  # keep a feature only if its p-value is below 5%
T_SCORE_MIN = 9     # ... and its t-statistic is above this threshold
all_t_tests = []
for col in x_train:
    if col == 'class':  # the label column is not a feature
        continue
    # Index with the column label itself; wrapping it in str() would raise a
    # KeyError for non-string labels.
    t_test = stats.ttest_ind(y_train, x_train[col], equal_var=False)
    if t_test.pvalue < P_VALUE_MAX and t_test.statistic > T_SCORE_MIN:
        all_t_tests.append({'t_score': t_test.statistic, 'feature': col})


In [7]:
# The sign of a t-score is just (first mean - second mean) for
# stats.ttest_ind(first, second).
num_features_to_include = 1230

# Report how many features passed the screen, keep the highest-scoring
# num_features_to_include of them, and report how many remain.
print(len(all_t_tests))
ranked_t_tests = sorted(all_t_tests, key=lambda entry: entry['t_score'], reverse=True)  # largest t-score first
all_t_tests = ranked_t_tests[:num_features_to_include]  # slicing is a no-op when the list is already short enough
print(len(all_t_tests))

1283
1230


In [8]:
# Keep only the selected feature columns of x_train, in ranked order.
# All columns are picked in one indexing operation: inserting them one at a
# time into an empty DataFrame fragments the frame and triggers pandas'
# PerformanceWarning. The .copy() keeps later column additions from touching x_train.
selected_columns = [elem['feature'] for elem in all_t_tests]
df_selected_features = x_train[selected_columns].copy()

In [9]:
# Translate the encoded training labels back into their class names, one entry
# per element of y_train. Codes other than 0, 1 or 2 are skipped.
class_name_by_code = {0: 'High-grade', 1: 'Low-grade', 2: 'Normal'}
aList = [class_name_by_code[code] for code in y_train if code in class_name_by_code]

In [10]:
# Add the decoded class names as the 'class' column of the selected-feature frame.
df_selected_features['class']=aList

In [11]:
# Preview the selected features together with the class column.
df_selected_features

Unnamed: 0,234,235,44,43,...,301,409,300,class
240,-0.016631,-0.008724,-0.029845,-0.042120,...,0.282398,0.262772,0.269717,Normal
292,-0.001235,-0.003980,0.010641,0.009660,...,0.245884,0.258441,0.244822,Normal
297,-0.012433,-0.000666,0.010767,0.009925,...,0.276964,0.272121,0.284743,Normal
109,0.018620,0.015621,0.048160,0.048455,...,0.197527,0.228751,0.201370,High-grade
...,...,...,...,...,...,...,...,...,...
51,0.007468,0.008839,0.003011,-0.001438,...,0.279253,0.285191,0.283804,High-grade
105,0.014513,0.010477,0.036271,0.044282,...,0.229092,0.268506,0.231581,High-grade
321,-0.051233,-0.049310,-0.186476,-0.173585,...,0.344037,0.304974,0.344570,Normal
67,-0.015308,-0.016926,-0.012612,-0.023772,...,0.260186,0.249625,0.262828,High-grade


In [12]:
# Write the selected features plus the class column to CSV, without the row index.
# NOTE(review): this is an absolute, machine-specific path, and the file name
# hard-codes the feature count (1230) independently of num_features_to_include —
# confirm both before running elsewhere or with a different feature count.
output_csv_path = r'C:\Users\R-k-l\AppData\Local\Programs\Python\Python37\Scripts\Capstone\t_test_features\t_test_features_1230.csv'
df_selected_features.to_csv(output_csv_path, index=False)