In [1]:
# library preparation
import pandas as pd
import numpy as np
import time

In [None]:
# Custom I/O functions
def read_labels(filename):
    """
    Reads the labels.csv file identifyin which samples contain SAS or not
    Input: 
        filename: file path to labels.csv
    Output:
        y: a numpy array of labels "Yes" vs "No"
    """
    y = pd.read_csv(filename)
    y = y.sort_values(by="indices").drop("indices", axis=1).to_numpy()

    return y

In [21]:


def read_samples(samples, fileStart, filepath, keep_col=None):
    # read in training data
    # take = 251 # row 1 to row 251 (exclusive), 250 rows
    X_dir_first = "./Generate_Training_Data/output/output_1"
    X = pd.read_table(fileStart).to_numpy()
    if keep_col is not None:
        # keep columns from 1:keep_col
        X = X[:, keep_col]
    rows, columns = X.shape

    # We now concatenate all samples together
    # Note: THIS COULD TAKE A WHILE! (only run once)
    start = time.time()
    for i in range(2, samples+1):
        filename = "./Generate_Training_Data/output/output_" + str(i)
        sample = pd.read_table(filename).to_numpy()[:, keep_col]
        X = np.concatenate((X, sample), axis=0)
    end = time.time()
    print('Time taken to concatenate files:', end - start)

    # reshape to samples, rows, columns
    X = X.reshape(-1, rows, columns)

    return X

def clean_data(X, y, samples):
    """
    House keeping: ensure no element is nan or infinity, otherwise scikit-learn
    will not like that
    X Must be have 3 dimensions ONLY (otherwise indices are not accurate)
    """
    assert len(X.shape) == 3
    assert samples == X.shape[0]

    has_NAN = np.any(np.isnan(X))
    if has_NAN:
        print("We got NaNs, identifying location of NaNs...")
        indices_nan = np.argwhere(np.isnan(X))
        samples_with_nans = set(indices_nan[:, 0])
        columns_with_nans = set(indices_nan[:, 2])
        print(f"Attributes with NaNs are: {columns_with_nans}")
        # print(f"Samples with NaNs are: {samples_with_nans}")
        print(f"There are ", len(samples_with_nans), " Samples with NANs")

    has_Inf = np.any(np.isinf(X))
    if has_Inf:
        print("We got Infinities, identifying location of Infs...")
        indices_Infs = np.argwhere(np.isinf(X))
        samples_with_Infs = set(indices_Infs[:, 0])
        columns_with_Infs = set(indices_Infs[:, 2])
        print(f"Attributes with Infs are: {columns_with_Infs}")
        # print(f"Samples with Infs are: {samples_with_nans}")
        print(f"There are ", len(samples_with_Infs), " Samples with Infs")

    # Attempting to clean the dataset from NaNs and Infs
    bug_samples = set()
    bug_samples.update(samples_with_nans)
    bug_samples.update(samples_with_Infs)
    print(f"There are a total of {len(bug_samples)} containing Infs or NaNs")
    bug_sample_l = list(bug_samples)
    bug_sample_l.sort()
    print("Deleting samples ", bug_sample_l)

    threshold = 0.5
    if len(bug_samples) < threshold*samples:
        print(f"Less than {threshold*100}% of Samples, dropping.")
        X_clean = np.delete(X, list(bug_samples), axis = 0)
        y_clean = np.delete(y, list(bug_samples), axis = 0)
        samples_clean = samples - len(bug_samples)
        print(f"After dropping, we have {samples_clean} samples left")
    else:
        print("Too many Samples to drop, can't clean!")
        
    return X_clean, y_clean, samples_clean

## Training Data

We first read in `samples` training samples and 'compress' down to one dimesnion using `reshape` so that RandomForest can evaluate it correctly

The training samples consist of a 50-50 split betwee neutral samples and SAS samples of shape (460, 15) per sample. 
I choose the 50-50 ratio because **class imabalnce** is often a big headache in model fitting and for the sake of training, a 50-50 split will best enable the classifier to capture the signals of SAS. 

The attribute key is as belows:

1. Genetic diversity on the X (Pi_X)
2. Genetic diversity on the Y (Pi_Y) 
3. Total genetic diversity (Pi_tot)
4. Fst between the X and Y
5. Dxy between the X and Y
6. Da between the X and Y
7. Tajima's D on the X
8. Tajima's D on the Y
9. Tajima's D across all samples
10. Relative density of SNPs on the X
11. Relative density of SNPs on the Y
12. Relative density of SNPS across all samples
13. Average correlation between SNPs on X
14. Average correlation between SNPs on Y
15. Average correlation between SNPs across all samples

`y` will contain label information for if sample i has SAS or not of shape (`samples`, 1) with one aitribute:

1. SAS (Yes or No)

Then, we flatten both `X` and `y` to be of shape (samples, 460*15 = 6900) and (samples,) respectively

## Caution

Attribute 12-15 has been Dropped due to presence of NaNs

In [3]:
samples_train = 10000
y_train = read_labels("./Generate_Training_Data/input/label.csv")
print(y_train[:5])
print(y_train.shape)

[['No']
 ['Yes']
 ['Yes']
 ['Yes']
 ['Yes']]
(10000, 1)


In [4]:
X_train = read_samples(samples_train, 
                      "./Generate_Training_Data/output/output_1", 
                      "./Generate_Training_Data/output/output_", 
                      keep_col=list(range(0, 12)))
print(X_train[0, :, :])
print(X_train.shape)

Time taken to concatenate files: 340.7647044658661
[[4.87912088e-02 8.46244879e-02 4.35141923e-01 ... 2.67642073e-04
  5.44662309e-04 1.58810825e-03]
 [4.55037920e-02 2.80023335e-02 4.64400840e-01 ... 9.81354269e-04
  5.44662309e-04 4.51022742e-03]
 [6.71747607e-02 4.93341360e-02 4.33266278e-01 ... 1.15978232e-03
  8.16993464e-04 3.93850845e-03]
 ...
 [2.55639098e-01 2.42819155e-01 2.94519141e-01 ... 1.15978232e-03
  1.27087872e-03 1.20696227e-03]
 [2.73940345e-01 2.23941739e-01 2.90598291e-01 ... 1.78428049e-03
  1.99709513e-03 1.77868123e-03]
 [1.74978867e-01 1.69601905e-01 1.95907501e-01 ... 1.42742439e-03
  1.54320988e-03 1.65163258e-03]]
(10000, 460, 12)


In [22]:
X_train_clean, y_train_clean, samples_train = clean_data(
    X_train, y_train, samples_train)
print("\n")
print(X_train_clean.shape)
print(y_train_clean.shape)

We got NaNs, identifying location of NaNs...
Attributes with NaNs are: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
There are  3210  Samples with NANs
We got Infinities, identifying location of Infs...
Attributes with Infs are: {7}
There are  731  Samples with Infs
There are a total of 3500 containing Infs or NaNs
Deleting samples  [4, 6, 7, 8, 10, 11, 12, 14, 16, 22, 23, 26, 27, 34, 35, 42, 47, 49, 54, 60, 61, 63, 66, 67, 70, 75, 78, 84, 85, 86, 89, 91, 92, 95, 96, 101, 102, 105, 110, 111, 115, 116, 118, 122, 124, 125, 129, 131, 132, 134, 136, 138, 140, 142, 143, 145, 148, 150, 160, 161, 162, 163, 176, 177, 180, 182, 184, 187, 189, 196, 198, 201, 202, 205, 207, 210, 213, 217, 219, 223, 226, 227, 230, 231, 232, 234, 236, 237, 238, 241, 243, 246, 250, 253, 255, 259, 265, 268, 272, 274, 277, 279, 280, 282, 285, 286, 287, 288, 293, 295, 299, 301, 304, 307, 310, 313, 315, 317, 318, 320, 325, 332, 335, 339, 340, 341, 343, 345, 350, 357, 360, 374, 376, 378, 379, 380, 388, 390, 391, 397, 398, 400, 

After dropping, we have 6500 samples left


(6500, 460, 12)
(6500, 1)


In [6]:
# store the result into a numpy object
np.savez_compressed('./data/train.npz', 
                    X_train=X_train_clean, y_train=y_train_clean)

## Small Test Data Batch

I then asked the simulation for a 80-20 split, 1000 sample test set that's not used in the model building process at all for an earnest attempt at out-of-sample prediction error.

## Caution

Attribute 12-15 has been Dropped due to presence of NaNs

In [7]:
samples_test = 1000
y_test = read_labels("./Generate_Test_Data/input/label.csv")
print(y_test[:5])
print(y_test.shape)

[['Yes']
 ['Yes']
 ['No']
 ['No']
 ['Yes']]
(1000, 1)


In [8]:
X_test = read_samples(samples_test, 
                      "./Generate_Test_Data/output/output_1", 
                      "./Generate_Test_Data/output/output_", 
                      keep_col=list(range(0, 12)))
print(X_test[0, :, :])
print(X_test.shape)

Time taken to concatenate files: 3.7436068058013916
[[0.0782967  0.05133263 0.45040038 ... 0.00115246 0.00069284 0.00391538]
 [0.03285659 0.04828878 0.470051   ... 0.00189818 0.00200154 0.01082169]
 [0.13103352 0.04828986 0.47686226 ... 0.01403295 0.00230947 0.01288814]
 ...
 [0.19491035 0.21330522 0.21085953 ... 0.00101688 0.00107775 0.00103323]
 [0.15855573 0.13472918 0.15447927 ... 0.00074571 0.00061586 0.00076132]
 [0.19194139 0.20391443 0.20721044 ... 0.00067792 0.00092379 0.00081571]]
(1000, 460, 12)


In [9]:
X_test_clean, y_test_clean, samples_test = clean_data(
    X_test, y_test, samples_test)
print("\n")
print(X_test_clean.shape)
print(y_test_clean.shape)

We got NaNs, identifying location of NaNs...
Attributes with NaNs are: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
There are  321  Samples with NANs
We got Infinities, identifying location of Infs...
Attributes with Infs are: {7}
There are  85  Samples with Infs
There are a total of 353 containing Infs or NaNs
Less than 50.0% of Samples, dropping.
After dropping, we have 647 samples left


(647, 460, 12)
(647, 1)


In [10]:
# store the result into a numpy object
np.savez_compressed('./data/test.npz', X_test=X_test_clean, y_test=y_test_clean)

## Large Test Data Batch

I then asked the simulation for a 80-20 split, 10000 sample test set that's not used in the model building process at all for an earnest attempt at out-of-sample prediction error.

## Caution

Attribute 12-15 has been Dropped due to presence of NaNs

In [12]:
samples_test_large = 10000
y_test_large = read_labels("./Generate_Test_More_Data/input/label.csv")
print(y_test_large[:5])
print(y_test_large.shape)

[['No']
 ['Yes']
 ['No']
 ['Yes']
 ['No']]
(10000, 1)


In [13]:
X_test_large = read_samples(samples_test_large, 
                      "./Generate_Test_More_Data/output/output_1", 
                      "./Generate_Test_More_Data/output/output_", 
                      keep_col=list(range(0, 12)))
print(X_test_large[0, :, :])
print(X_test_large.shape)

Time taken to concatenate files: 310.8872334957123
[[7.65567766e-02 0.00000000e+00 4.39238859e-01 ... 6.36435959e-04
  0.00000000e+00 1.89334175e-03]
 [5.92289067e-02 1.03685303e-02 4.56698912e-01 ... 9.54653938e-04
  2.70929287e-04 3.72357210e-03]
 [1.99537305e-02 1.77754354e-02 5.08410945e-01 ... 3.18217979e-04
  5.41858575e-04 4.79646576e-03]
 ...
 [1.86813187e-01 1.45289030e-01 1.74907671e-01 ... 6.36435959e-04
  6.77323219e-04 6.31113916e-04]
 [9.15750916e-02 2.54893036e-01 1.86134853e-01 ... 2.38663484e-04
  5.41858575e-04 3.78668350e-04]
 [2.26059655e-01 1.20163860e-01 1.80647884e-01 ... 5.56881464e-04
  2.70929287e-04 4.41779741e-04]]
(10000, 460, 12)


In [14]:
X_test_large_clean, y_test_large_clean, samples_test_large = clean_data(
    X_test_large, y_test_large, samples_test_large)
print("\n")
print(X_test_large_clean.shape)
print(y_test_large_clean.shape)

We got NaNs, identifying location of NaNs...
Attributes with NaNs are: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
There are  3211  Samples with NANs
We got Infinities, identifying location of Infs...
Attributes with Infs are: {7}
There are  732  Samples with Infs
There are a total of 3501 containing Infs or NaNs
Less than 50.0% of Samples, dropping.
After dropping, we have 6499 samples left


(6499, 460, 12)
(6499, 1)


In [16]:
# store the result into a numpy object
np.savez_compressed('./data/test_large.npz', 
                    X_test_large=X_test_large_clean, 
                    y_test_large=y_test_large_clean)