In [1]:
import numpy as np
import pandas as pd

# Input data: one 4-mer feature table per score class.
# The classes differ in size, so the tables are kept apart until the
# train/test split rather than being sampled from one pooled set.
infile_pos = 'score-positive.4mer.features.csv'
infile_neg = 'score-negative.4mer.features.csv'
infile_zero = 'score-zero.4mer.features.csv'

# The first line of every file is the column header.
raw_pos, raw_neg, raw_zero = (
    pd.read_csv(path, header=0)
    for path in (infile_pos, infile_neg, infile_zero)
)

# Preview the positive class.
# NOTE(review): in this preview every k-mer count grows from one row to the
# next; check whether the feature files hold cumulative counts.
raw_pos.head()

Unnamed: 0,label,seqname,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,1,ENSG00000261061,31,5,2,13,6,5,0,5,...,0,2,5,5,2,4,2,3,4,7
1,1,ENSG00000254837,87,19,28,42,31,9,1,15,...,1,15,13,13,6,14,17,11,10,30
2,1,ENSG00000282851,105,29,43,51,48,23,2,23,...,3,21,21,20,14,27,28,23,22,62
3,1,ENSG00000255650,108,32,51,55,55,25,3,26,...,3,24,21,21,19,30,28,28,25,83
4,1,ENSG00000163597,133,47,74,80,65,39,8,42,...,6,45,37,32,37,61,66,55,51,169


In [2]:
# Every table should have 4^4 = 256 k-mer columns plus label plus seqname.
tuple(frame.shape for frame in (raw_pos, raw_neg, raw_zero))

((170, 258), (577, 258), (405, 258))

In [3]:
# The data is imbalanced (170/577/405 rows when this was first run).
# What is the probability of correct classification by random?
# A guess drawn from the class distribution is right with probability equal
# to the sum of the squared class proportions.
# The class sizes are read from the loaded tables so the baseline always
# matches the data instead of relying on hand-copied numbers.
class_sizes = [len(raw_pos), len(raw_neg), len(raw_zero)]
total = sum(class_sizes)
prob_random_guess_is_correct = 0.0
for class_size in class_sizes:
    prob_random_guess_is_correct += pow(class_size / total, 2)
prob_random_guess_is_correct

0.3962417414158951

In [4]:
def split_train_test(dataset, train_portion, random_state=42):
    """Shuffle a data frame and cut it into a training part and a test part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to split. The frame itself is not modified.
    train_portion : float
        Fraction of the rows (0 to 1 inclusive) assigned to the training part.
    random_state : int, optional
        Seed handed to ``DataFrame.sample``. The default of 42 keeps the
        split repeatable from run to run.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``. Together they contain every row of
        ``dataset`` exactly once, in shuffled order.

    Raises
    ------
    ValueError
        If ``train_portion`` is not between 0 and 1.
    """
    # A portion outside [0, 1] would silently yield an empty or full split.
    if not 0 <= train_portion <= 1:
        raise ValueError(
            "train_portion must be between 0 and 1, got %r" % (train_portion,)
        )
    middle = int(len(dataset) * train_portion)
    # sample(frac=1) returns every row once in random order (sampling without
    # replacement); the column header stays in place.
    shuffled = dataset.sample(frac=1, random_state=random_state)
    # Positional slicing: the first `middle` shuffled rows train, the rest test.
    train_set = shuffled.iloc[:middle]
    test_set = shuffled.iloc[middle:]
    return train_set, test_set
# Fraction of each class that goes to the training set; the rest is held out.
TRAIN_PORTION = 0.8

# Split every class on its own so both partitions keep the class proportions.
(train_pos, test_pos), (train_neg, test_neg), (train_zero, test_zero) = (
    split_train_test(class_frame, TRAIN_PORTION)
    for class_frame in (raw_pos, raw_neg, raw_zero)
)

# Sanity check on one class: full size, training size, test size.
raw_pos.shape, train_pos.shape, test_pos.shape

((170, 258), (136, 258), (34, 258))

In [5]:
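# The Geron book recommends this (or StratifiedShuffleSplit),
# but we could not make it work. Likely causes: the draft below refers to
# rawpos (the frame is named raw_pos) and selects rows with [] instead of .iloc[].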

#from sklearn.model_selection import ShuffleSplit
#splitter=ShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
#for train_index,test_index in splitter.split(rawpos):
#    train_pos = rawpos[train_index]
#    test_pos = rawpos[test_index]

In [6]:
# Combine and shuffle.
# Stack the per-class partitions (still grouped by class), then shuffle the
# rows so the classes are interleaved. The row labels of the per-class frames
# are kept, so the same label can appear once per class.
train_sorted = pd.concat((train_pos, train_neg, train_zero))
train_set = train_sorted.sample(frac=1.0, random_state=17)

test_sorted = pd.concat((test_pos, test_neg, test_zero))
test_set = test_sorted.sample(frac=1.0, random_state=17)

train_set.shape, test_set.shape

((921, 258), (231, 258))

In [7]:
# Separate the target, the sequence IDs, and the k-mer feature matrix.
y_train = train_set['label']
X_train_ID = train_set['seqname']
X_train = train_set.drop(columns=['label', 'seqname'])
# Validators: only the last expression of a cell is displayed, so move the
# one you want to inspect to the end. Try them all to ensure we have what we need.
X_train.shape
X_train_ID.head()
X_train.head()
y_train.unique()

array([ 0, -1,  1])

In [8]:
# Feature scaling, first attempt.
# The aim is to turn k-mer counts into k-mer frequencies.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit(X_train).transform(X_train)
df = pd.DataFrame(data=scaled, columns=X_train.columns)
# This runs, but it standardizes each column (feature), which is not what we
# want; the row-wise normalization comes next.

In [9]:
# Goal: divide every count by the total of its own row.
# Row sums before normalization.
X_train.head()
X_train.sum(axis='columns')

165     363946
61      149219
505    1239798
195     455718
28       68188
        ...   
37       95573
220     516018
29       84581
421    1066593
84      199374
Length: 921, dtype: int64

In [10]:
# https://stackoverflow.com/questions/35678874/normalize-rows-of-pandas-data-frame-by-their-sums/35679163
# Divide each row by its own total so that every row sums to 1.
row_totals = X_train.sum(axis='columns')
X_norm = X_train.div(row_totals, axis='index')
# Row sums after normalization.
X_norm.sum(axis='columns')

165    1.0
61     1.0
505    1.0
195    1.0
28     1.0
      ... 
37     1.0
220    1.0
29     1.0
421    1.0
84     1.0
Length: 921, dtype: float64

In [None]:
# TO DO Try min-max scaling and z-score scaling.

In [11]:
# Review the data before training.
# Starting now, each step must be repeated on X_test.
X_norm.iloc[:5]

Unnamed: 0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
165,0.011397,0.004957,0.006232,0.007732,0.005144,0.003514,0.001003,0.004209,0.006127,0.004034,...,0.0008,0.006792,0.004556,0.003836,0.004734,0.004979,0.006399,0.006506,0.006603,0.012296
61,0.011171,0.004899,0.006186,0.007506,0.004952,0.003404,0.001092,0.003974,0.005804,0.004068,...,0.000891,0.006474,0.004591,0.003733,0.004832,0.004792,0.006253,0.006454,0.006681,0.013021
505,0.009708,0.004292,0.00556,0.006187,0.004473,0.00326,0.000906,0.003681,0.005422,0.00389,...,0.000955,0.006198,0.003885,0.003597,0.004584,0.004606,0.005311,0.005934,0.00587,0.0109
195,0.010296,0.004465,0.005574,0.006276,0.004599,0.003285,0.000946,0.003524,0.005462,0.003976,...,0.000983,0.006081,0.003853,0.003597,0.004661,0.004573,0.00519,0.005933,0.005833,0.0112
28,0.010735,0.00481,0.005851,0.007083,0.004869,0.003637,0.001012,0.003622,0.005529,0.004106,...,0.000792,0.006981,0.004429,0.003608,0.004517,0.004341,0.005719,0.006467,0.006423,0.012935


In [35]:
# Review the data before training.
# TO DO: try this with a numpy array to ensure model doesn't see the row id like 165.
# Why is 195 a negative? The number shown is the row label carried over from
# the per-class input file, not a position in the shuffled set, so the same
# label can belong to a different class.
y_train.iloc[:10]

165    0
61     0
505   -1
195   -1
28     0
139    0
45     0
77     0
140    1
374    0
Name: label, dtype: int64

In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with SGD, scored by 3-fold cross-validation on
# the row-normalized features.
sgd = SGDClassifier(random_state=40)
sgd_accuracy = cross_val_score(estimator=sgd, X=X_norm, y=y_train, scoring="accuracy", cv=3)
sgd_accuracy
# Accuracy (about 0.50) is above the random-guess baseline (about 0.40) using
# normalized data.
# NOTE(review): 0.50 is also the share of the largest class in the training
# set (461 of 921 rows), so the model may simply predict that class for every
# row; confirm with a confusion matrix.
# TO DO: try grid search to optimize hyper parameters.

array([0.50162866, 0.50162866, 0.49837134])

In [37]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings, same 3-fold protocol as the SGD model.
rfc = RandomForestClassifier(random_state=41)
rfc_accuracy = cross_val_score(estimator=rfc, X=X_norm, y=y_train, scoring="accuracy", cv=3)
rfc_accuracy
# Accuracy is perfect using normalized data.
# NOTE(review): a perfect cross-validation score is suspicious; check the
# features for information that gives away the class.

array([1.        , 1.        , 0.99674267])

In [24]:
# Import SVC in the first cell that uses it, so the notebook also runs
# top to bottom on a fresh kernel.
from sklearn.svm import SVC
# Support vector classifier with default settings, scored by 3-fold
# cross-validation on the row-normalized features.
svm = SVC()
svm_accuracy=cross_val_score(svm, X_norm, y_train, cv=3, scoring="accuracy")
svm_accuracy
# Accuracy is nearly perfect using normalized data. How did that happen?

array([0.99674267, 1.        , 0.98697068])

In [36]:
from sklearn.svm import SVC
# Same classifier and protocol, but on the raw (not normalized) counts.
# A fresh, unfitted SVC is bound to `svm` here.
svm = SVC()
svm_accuracy = cross_val_score(estimator=svm, X=X_train, y=y_train, scoring="accuracy", cv=3)
svm_accuracy
# Accuracy is still pretty good using not-normalized data.

array([0.77198697, 0.7752443 , 0.7980456 ])

In [25]:
# Fit the SVC on all normalized training rows, then look at what it learned.
svm.fit(X_norm, y_train) # train on the full set (not cross validation fraction)
# Attributes of the fitted model; only the last expression of the cell is
# displayed, so move another one to the end to inspect it.
svm.dual_coef_
svm.classes_
svm.support_vectors_
# No signs of trouble (e.g. if the first or last column were 100%, maybe it contained the label)

array([[0.00989822, 0.00431689, 0.00559997, ..., 0.00600105, 0.00595757,
        0.01107305],
       [0.00994656, 0.00431837, 0.00560762, ..., 0.00602272, 0.00597691,
        0.01114138],
       [0.01196311, 0.00480032, 0.00591872, ..., 0.00643394, 0.00575536,
        0.01335796],
       ...,
       [0.01206515, 0.00529931, 0.00666697, ..., 0.00628469, 0.00657799,
        0.0120256 ],
       [0.01209814, 0.00526933, 0.0066422 , ..., 0.00584247, 0.00644671,
        0.01176491],
       [0.01195536, 0.00523076, 0.0065766 , ..., 0.00587824, 0.00649336,
        0.01181661]])

In [29]:
# Predict on the same normalized rows the model was fitted on, so this
# measures training fit, not generalization.
y_pred = svm.predict(X_norm)

In [34]:
from sklearn.metrics import mean_squared_error
# Root-mean-square error between the training labels and the predictions
# made for those same training rows.
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

0.0

In [None]:
# TO DO: Maybe problem was easy given the zeros. 
# But we really want classes 1 and -1.
# How would the model do on just the zeros forcing them to score as 1 or -1.
# Or go back to the RNA and classify as + or - without a zero class.
# The visual clustering provided intuition, but now we're getting down to individual cases.

In [None]:
# Compare traditional ML to deep learning.
# Not sure if we want feature selection, but it could extract k-mers indicative of biology.
# Compare to other data sets.

In [None]:
# Go back to the LncAtlas scores rather than 3 bins.
# Try KNN on scores.
# Try logistic regression. Try thresholds. Add inverse log as another feature?

In [None]:
# What did LncADeep use if they didn't use the LncAtlas.
# Try Google Colaboratory (Colab) in the cloud.
# Can I use XSEDE?
# Price a desktop GPU?
# 8-cluster GPU at WVU called Titan.

In [None]:
# If the training score was 1, the model probably overfit!