In [1]:
# Input data.
# Since the classes are unequal, we don't risk random sampling from full set.
# Instead, keep them separate until we make the train/test split.
infile_pos='score-positive.4mer.features.csv'
infile_neg='score-negative.4mer.features.csv'
infile_zero='score-zero.4mer.features.csv'
import numpy as np
import pandas as pd
raw_pos = pd.read_csv(infile_pos,header=0)
raw_neg = pd.read_csv(infile_neg,header=0)
raw_zero = pd.read_csv(infile_zero,header=0)
raw_pos.head()

Unnamed: 0,label,seqname,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,1,ENSG00000261061.1,31,5,2,13,6,5,0,5,...,0,2,5,5,2,4,2,3,4,7
1,1,ENSG00000254837.2,56,14,26,29,25,4,1,10,...,1,13,8,8,4,10,15,8,6,23
2,1,ENSG00000282851.2,18,10,15,9,17,14,1,8,...,2,6,8,7,8,13,11,12,12,32
3,1,ENSG00000255650.6,3,3,8,4,7,2,1,3,...,0,3,0,1,5,3,0,5,3,21
4,1,ENSG00000163597.15,25,15,23,25,10,14,5,16,...,3,21,16,11,18,31,38,27,26,86


In [2]:
raw_pos.shape, raw_neg.shape, raw_zero.shape
# All matrices have columns for 4^4 = 256 k-mers plus label plus seqname.

((116, 258), (703, 258), (333, 258))

In [3]:
# The data is imbalanced: 170/577/405.
# What is the probability of correct classification by random?
total=170+577+405
prob_random_guess_is_correct=pow(170/total,2)+pow(577/total,2)+pow(405/total,2)
prob_random_guess_is_correct

0.3962417414158951

In [4]:
def split_train_test(dataset,train_portion):    
    # Use pandas sample() to randomize the order (i.e. random sample without replacement).
    # Side note. Data frame can be randomized while keeping header in place.
    # This is repeatable with 42.
    middle = int(len(dataset)*train_portion)
    random = dataset.sample(frac=1,random_state=42)
    train_set = random[:middle]
    test_set = random[middle:]
    return train_set,test_set
TRAIN_PORTION=0.8

train_pos , test_pos = split_train_test(raw_pos,TRAIN_PORTION)
train_neg , test_neg = split_train_test(raw_neg,TRAIN_PORTION)
train_zero , test_zero = split_train_test(raw_zero,TRAIN_PORTION)

raw_pos.shape, train_pos.shape, test_pos.shape

((116, 258), (92, 258), (24, 258))

In [5]:
# The Geron book recommends this (or StrattifiedShuffleSplit) 
# but we could not make it work. 

#from sklearn.model_selection import ShuffleSplit
#splitter=ShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
#for train_index,test_index in splitter.split(rawpos):
#    train_pos = rawpos[train_index]
#    test_pos = rawpos[test_index]

In [6]:
# Combine and shuffle.
train_sorted = pd.concat([train_pos, train_neg, train_zero],axis=0)
train_set = train_sorted.sample(frac=1,random_state=17)

test_sorted = pd.concat([test_pos, test_neg, test_zero],axis=0)
test_set = test_sorted.sample(frac=1,random_state=17)

train_set.shape, test_set.shape

((920, 258), (232, 258))

In [7]:
y_train = train_set['label']
X_train_ID = train_set['seqname']
X_train = train_set.drop(['label','seqname'],axis=1)
X_train.shape
X_train_ID.head()
X_train.head()
y_train.unique()
# Try all the validators above to ensure we have what we need.

array([ 0, -1,  1])

In [8]:
# Feature Scaling.
# Effectively convert k-mer counts to k-mer frequencies.
#from sklearn.preprocessing import StandardScaler
#scaler=StandardScaler()
#scaled = scaler.fit_transform(X_train)
#df = pd.DataFrame(scaled,columns=X_train.columns)
# The above works but we don't want to standaridize each column (feature).

In [9]:
# We want to normalize each number by its row sum.
X_train.head()
X_train.sum(axis=1)
# Row sums before normalization.

238    4481
314    4197
298    1863
676    1103
123    2047
       ... 
356    1460
480    9727
599    2000
576    1107
533     688
Length: 920, dtype: int64

In [10]:
# https://stackoverflow.com/questions/35678874/normalize-rows-of-pandas-data-frame-by-their-sums/35679163
X_norm=X_train.div(X_train.sum(axis=1), axis=0)
X_norm.sum(axis=1)
# Row sums after normalization.

238    1.0
314    1.0
298    1.0
676    1.0
123    1.0
      ... 
356    1.0
480    1.0
599    1.0
576    1.0
533    1.0
Length: 920, dtype: float64

In [11]:
# TO DO Try min-max scaling and z-score scaling.

In [12]:
X_norm  # starting now, each step must be repeated on X_test
X_norm[:5]
# Review the data before training

Unnamed: 0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
238,0.003347,0.003347,0.00491,0.002455,0.003124,0.003571,0.001339,0.002008,0.004463,0.003571,...,0.000446,0.008257,0.002232,0.004017,0.004463,0.006472,0.004686,0.006918,0.005356,0.014729
314,0.014058,0.004289,0.006195,0.009054,0.007624,0.002383,0.000238,0.004051,0.007386,0.004051,...,0.001191,0.00548,0.00548,0.00548,0.004765,0.003336,0.004051,0.004765,0.00548,0.005004
298,0.009662,0.003757,0.004294,0.005904,0.005904,0.004294,0.001074,0.00161,0.010199,0.002684,...,0.001074,0.003221,0.003221,0.002684,0.006441,0.002147,0.000537,0.002684,0.005368,0.002684
676,0.013599,0.004533,0.00816,0.00272,0.00544,0.006346,0.000907,0.00544,0.00272,0.003626,...,0.001813,0.00816,0.003626,0.000907,0.000907,0.003626,0.004533,0.00816,0.004533,0.004533
123,0.024915,0.00977,0.010259,0.010259,0.004397,0.004397,0.001954,0.004397,0.005374,0.001954,...,0.001466,0.006839,0.006839,0.002443,0.005862,0.005374,0.008305,0.006839,0.007328,0.008305


In [13]:
# TO DO: try this with a numpy array to ensure model doesn't see the row id like 165.
# Why is 195 a negative? Perhaps the number is the number after shuffle.
y_train[:10]
# Review the data before training

238    0
314    0
298   -1
676   -1
123    0
359   -1
92     0
44     0
604   -1
315    0
Name: label, dtype: int64

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=40) 
sgd_accuracy=cross_val_score(sgd, X_norm, y_train, cv=3, scoring="accuracy")
sgd_accuracy
# Accuracy is slightly better than chance (differs by class) using normalized data.
# TO DO: try grid search to optimize hyper parameters.

array([0.60912052, 0.60912052, 0.61437908])

In [15]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=41)
rfc_accuracy = cross_val_score(rfc,X_norm, y_train, cv=3, scoring="accuracy")
rfc_accuracy
# Accuracy is perfect using normalized data.

array([0.5504886 , 0.55700326, 0.62091503])

In [16]:
from sklearn.svm import SVC #Support Vector Machine
svm = SVC()
svm_accuracy=cross_val_score(svm, X_norm, y_train, cv=3, scoring="accuracy")
svm_accuracy
# Accuracy is nearly perfect using normalized data. How did that happen?

array([0.58631922, 0.60260586, 0.61437908])

In [17]:
from sklearn.svm import SVC
svm = SVC()
svm_accuracy=cross_val_score(svm, X_train, y_train, cv=3, scoring="accuracy")
svm_accuracy
# Accuracy is still pretty good using not-normalized data.

array([0.60586319, 0.60912052, 0.61437908])

In [18]:
svm.fit(X_norm, y_train) # train on the full set (not cross validation fraction)
svm.dual_coef_
svm.classes_
svm.support_vectors_
# No signs of trouble (e.g. if the first or last column were 100%, maybe it contained the label)

array([[0.00966184, 0.00375738, 0.00429415, ..., 0.00268384, 0.00536769,
        0.00268384],
       [0.01359927, 0.00453309, 0.00815956, ..., 0.00815956, 0.00453309,
        0.00453309],
       [0.00128866, 0.00579897, 0.00128866, ..., 0.00451031, 0.00386598,
        0.00322165],
       ...,
       [0.01076923, 0.01076923, 0.00461538, ..., 0.00923077, 0.00307692,
        0.00923077],
       [0.00951475, 0.00951475, 0.00856327, ..., 0.0076118 , 0.0114177 ,
        0.0076118 ],
       [0.00253165, 0.00506329, 0.00421941, ..., 0.01350211, 0.01687764,
        0.01772152]])

In [19]:
y_pred = svm.predict(X_norm)

In [20]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_train,y_pred)
rmse = np.sqrt(mse)
rmse

0.7766986993456455

In [21]:
# TO DO: Maybe problem was easy given the zeros. 
# But we really want classes 1 and -1.
# How would the model do on just the zeros forcing them to score as 1 or -1.
# Or go back to the RNA and classify as + or - without a zero class.
# The visual clustering provided visual intuition of but now we're getting down to individual cases. 

In [22]:
# Compare traditional ML to deep learning.
# Not sure if we want feature selection, but it could extract k-mers indicative of biology.
# Compare to other data sets.

In [23]:
# Go back to the LncAtlas scores rather than 3 bins.
# Try KNN on scores.
# Try logistic regression. Try thresholds. Add inverse log as another feature?

In [24]:
# What did LncADeep use if they didn't use the LncAtlas.
# Try Google collaborate on cloud.
# Can I use Xcede? 
# Price a desktop GPU?
# 8-cluster GPU at WVU called Titan.

In [25]:
# If the training score was 1, the model probably overfit!

In [28]:
# Try again with binary classification.
infile_pos='binary-positive.4mer.features.csv'
infile_neg='binary-negative.4mer.features.csv'
raw_pos = pd.read_csv(infile_pos,header=0)
raw_neg = pd.read_csv(infile_neg,header=0)
train_pos , test_pos = split_train_test(raw_pos,TRAIN_PORTION)
train_neg , test_neg = split_train_test(raw_neg,TRAIN_PORTION)
train_sorted = pd.concat([train_pos, train_neg],axis=0)
train_set = train_sorted.sample(frac=1,random_state=17)
y_train = train_set['label']
X_train_ID = train_set['seqname']
X_train = train_set.drop(['label','seqname'],axis=1)
X_norm=X_train.div(X_train.sum(axis=1), axis=0)
svm = SVC()
svm_accuracy=cross_val_score(svm, X_norm, y_train, cv=3, scoring="accuracy")
svm_accuracy

array([0.80456026, 0.80456026, 0.80130293])