# Attempt nc/pc classification with K-mers

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# These files could countain K-mers for any K or any combination of K.
X_train=pd.read_pickle("ncRNA.pcRNA.X_train.pkl")
y_train=pd.read_pickle("ncRNA.pcRNA.y_train.pkl")

In [2]:
# Normalize by row sum. 
# Effectively convert k-mer counts to k-mer frequencies.
X_norm=X_train.div(X_train.sum(axis=1), axis=0)
# Feature Scaling by column.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled = scaler.fit_transform(X_norm)
X_scaled = pd.DataFrame(scaled,columns=X_train.columns)
X_train = X_scaled
X_scaled = None
X_norm = None
X_train

Unnamed: 0,AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT
0,-0.004024,-0.659576,0.716697,0.738642,-0.020288,-0.591321,-0.851616,-1.995072,0.627916,-1.334353,0.155050,1.397497,0.534537,-0.681886,-0.123262,1.587485
1,1.110895,-0.773458,0.443664,-0.468627,0.442087,0.184808,-0.526079,0.114514,-1.028308,0.254049,0.824241,-0.247580,-0.000189,-0.299354,-0.566364,-0.851934
2,-0.111784,0.543113,2.125577,-0.689662,0.758004,-0.197977,0.350677,-2.025999,1.707454,-0.339651,1.728564,0.720671,-0.716850,-0.778281,-0.978183,-1.060748
3,-0.053374,-0.383728,-0.143779,-0.279283,-1.616643,-0.913571,2.228392,-2.421931,1.493693,0.124486,0.813607,2.395393,-0.586251,-0.770821,0.325547,-0.020991
4,-1.203859,-0.178941,-0.381052,-1.537152,-0.575941,1.607314,3.003027,-0.730495,-0.517248,2.146672,1.588106,-0.738897,-1.242115,-0.449474,-1.541550,-1.206168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30285,1.613794,-1.135071,-1.087569,2.189131,-1.168236,-1.598192,-1.027905,-2.576919,-1.076636,-1.356709,-1.395530,2.516665,2.464470,-2.474676,1.238035,2.329350
30286,0.015913,-1.197863,1.152874,0.103715,0.261557,-0.261653,-0.929135,-0.288501,0.671909,-0.369324,0.124156,-0.486250,-0.373969,0.075279,-0.128220,0.956217
30287,-0.242799,-0.955269,2.241359,-0.706594,-0.605064,-0.471174,0.298205,-0.621376,0.962984,0.199256,2.723256,-0.430919,0.102037,-0.486458,-1.904974,-1.037355
30288,-0.405485,0.726505,-0.231373,-0.209634,0.556042,0.598814,-0.029205,-0.041988,0.123398,-0.182738,0.190474,-0.182406,-0.483583,0.261818,-0.005944,-0.116536


In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
split=20000

lr = LogisticRegression()
lr.fit( X_train, np.ravel(y_train) )
y_pred=lr.predict(X_train)
lr.score(X_train,y_train) # test set == train set i.e. best case scenario

0.6930670188180917

In [4]:
lr.fit( X_train[:split], np.ravel(y_train[:split]) )
y_pred=lr.predict(X_train[split:])
lr_score=lr.score(X_train[split:],y_train[split:]) # train on 2/3, test on 1/3

In [5]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit( X_train[:split], np.ravel(y_train[:split]) )
y_pred=sgd.predict(X_train[split:])
sgd_score=sgd.score(X_train[split:],y_train[split:]) # train on 2/3, test on 1/3

In [6]:
from sklearn.svm import SVC
svc = SVC()
svc.fit( X_train[:split], np.ravel(y_train[:split]) )
y_pred=svc.predict(X_train[split:])
svc_score=svc.score(X_train[split:],y_train[split:]) # train on 2/3, test on 1/3

In [7]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit( X_train[:split], np.ravel(y_train[:split]) )
y_pred=rfc.predict(X_train[split:])
rfc_score=rfc.score(X_train[split:],y_train[split:]) # train on 2/3, test on 1/3

In [8]:
print("%10f %s"%(lr_score, 'LogisticRegression'))
print("%10f %s"%(sgd_score,'StochasticGradientDescent'))
print("%10f %s"%(svc_score,'SupportVectorMachine'))
print("%10f %s"%(rfc_score,'RandomForestClassifier'))

  0.689699 LogisticRegression
  0.689504 StochasticGradientDescent
  0.779592 SupportVectorMachine
  0.768707 RandomForestClassifier


In [9]:
# Explore parameters for RF and SVM
from sklearn.model_selection import GridSearchCV
# Explore hyper parameters for Random Forest
from sklearn.ensemble import RandomForestRegressor
Rmodel = RandomForestRegressor()
parameter_grid = [
    {'bootstrap':[True],  'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators':[3,10,30], 'max_features':[2,4,6,8]}
]
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit( X_train, np.ravel(y_train) ) # long time!
# grid_search.cv_results_   # big array
# These desribes the best so far.
# Indicates bootstrap=false n_estimators=30 and max_features=6
grid_search.best_estimator_ 
# RandomForestRegressor(bootstrap=False, max_features=6, n_estimators=30)

RandomForestRegressor(bootstrap=False, max_features=6, n_estimators=30)

In [11]:
parameter_grid = [
    {'bootstrap':[False], 'n_estimators':[20,30,40], 'max_features':[5,6,7]},
    {'bootstrap':[False], 'n_estimators':[25,35,45], 'max_features':[5,6,7]}
]
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit( X_train, np.ravel(y_train) ) # long time!
grid_search.best_estimator_ 
# RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=45)

RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=45)

In [12]:
Rmodel = RandomForestRegressor(max_features=5)
parameter_grid = [
    {'bootstrap':[True], 'n_estimators':[45,60,75]} ,
    {'bootstrap':[False], 'n_estimators':[45,60,75]}
]
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit( X_train, np.ravel(y_train) ) # long time!
grid_search.best_estimator_ 
# RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=75)

RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=75)

In [14]:
parameter_grid = [
    {'bootstrap':[True], 'n_estimators':[75,100,125]},
    {'bootstrap':[False], 'n_estimators':[75,100,125]}
]
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit( X_train, np.ravel(y_train) ) # long time!
grid_search.best_estimator_ 

RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=125)

In [15]:
Rmodel = RandomForestRegressor(max_features=5,bootstrap=False)
parameter_grid = [
    {'n_estimators':[125,250,375]},
    {'n_estimators':[200,300,400]}
]
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit( X_train, np.ravel(y_train) ) # long time!
grid_search.best_estimator_ 
# RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=375)

RandomForestRegressor(bootstrap=False, max_features=5, n_estimators=375)

In [18]:
rfc = RandomForestClassifier(bootstrap=False, max_features=5, n_estimators=375)
rfc.fit( X_train[:split], np.ravel(y_train[:split]) )
y_pred=rfc.predict(X_train[split:])
rfc_score=rfc.score(X_train[split:],y_train[split:]) # train on 2/3, test on 1/3
print("%10f %s"%(rfc_score,'RandomForestClassifier'))

  0.768707 RandomForestClassifier
