In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn import linear_model, datasets
from sklearn.svm import SVC
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
# import PSSM
import random

#### 1.One-Hot & Kmer Encoder

In [2]:
# 转化字典
cdict = dict(A=0b0001, T=0b0010, C=0b0100, G=0b1000)

# 转化函数
def dna_onehot(Xdna):
    listtmp = list()
    for index, row in Xdna.iterrows():
        row = [cdict[x] if x in cdict else x for x in row['Seq']]
        listtmp.append(row)
    return pd.DataFrame(listtmp)

def kmer_to_id(kmer):
    base_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    kmer_id = 0

    for base in kmer:
        kmer_id = kmer_id * 4 + base_map[base]

    return kmer_id

def generate_kmer_ids(sequence, k=6):
    kmer_id_list = []

    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        kmer_id = kmer_to_id(kmer)
        kmer_id_list.append(kmer_id)

    return kmer_id_list


#### 2. Import Model

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn import linear_model, datasets
from sklearn.svm import SVR
import numpy as np
from sklearn import tree
from xgboost import XGBRegressor
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from scipy import interp
import copy
import os
import math
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def lgmain(X_train_std, Y_train, X_test_std, Y_test):
    model = LogisticRegression()
    model.fit(X_train_std, Y_train, sample_weight=None)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

def svrmain(X_train_std, Y_train, X_test_std, Y_test):
    model = SVR()
    model.fit(X_train_std, Y_train)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

    
def xgmain(X_train_std, Y_train, X_test_std, Y_test):
    model = XGBRegressor(random_state=42)

    model.fit(X_train_std, Y_train)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

def dtmain(X_train_std, Y_train, X_test_std, Y_test):
    model = tree.DecisionTreeRegressor()
    model.fit(X_train_std, Y_train)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

def rfmain(X_train_std, Y_train, X_test_std, Y_test):
    model = RandomForestRegressor()
    model.fit(X_train_std, Y_train)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

def gbdtmain(X_train_std, Y_train, X_test_std, Y_test):
    model = GradientBoostingRegressor()
    model.fit(X_train_std, Y_train)
    predict = model.predict(X_test_std)
    groundtruth = Y_test
    return groundtruth, predict, model

import time
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

def evaluate(baselineName, ft, dataset):
    X_train_std, Y_train, X_test_std, Y_test = dataset
    start = time.time()

    if baselineName == 'Logistic':
        groundtruth, predict, model = lgmain(X_train_std, Y_train, X_test_std, Y_test)
    elif baselineName == 'XGBoost':
        groundtruth, predict, model = xgmain(X_train_std, Y_train, X_test_std, Y_test)
    elif baselineName == 'DecisionTree':
        groundtruth, predict, model = dtmain(X_train_std, Y_train, X_test_std, Y_test)
    elif baselineName == 'RandomForest':
        groundtruth, predict, model = rfmain(X_train_std, Y_train, X_test_std, Y_test)
    elif baselineName == 'GradientBoosting':
        groundtruth, predict, model = gbdtmain(X_train_std, Y_train, X_test_std, Y_test)
    elif baselineName == 'SVM':
        groundtruth, predict, model = svrmain(X_train_std, Y_train, X_test_std, Y_test)
    else:
        return

    mae = mean_absolute_error(groundtruth, predict)
    mse = mean_squared_error(groundtruth, predict)
    rmse = mean_squared_error(groundtruth, predict)
    r2 = r2_score(groundtruth, predict)

    pearson_corr, _ = pearsonr(np.ravel(groundtruth), np.ravel(predict))
    spearman_corr, _ = spearmanr(np.ravel(groundtruth), np.ravel(predict))

    end = time.time()
    spend = round(end - start, 2)
    print('Running time: %s Seconds' % (end - start))

    item = {
        'BaselineName': baselineName,
        'Feature': ft,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'Pearson': pearson_corr,
        'Spearman': spearman_corr,
        'Time': spend
    }

    return groundtruth, predict, item, model



#### 3. Make Label

In [4]:
data = pd.read_csv("./dataset/ML_data.csv")
rawdata = data[['Seq','平均flu/OD']]
rawdata.columns=['Seq','OD']
rawdata

Unnamed: 0,Seq,OD
0,CGCGCCTTGACGGCTAGCTCAGTCCTAGGTATTGTGCTAGCCGTCG...,11.688265
1,CGCGCCAAAAAGAGTATTGACTTCGCATCTTTTTGTACCCATAATT...,12.008913
2,CGCGCCTTGACATAAAGTCTAACCTATAGGTATAATGTGTGGATCT...,9.565730
3,CGCGCCTTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTG...,11.556572
4,CGCGCCTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTTAAT...,9.913603
...,...,...
562,CGCGCCTTGACATTTATCCCTTGCGGCGATATAATGTGTGGATAAG...,10.105502
563,CGCGCCTTGACATAAAGTCTAACCTATAGGCATAATTATTTCATCC...,9.076736
564,CGCGCCTTGACAGCTAGCTCAGTCCTAGGTATAATGCTAGCACGAA...,8.622338
565,CGCGCCAAAAAGAGTATTGACTTCGCATCTTTTTGTACCTATAATA...,9.005457


In [5]:
X_onehot = dna_onehot(rawdata)
X_kmer = np.array([x for x in rawdata.Seq.apply(generate_kmer_ids).tolist()])
X_list = [X_onehot,X_kmer]
Y = pd.DataFrame(rawdata['OD'])

#### 4.Train

In [6]:
methods=['XGBoost','GradientBoosting']
feature = ['OneHot','K-mer']
method_compare=[]
model_list=[]

for ft in range(len(feature)):
    x=X_list[ft]
    y=Y
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20,random_state=43)
    dataset=X_train, y_train, X_test, y_test
    for method in methods:
        print('='*100)
        print('BaseLine : '+method) 
        groundtruth, predict,item, model=evaluate(method, feature[ft], dataset)
        method_compare.append(item)
        model_list.append(model)
    pd.DataFrame(method_compare)

  return self.partial_fit(X, y)


BaseLine : XGBoost
Running time: 0.7000992298126221 Seconds
BaseLine : GradientBoosting


  y = column_or_1d(y, warn=True)


Running time: 0.318835973739624 Seconds
BaseLine : XGBoost




Running time: 0.7401700019836426 Seconds
BaseLine : GradientBoosting


  y = column_or_1d(y, warn=True)


Running time: 0.5141189098358154 Seconds


In [7]:
pd.DataFrame(method_compare)

Unnamed: 0,BaselineName,Feature,MAE,MSE,RMSE,R2,Pearson,Spearman,Time
0,XGBoost,OneHot,0.963822,1.550907,1.550907,0.15549,0.495108,0.517531,0.7
1,GradientBoosting,OneHot,0.89349,1.323593,1.323593,0.279268,0.558681,0.576128,0.32
2,XGBoost,K-mer,0.919135,1.403047,1.403047,0.236004,0.533585,0.544294,0.74
3,GradientBoosting,K-mer,0.88846,1.294268,1.294268,0.295237,0.560027,0.566667,0.51


In [8]:
pd.DataFrame(method_compare).to_csv("output/ML_Report.csv",index=False)