In [1]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection  import train_test_split
from sklearn.metrics import *
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from numpy import array,argmax,linalg as la
from keras.preprocessing.sequence import pad_sequences
import os
import re
import joblib

In [2]:
def parse_stream(f, comment=b'#'):
    """Lazily yield ``(name, sequence)`` pairs from a FASTA-like byte stream.

    Args:
        f: Iterable of ``bytes`` lines (for example a file opened in binary mode).
        comment: Lines starting with this prefix are ignored.

    Yields:
        Tuples ``(name, sequence)``: ``name`` is the header line without its
        leading ``>``, and ``sequence`` is the upper-cased concatenation of the
        lines that followed that header.
    """
    header = None
    chunks = []
    for raw in f:
        if raw.startswith(comment):
            continue
        stripped = raw.strip()
        if not stripped.startswith(b'>'):
            chunks.append(stripped.upper())
            continue
        # A new header closes the record collected so far (if there is one).
        if header is not None:
            yield header, b''.join(chunks)
        header, chunks = stripped[1:], []
    # Flush the last record once the stream is exhausted.
    if header is not None:
        yield header, b''.join(chunks)

def fasta2csv(inFasta):
    """Load a two-line-per-record FASTA file into a ``PID`` / ``Seq`` DataFrame.

    The file is expected to alternate header lines and single-line sequences.
    Lines are read as plain text instead of going through ``pd.read_csv`` and a
    scratch ``testFasta.csv``: the CSV round trip turned sequences such as
    ``"NA"`` into NaN, broke on commas, and overwrote/deleted any existing
    ``testFasta.csv`` in the working directory.

    Args:
        inFasta: Path of the FASTA file.

    Returns:
        DataFrame with one row per record: ``PID`` holds the header line
        (including its leading ``>``) and ``Seq`` the line that follows it.
        A trailing unpaired line is dropped.
    """
    with open(inFasta, "r", encoding="utf-8") as handle:
        # Keep only non-blank lines, without their line terminators.
        lines = [raw.rstrip("\r\n") for raw in handle if raw.strip()]

    # Same preview as before: shape and first rows of the one-column raw table.
    FastaRead = pd.DataFrame({0: lines})
    print(FastaRead.shape)
    print(FastaRead.head())

    seqNum = len(lines) // 2
    TrainSeqLabel = pd.DataFrame({
        "PID": lines[0:2 * seqNum:2],   # even positions: header lines
        "Seq": lines[1:2 * seqNum:2],   # odd positions: sequences
    })
    return TrainSeqLabel

In [3]:
inFastaTrain = "/content/Train_set.fasta"
inFastaTest = "/content/Independent_test_converted.fasta"

# Parse the training file first and the test file second, so the previews
# printed by fasta2csv keep their order.
mainTrain = fasta2csv(inFastaTrain)
mainTest = fasta2csv(inFastaTest)

# Class label: 1 when the last character of the PID string is "1", otherwise 0.
for frame in (mainTrain, mainTest):
    frame["Tags"] = [1 if str(pid)[-1] == "1" else 0 for pid in frame["PID"]]

# Labels as numpy arrays (raw values, then explicit integer copies).
ACP_y_train = mainTrain["Tags"].values
ACP_y_test = mainTest["Tags"].values

ACP_y_train_ = np.array(ACP_y_train, dtype=int)
ACP_y_test_ = np.array(ACP_y_test, dtype=int)

ACP_y_train

(4748, 1)
                                                   0
0                              >1pos|ACP20mainTest|1
1  CETWRTETTGATGQASSLLSGRLLEQKAASCHNSYIVLCIENSFMT...
2                              >2pos|ACP20mainTest|1
3  DERCTIIIHPGSPCDPSDCVQYCYAEYNGVGKCIASKPGRSANCMC...
4                              >3pos|ACP20mainTest|1
(2690, 1)
                       0
0  >1pos|ACP20mainTest|1
1            FLWWLFKWAWK
2  >2pos|ACP20mainTest|1
3          FAKLAKKALAKLL
4  >3pos|ACP20mainTest|1


array([1, 1, 1, ..., 0, 0, 0])

In [4]:
# Number the training sequences from 1 and record the longest one.
x_train = dict(enumerate(mainTrain["Seq"], start=1))
maxlen_train = max(map(len, x_train.values()))

# Same for the test sequences.
x_test = dict(enumerate(mainTest["Seq"], start=1))
maxlen_test = max(map(len, x_test.values()))

# Longest sequence across both sets.
maxlen = max(maxlen_train, maxlen_test)

In [5]:
#Convert amino acids to vectors
def OE(seq_temp):
    """Encode an amino-acid sequence as a list of integer codes (1-21).

    The 20 standard residues plus ``X`` get distinct codes in alphabetical
    order; ``U`` shares the code of ``E``, ``O`` the code of ``M``, and ``B`` /
    ``Z`` the code of ``X``. Letters are matched case-insensitively, and any
    other symbol is encoded as ``X`` (20). Previously an unrecognised symbol
    either raised ``UnboundLocalError`` (first residue) or silently repeated
    the code of the preceding residue.

    Args:
        seq_temp: Sequence of one-character strings (typically a ``str``).

    Returns:
        List of ``int`` codes, one per residue, in input order.
    """
    residue_codes = {
        'A': 1, 'C': 2, 'D': 3, 'E': 4, 'U': 4, 'F': 5, 'G': 6, 'H': 7,
        'I': 8, 'K': 9, 'L': 10, 'M': 11, 'O': 11, 'N': 12, 'P': 13,
        'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19,
        'X': 20, 'B': 20, 'Z': 20, 'Y': 21,
    }
    unknown_code = residue_codes['X']
    return [residue_codes.get(residue.upper(), unknown_code) for residue in seq_temp]

# Integer-encode every sequence, keeping the insertion order of the dicts.
x_train_oe = [OE(sequence) for sequence in x_train.values()]
x_test_oe = [OE(sequence) for sequence in x_test.values()]

# Bring all encoded sequences to the common length `maxlen`, padding at the end.
x_train_ = np.array(pad_sequences(x_train_oe, padding='post', maxlen=maxlen))
x_test_ = np.array(pad_sequences(x_test_oe, padding='post', maxlen=maxlen))

x_test_.shape

(1345, 50)

In [6]:
def _aac_rows(encoded_seqs, n_codes=21):
    """Return one composition row per encoded sequence.

    Column ``code - 1`` of a row accumulates ``1/len(sequence)`` for every
    occurrence of ``code`` in that sequence.
    """
    rows = []
    for codes in encoded_seqs:
        freq = [0] * n_codes
        for code in codes:
            freq[code - 1] += 1/len(codes)
        rows.append(freq)
    return rows

# Amino-acid composition (AAC) of the un-padded encoded sequences.
handcraft_AAC_train = _aac_rows(x_train_oe)
hc_AAC_train = np.array(handcraft_AAC_train)

handcraft_AAC_test = _aac_rows(x_test_oe)
hc_AAC_test = np.array(handcraft_AAC_test)

# All unordered pairs (i <= j) of the codes 1..21, and the position of each pair.
comb = [[i, j] for i in range(1, 22) for j in range(i, 22)]
comb_index = {tuple(pair): position for position, pair in enumerate(comb)}

In [7]:
def _dpc_rows(encoded_seqs):
    """Return one dipeptide-composition row per encoded sequence.

    Every adjacent pair of codes is treated as unordered (smaller code first),
    looked up in ``comb_index`` and counted with weight
    ``1/(len(sequence) - 1)``.
    """
    rows = []
    for codes in encoded_seqs:
        freq = [0] * len(comb)
        for first, second in zip(codes, codes[1:]):
            key = (first, second) if first <= second else (second, first)
            freq[comb_index[key]] += 1/(len(codes)-1)
        rows.append(freq)
    return rows

# Dipeptide composition (DPC) of the un-padded encoded sequences.
handcraft_DPC_train = _dpc_rows(x_train_oe)
hc_DPC_train = np.array(handcraft_DPC_train)

handcraft_DPC_test = _dpc_rows(x_test_oe)
hc_DPC_test = np.array(handcraft_DPC_test)

In [8]:
def readFasta(file):
    """Read a FASTA file into a list of ``[name, sequence]`` pairs.

    Args:
        file: Path of the FASTA file.

    Returns:
        List of ``[name, sequence]``. ``name`` is the first whitespace-separated
        token of the header line. ``sequence`` is the record's remaining lines
        joined together, upper-cased, with every character that is not one of
        the 20 standard residue letters or ``-`` replaced by ``-``.

    Raises:
        SystemExit: With exit code 1 (after printing a message) when the file
            does not exist or contains no ``>`` character.
    """
    # `sys` is not imported at the top of the notebook; import it here so the
    # error paths exit as intended instead of failing with NameError.
    import sys

    if not os.path.exists(file):
        print('Error: "' + file + '" does not exist.')
        sys.exit(1)

    with open(file) as f:
        records = f.read()

    if re.search('>', records) is None:
        print('The input file seems not in fasta format.')
        sys.exit(1)

    # Text before the first '>' is not part of any record.
    records = records.split('>')[1:]
    myFasta = []
    for fasta in records:
        # Named `lines` (not `array`) so the numpy `array` imported by the
        # notebook is not shadowed.
        lines = fasta.split('\n')
        name = lines[0].split()[0]
        sequence = re.sub('[^ARNDCQEGHILKMFPSTWYV-]', '-', ''.join(lines[1:]).upper())
        myFasta.append([name, sequence])

    return myFasta
def generateGroupPairs(groupKey):
    """Return a zero-initialised counter for every ordered pair of group names.

    Keys have the form ``'<first>.<second>'`` and are created in nested
    iteration order over ``groupKey``.
    """
    return {first + '.' + second: 0 for first in groupKey for second in groupKey}


def CKSAAGP(fastas, gap = 5, **kw):
    """Encode sequences by the composition of k-spaced residue-group pairs.

    Residues are assigned to five groups. For every spacing ``g`` in
    ``0..gap`` the function counts, for each ordered pair of groups, the
    residue pairs that lie ``g + 1`` positions apart, and divides each count
    by the total number of counted pairs for that spacing.

    Args:
        fastas: Iterable of ``[name, sequence]`` records; ``-`` characters are
            removed from the sequence before counting.
        gap: Largest spacing to evaluate (inclusive).
        **kw: Accepted but not used.

    Returns:
        List of rows. Row 0 is the header ``['#', '<g1>.<g2>.gap<g>', ...]``;
        every following row is ``[name, value, ...]`` with 25 values per
        spacing (integer zeros when no pair could be counted for a spacing).
    """
    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }

    AA = 'ARNDCQEGHILKMFPSTWYV'

    # Residue letter -> name of the group it belongs to.
    residue_to_group = {}
    for group_name, members in group.items():
        for residue in members:
            residue_to_group[residue] = group_name

    # Ordered group pairs; this fixes the column order within each spacing.
    pair_names = [left + '.' + right for left in group for right in group]

    column_names = ['#']
    column_names.extend(pair + '.gap' + str(g) for g in range(gap + 1) for pair in pair_names)
    encodings = [column_names]

    for record in fastas:
        name = record[0]
        sequence = re.sub('-', '', record[1])
        code = [name]
        for g in range(gap + 1):
            pair_counts = dict.fromkeys(pair_names, 0)
            n_pairs = 0
            # Pair each residue with the one g + 1 positions further along.
            for first, second in zip(sequence, sequence[g + 1:]):
                if first in AA and second in AA:
                    pair_counts[residue_to_group[first] + '.' + residue_to_group[second]] += 1
                    n_pairs += 1

            if n_pairs == 0:
                code.extend(0 for _ in pair_names)
            else:
                code.extend(pair_counts[pair] / n_pairs for pair in pair_names)

        encodings.append(code)

    return encodings

In [9]:
# CKSAAGP returns a header row first and puts the record name in column 0;
# both are dropped so that only the numeric features remain.
handcraft_CKSAAGP_train = CKSAAGP(readFasta(inFastaTrain))
handcraft_CKS_train = [record[1:] for record in handcraft_CKSAAGP_train[1:]]
hc_CKS_train = np.array(handcraft_CKS_train)

handcraft_CKSAAGP_test = CKSAAGP(readFasta(inFastaTest))
handcraft_CKS_test = [record[1:] for record in handcraft_CKSAAGP_test[1:]]
hc_CKS_test = np.array(handcraft_CKS_test)

In [10]:
def TransDict_from_list(groups):
  """Map every letter of each group to that group's digit label.

  The first group is labelled '0', the second '1', and so on up to '6'.

  Args:
    groups: Iterable of letter groups (for example strings such as 'AGV').

  Returns:
    Dict from letter to the one-character label of its group. Letters are
    inserted group by group, alphabetically within a group.
  """
  tar_list = ['0', '1', '2', '3', '4', '5', '6']
  result = {}
  for index, group in enumerate(groups):
    for letter in sorted(group):
      result[letter] = str(tar_list[index])
  return result
def translate_sequence(seq, TranslationDict):
  '''
  Translate (seq) into a reduced alphabet.

  Every character of (seq) that is a key of (TranslationDict) is replaced by
  the mapped value; all other characters are kept as they are. The dict is
  normally the one produced by TransDict_from_list().

  The translation table is built directly from the dict. The previous
  implementation passed the repr() of the key and value lists to
  str.maketrans(), which only worked while both reprs happened to line up
  character by character and raised ValueError as soon as a replacement was
  not exactly one character long.

  Returns the translated string (strings are immutable, so a new one).
  '''
  return seq.translate(str.maketrans(TranslationDict))
def get_3_protein_trids():
  """Return all 343 three-character strings over the digits '0'..'6'.

  The first character varies fastest and the last slowest, so the list
  starts '000', '100', '200', ... and ends with '666'.
  """
  chars = ['0', '1', '2', '3', '4', '5', '6']
  return [first + second + third
          for third in chars
          for second in chars
          for first in chars]
def get_4_nucleotide_composition(tris, seq, pythoncount=True):
  """Build a k-mer feature vector of length ``len(tris)`` for ``seq``.

  Args:
    tris: List of k-mers (all as long as ``tris[0]``) defining the features.
    seq: String to analyse.
    pythoncount: If true, feature ``i`` is ``seq.count(tris[i]) / len(seq)``.
      If false, a (k-mer x window position) occurrence matrix is decomposed
      with SVD and rows of ``U`` scaled by the singular values are summed.

  Returns:
    A list of floats (``pythoncount=True``), a numpy array
    (``pythoncount=False``), or a list of zeros when ``seq`` is too short to
    contain any k-mer window. All have ``len(tris)`` entries.

  Fixes over the previous version: the counting branch appended to a list that
  was already filled with ``len(tris)`` zeros, returning a vector twice as
  long as intended; an empty ``seq`` divided by zero.
  """
  seq_len = len(seq)
  k = len(tris[0])
  tri_feature = [0] * len(tris)

  if pythoncount:
      if seq_len == 0:
          return tri_feature
      return [float(seq.count(val)) / seq_len for val in tris]

  n_windows = seq_len - k + 1
  if n_windows <= 0:
      # No complete window fits into the sequence: nothing to decompose.
      return tri_feature

  # Position of the first occurrence of each k-mer in `tris`
  # (the index that `tris.index(kmer)` would return).
  first_index = {}
  for position, val in enumerate(tris):
      first_index.setdefault(val, position)

  # note_feature[i][x] == 1 when k-mer i starts at position x of seq.
  note_feature = [[0] * n_windows for _ in tris]
  for x in range(n_windows):
      ind = first_index.get(seq[x:x + k])
      if ind is not None:
          note_feature[ind][x] = note_feature[ind][x] + 1

  u, s, v = la.svd(note_feature)
  for i in range(len(s)):
      # NOTE(review): u[i] is the i-th ROW of U; the i-th left singular vector
      # would be u[:, i]. Kept as in the original - confirm this is intended.
      tri_feature = tri_feature + u[i] * s[i] / seq_len

  return tri_feature
def prepare_feature_kmer(infile):
  """Compute the reduced-alphabet 3-mer feature matrix for a FASTA file.

  Every non-blank line that does not start with '>' is treated as one
  sequence. Each sequence is translated into the 7-group reduced alphabet and
  encoded with get_4_nucleotide_composition(..., pythoncount=False).

  Fixes over the previous version: the sequence was taken as ``line[:-1]``,
  which removed the last residue whenever the final line of the file had no
  trailing newline; blank lines were turned into empty extra records.

  Args:
    infile: Path of the FASTA file.

  Returns:
    numpy array with one row of features per sequence, in file order.
  """
  protein_seqs = []
  with open(infile, 'r') as fp:
    for line in fp:
      # Remove only the line terminator, never a residue.
      seq = line.rstrip('\r\n')
      if not seq.strip() or seq[0] == '>':
        continue
      protein_seqs.append(seq)

  groups = ['AGV', 'ILFP', 'YMTS', 'HNQW', 'RK', 'DE', 'C']
  group_dict = TransDict_from_list(groups)
  protein_tris = get_3_protein_trids()

  kmer = []
  for seq in protein_seqs:
    protein_seq = translate_sequence(seq, group_dict)
    protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq, pythoncount=False)
    kmer.append(protein_tri_fea)
  return np.array(kmer)

In [11]:
kmer_train = prepare_feature_kmer(inFastaTrain)
kmer_test = prepare_feature_kmer(inFastaTest)

# Hand-crafted feature matrix: AAC, DPC, CKSAAGP and k-mer columns side by side.
hc_train = np.c_[hc_AAC_train, hc_DPC_train, hc_CKS_train, kmer_train]
hc_test = np.c_[hc_AAC_test, hc_DPC_test, hc_CKS_test, kmer_test]

# Full feature matrix: hand-crafted columns followed by the padded integer encoding.
X_train = np.c_[hc_train, x_train_]
X_test = np.c_[hc_test, x_test_]

# A single call splits every feature view and the labels with the same row
# assignment (20% validation, fixed seed).
split_inputs = (
    hc_AAC_train, hc_DPC_train, hc_CKS_train, kmer_train,
    hc_train, x_train_, X_train, ACP_y_train_,
)
(AAC_TRAIN, AAC_VAL,
 DPC_TRAIN, DPC_VAL,
 CKS_TRAIN, CKS_VAL,
 KMER_TRAIN, KMER_VAL,
 HC_TRAIN, HC_VAL,
 OE_TRAIN, OE_VAL,
 X_TRAIN, X_VAL,
 Y_TRAIN, Y_VAL) = train_test_split(*split_inputs, test_size=0.2, random_state=4)


In [12]:
# XGBoost on the AAC features
xgb_model = xgb.XGBClassifier()
xgb_model.fit(AAC_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(AAC_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb val ACC：", str(xgb_model.score(AAC_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(AAC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb val ACC： 0.8063157894736842
xgb train ACC： 0.9994734070563455
precision_score: 0.8064722769490571
recall_score: 0.8063157894736842
f1_score: 0.8063797013938957


In [13]:
# XGBoost on the DPC features
xgb_model = xgb.XGBClassifier()
xgb_model.fit(DPC_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(DPC_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb val ACC：", str(xgb_model.score(DPC_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(DPC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb val ACC： 0.7642105263157895
xgb train ACC： 0.9994734070563455
precision_score: 0.7648703681514544
recall_score: 0.7642105263157895
f1_score: 0.7644184498371377


In [14]:
# XGBoost on the CKSAAGP features
xgb_model = xgb.XGBClassifier()
xgb_model.fit(CKS_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(CKS_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb val ACC：", str(xgb_model.score(CKS_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(CKS_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb val ACC： 0.7768421052631579
xgb train ACC： 1.0
precision_score: 0.7809152668906307
recall_score: 0.7768421052631579
f1_score: 0.7772124901667311


In [15]:
# XGBoost on the k-mer features
xgb_model = xgb.XGBClassifier()
xgb_model.fit(KMER_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(KMER_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb VAL ACC：", str(xgb_model.score(KMER_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(KMER_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb VAL ACC： 0.751578947368421
xgb train ACC： 0.9968404423380727
precision_score: 0.7532534441694299
recall_score: 0.751578947368421
f1_score: 0.7519368391097265


In [16]:
# XGBoost on all hand-crafted features
xgb_model = xgb.XGBClassifier()
xgb_model.fit(HC_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(HC_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb VAL ACC：", str(xgb_model.score(HC_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(HC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb VAL ACC： 0.8063157894736842
xgb train ACC： 1.0
precision_score: 0.8083186574354149
recall_score: 0.8063157894736842
f1_score: 0.8066170760233918


In [17]:
# XGBoost on the padded integer encoding
xgb_model = xgb.XGBClassifier()
xgb_model.fit(OE_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(OE_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb VAL ACC：", str(xgb_model.score(OE_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(OE_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb VAL ACC： 0.7052631578947368
xgb train ACC： 1.0
precision_score: 0.7066481994459833
recall_score: 0.7052631578947368
f1_score: 0.7056434059738286


In [18]:
# XGBoost on the full feature matrix (hand-crafted + integer encoding)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_TRAIN, Y_TRAIN)
xgb_predict = xgb_model.predict(X_VAL)

# Accuracy on the validation split, then on the training split.
print("xgb ACC：", str(xgb_model.score(X_VAL, Y_VAL)))
print("xgb train ACC：", str(xgb_model.score(X_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, xgb_predict, average='weighted')))

xgb ACC： 0.8105263157894737
xgb train ACC： 1.0
precision_score: 0.8117337879110732
recall_score: 0.8105263157894737
f1_score: 0.8107707609831754


In [19]:
import joblib

# Persist whatever `xgb_model` currently refers to. When the cells are run in
# order, that is the classifier fitted on X_TRAIN in the preceding cell.
# NOTE(review): the "8105" in the file name is hard-coded (presumably the
# 0.8105 validation accuracy of that run); it is not derived from the model.
joblib.dump(xgb_model,filename='xgboost_main8105.joblib')

['xgboost_main8105.joblib']

In [20]:
# LightGBM on the AAC features
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(AAC_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(AAC_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb VAL ACC：", str(lgb_model.score(AAC_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(AAC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2061
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb VAL ACC： 0.8021052631578948
LGB train ACC： 0.9989468141126909
precision_score: 0.8029974392212539
recall_score: 0.8021052631578948
f1_score: 0.8023237133534616




In [21]:
# LightGBM on the DPC features
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(DPC_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(DPC_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(DPC_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(DPC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7100
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 208
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.7789473684210526
LGB train ACC： 0.9989468141126909
precision_score: 0.7790292297524634
recall_score: 0.7789473684210526
f1_score: 0.7789848381441782




In [22]:
# LightGBM on the CKSAAGP features
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(CKS_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(CKS_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(CKS_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(CKS_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11911
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 150
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.791578947368421
LGB train ACC： 1.0
precision_score: 0.7959716505861953
recall_score: 0.791578947368421
f1_score: 0.7919192734441831




In [23]:
# LightGBM on the k-mer features
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(KMER_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(KMER_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(KMER_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(KMER_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10926
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 311
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.7452631578947368
LGB train ACC： 0.9989468141126909
precision_score: 0.7453561343427061
recall_score: 0.7452631578947368
f1_score: 0.7453063372899575




In [24]:
# LightGBM on the padded integer encoding
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(OE_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(OE_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(OE_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(OE_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))



[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1008
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.7136842105263158
LGB train ACC： 0.9994734070563455
precision_score: 0.7188224660010681
recall_score: 0.7136842105263158
f1_score: 0.7141134505414057




In [25]:
# LightGBM on all hand-crafted features
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(HC_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(HC_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(HC_VAL, Y_VAL)))
print("LGB train ACC：", str(lgb_model.score(HC_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 31998
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 689
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.8378947368421052
LGB train ACC： 1.0
precision_score: 0.8388862628362864
recall_score: 0.8378947368421052
f1_score: 0.8380895012032249




In [26]:
import joblib

# Persist whatever `lgb_model` currently refers to. When the cells are run in
# order, that is the classifier fitted on HC_TRAIN in the preceding cell.
# NOTE(review): the "8378" in the file name is hard-coded (presumably the
# 0.8378 validation accuracy of that run); it is not derived from the model.
joblib.dump(lgb_model,filename='lgbm_main8378_hc.joblib')

['lgbm_main8378_hc.joblib']

In [27]:
# LightGBM on the full feature matrix (hand-crafted + integer encoding)
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_TRAIN, Y_TRAIN)
lgb_predict = lgb_model.predict(X_VAL)

# Accuracy on the validation split, then on the training split.
print("lgb ACC：", str(lgb_model.score(X_VAL, Y_VAL)))
print("LGBtrain ACC：", str(lgb_model.score(X_TRAIN, Y_TRAIN)))
# Weighted-average precision, recall and F1 of the validation predictions.
for metric_label, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, str(metric_fn(Y_VAL, lgb_predict, average='weighted')))

[LightGBM] [Info] Number of positive: 948, number of negative: 951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 33006
[LightGBM] [Info] Number of data points in the train set: 1899, number of used features: 739
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499210 -> initscore=-0.003160
[LightGBM] [Info] Start training from score -0.003160
lgb ACC： 0.8252631578947368
LGBtrain ACC： 1.0
precision_score: 0.827467512465767
recall_score: 0.8252631578947368
f1_score: 0.8255426759692802




In [28]:
import joblib

# Reload the saved hand-crafted-feature LightGBM model.
# joblib.load unpickles the file, so only load model files you trust.
model1 = joblib.load(filename='lgbm_main8378_hc.joblib')

# hc_test is built from the same feature blocks as hc_train, so its columns
# line up with what the model was fitted on.
lgb_predict_test = model1.predict(hc_test)

# Accuracy on the independent test set.
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(ACP_y_test_, lgb_predict_test)
print("lgb test ACC：", str(test_accuracy))

lgb test ACC： 0.8862453531598513




In [29]:
from sklearn.datasets import *
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# One SVM run per row: kernel, training features, validation features and the
# two report labels. Runs are executed in the listed order.
svm_runs = (
    ("rbf", OE_TRAIN, OE_VAL,
     "SVM (rbf, OE) Train Accuracy:", "SVM (rbf, OE) Validation Accuracy:"),
    ("rbf", HC_TRAIN, HC_VAL,
     "SVM (rbf, HC) Train Accuracy:", "SVM (rbf, HC) Validation Accuracy:"),
    ("rbf", X_TRAIN, X_VAL,
     "SVM (rbf, ALL) Train Accuracy:", "SVM (rbf, ALL) Validation Accuracy:"),
    ("poly", OE_TRAIN, OE_VAL,
     "SVM (poly, OE) Train Accuracy:", "SVM (poly, OE) Validation Accuracy:"),
    ("sigmoid", OE_TRAIN, OE_VAL,
     "SVM (sigmoid, OE) Train Accuracy:", "SVM (sigmoid, OE) Validation Accuracy:"),
)

for svm_kernel, svm_train, svm_val, train_label, val_label in svm_runs:
    classifier = svm.SVC(kernel=svm_kernel)
    classifier.fit(svm_train, Y_TRAIN)
    pre_train = classifier.predict(svm_train)
    pre_test = classifier.predict(svm_val)
    print(train_label, accuracy_score(Y_TRAIN, pre_train))
    print(val_label, accuracy_score(Y_VAL, pre_test))

SVM (rbf, OE) Train Accuracy: 0.7551342812006319
SVM (rbf, OE) Validation Accuracy: 0.5747368421052632
SVM (rbf, HC) Train Accuracy: 0.880463401790416
SVM (rbf, HC) Validation Accuracy: 0.8252631578947368
SVM (rbf, ALL) Train Accuracy: 0.7072143233280674
SVM (rbf, ALL) Validation Accuracy: 0.5810526315789474
SVM (poly, OE) Train Accuracy: 0.7962085308056872
SVM (poly, OE) Validation Accuracy: 0.5873684210526315
SVM (sigmoid, OE) Train Accuracy: 0.5481832543443917
SVM (sigmoid, OE) Validation Accuracy: 0.5410526315789473


In [32]:
import joblib
from sklearn import svm

# Fit an RBF-kernel SVM on the hand-crafted features (fit returns the
# estimator itself), then write it to disk.
classifier_svm_hc = svm.SVC(kernel="rbf").fit(HC_TRAIN, Y_TRAIN)

joblib.dump(classifier_svm_hc, filename='svm_rbf_hc.joblib')

['svm_rbf_hc.joblib']

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,GridSearchCV

def _fit_forest(train_features, val_features):
    """Fit a seeded random forest and return it with its validation accuracy."""
    model = RandomForestClassifier(random_state=1)
    model.fit(train_features, Y_TRAIN)
    return model, model.score(val_features, Y_VAL)

# AAC features
forest, result_aac = _fit_forest(AAC_TRAIN, AAC_VAL)
print(f"RandomForest (AAC) Validation Accuracy: {result_aac}")

# DPC features
forest, result_dpc = _fit_forest(DPC_TRAIN, DPC_VAL)
print(f"RandomForest (DPC) Validation Accuracy: {result_dpc}")

# CKSAAGP features
forest, result_cks = _fit_forest(CKS_TRAIN, CKS_VAL)
print(f"RandomForest (CKS) Validation Accuracy: {result_cks}")

# k-mer features
forest, result_kmer = _fit_forest(KMER_TRAIN, KMER_VAL)
print(f"RandomForest (KMER) Validation Accuracy: {result_kmer}")

# Padded integer encoding
forest, result_oe = _fit_forest(OE_TRAIN, OE_VAL)
print(f"RandomForest (OE) Validation Accuracy: {result_oe}")

# All hand-crafted features
forest, result_hc = _fit_forest(HC_TRAIN, HC_VAL)
print(f"RandomForest (HC) Validation Accuracy: {result_hc}")

# Full feature matrix (hand-crafted + integer encoding)
forest, result_all = _fit_forest(X_TRAIN, X_VAL)
print(f"RandomForest (ALL) Validation Accuracy: {result_all}")

RandomForest (AAC) Validation Accuracy: 0.8210526315789474
RandomForest (DPC) Validation Accuracy: 0.8063157894736842
RandomForest (CKS) Validation Accuracy: 0.7663157894736842
RandomForest (KMER) Validation Accuracy: 0.7389473684210527
RandomForest (OE) Validation Accuracy: 0.671578947368421
RandomForest (HC) Validation Accuracy: 0.8042105263157895
RandomForest (ALL) Validation Accuracy: 0.8084210526315789


In [31]:
import joblib


# Fit the seeded random forest on the AAC features (fit returns the estimator
# itself), then write it to disk.
forest_aac_model = RandomForestClassifier(random_state=1).fit(AAC_TRAIN, Y_TRAIN)

joblib.dump(forest_aac_model, filename='random_forest_aac.joblib')

['random_forest_aac.joblib']