In [79]:
import creatingFeatures as cf
import pandas as pd
import re
import numpy as np

# Stems of the dataset files; read_fasta() turns each into "./data/<stem>.txt".
# Presumably TR = training split and TS = test split -- confirm with the data source.
fileNames = [
    "TR_neg_SPIDER",
    "TR_pos_SPIDER",
    "TS_neg_SPIDER",
    "TS_pos_SPIDER",
]

# Column-name prefixes, one per feature group. The order must match the order
# in which createFeatureVectors() assembles its feature groups.
featNames = "AAC,DPC,CTD,PAAC,APAAC,RSacid,RSpolar,RSsecond,RScharge,RSDHP".split(',')

In [100]:
def read_fasta(file):
    """Parse ``./data/<file>.txt`` as FASTA and return ``[header, sequence]`` pairs.

    A record starts at every line that begins with ``>``; the rest of that line
    is the header. All following lines up to the next header are joined,
    upper-cased, and stripped of every character that is not one of the 20
    letters ``ACDEFGHIKLMNPQRSTVWY``. Anything before the first header line is
    ignored.

    Parameters
    ----------
    file : str
        File stem; the path read is ``"./data/" + file + ".txt"``.

    Returns
    -------
    list[list[str]]
        One ``[header, sequence]`` two-element list per record, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    records = []  # each entry: [header, [sequence line, ...]]

    # Context manager so the handle is closed even if parsing fails.
    with open("./data/" + file + ".txt") as handle:
        for raw in handle:
            line = raw.rstrip('\n')
            if line.startswith('>'):
                # Only a '>' in column 0 opens a record, so a '>' inside a
                # header (e.g. "a->b") no longer splits the record in two.
                records.append([line[1:], []])
            elif records:
                # Every sequence line is kept, including the final line of a
                # file that does not end with a newline.
                records[-1][1].append(line)

    return [
        [header, re.sub('[^ACDEFGHIKLMNPQRSTVWY]', '', ''.join(chunks).upper())]
        for header, chunks in records
    ]

def createFeatureVectors(fasta, sign):
    """Build one feature DataFrame (plus a TARGET label column) for a set of sequences.

    Ten feature groups are computed with ``cf.Features`` and placed side by
    side. Columns are named ``<prefix>_<k>`` where ``<prefix>`` is the matching
    entry of the module-level ``featNames`` list and ``k`` counts from 1 within
    the group. A constant ``TARGET`` column is appended last.

    Parameters
    ----------
    fasta : list
        Sequence records passed unchanged to every ``cf.Features`` method;
        presumably the ``[header, sequence]`` pairs produced by
        ``read_fasta`` -- confirm against ``creatingFeatures``.
    sign : str
        ``"+"`` labels every row with TARGET = 1; any other value labels
        every row with TARGET = 0.

    Returns
    -------
    pandas.DataFrame
        One row per entry of each feature group, all feature columns, then
        ``TARGET``.

    Notes
    -----
    The width of each group is taken from its first row (``len(item[0])``),
    so a group with no rows raises ``IndexError``.
    """
    feat = cf.Features()

    # Order matters: position i here is named with featNames[i] below.
    # AAC/DPC/CTD*/PAAC/APAAC results are indexed with [0]; the reduced*
    # results are used as returned (what element [0] holds is defined in
    # creatingFeatures, which is not visible here).
    feat_list = [
        feat.AAC(fasta)[0],
        feat.DPC(fasta, 0)[0],
        # CTD is the column-wise concatenation of its three sub-descriptors.
        np.hstack((feat.CTDC(fasta)[0], feat.CTDD(fasta)[0], feat.CTDT(fasta)[0])),
        feat.PAAC(fasta, 1)[0],
        feat.APAAC(fasta, 1)[0],
        feat.reducedACID(fasta),
        feat.reducedPOLAR(fasta),
        feat.reducedSECOND(fasta),
        feat.reducedCHARGE(fasta),
        feat.reducedDHP(fasta),
    ]

    # Build every per-group frame first and concatenate once, instead of
    # repeatedly concatenating onto an empty seed frame.
    frames = [
        pd.DataFrame(
            item,
            columns=[f"{featNames[i]}_{k}" for k in range(1, len(item[0]) + 1)],
        )
        for i, item in enumerate(feat_list)
    ]
    df_main = pd.concat(frames, axis=1)

    df_main["TARGET"] = 1 if sign == "+" else 0

    return df_main

def createDataset(dataframes, feature):
    """Stack the columns of one feature group, plus TARGET, from several frames.

    For each frame, every column whose name starts with ``feature`` is kept
    together with that frame's ``TARGET`` column; the per-frame results are
    then stacked row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames that each contain a ``TARGET`` column.
    feature : str
        Column-name prefix to select. Matching is a plain prefix test, so a
        prefix shared by several groups selects all of them.

    Returns
    -------
    pandas.DataFrame
        The selected columns followed by ``TARGET``; an empty DataFrame when
        ``dataframes`` is empty.

    Raises
    ------
    KeyError
        If a frame has no ``TARGET`` column.
    """
    parts = []
    for df in dataframes:
        selected = [col for col in df.columns if col.startswith(feature)]
        parts.append(pd.concat([df[selected], df["TARGET"]], axis=1))

    # pd.concat rejects an empty list; keep returning an empty frame instead.
    if not parts:
        return pd.DataFrame()

    # Single concat rather than growing a frame inside the loop.
    return pd.concat(parts, axis=0, ignore_index=True)

In [101]:
# Parse the first two dataset files (fileNames[0] and fileNames[1]).
fs_tr_neg = read_fasta(fileNames[0])
fs_tr_pos = read_fasta(fileNames[1])

# Index 0 holds the "-" frame (TARGET = 0), index 1 the "+" frame (TARGET = 1).
feat_tr_vecs = [
    createFeatureVectors(fs_tr_neg, "-"),
    createFeatureVectors(fs_tr_pos, "+"),
]
feat_tr_vecs[0]

Unnamed: 0,AAC_1,AAC_2,AAC_3,AAC_4,AAC_5,AAC_6,AAC_7,AAC_8,AAC_9,AAC_10,...,RSDHP_32,RSDHP_33,RSDHP_34,RSDHP_35,RSDHP_36,RSDHP_37,RSDHP_38,RSDHP_39,RSDHP_40,TARGET
0,0.000000,0.098039,0.000000,0.039216,0.058824,0.019608,0.000000,0.039216,0.078431,0.078431,...,0.083333,0.444444,0.333333,0.111111,0.000000,0.000000,0.666667,0.000000,0.000000,0
1,0.042105,0.026316,0.042105,0.036842,0.152632,0.057895,0.021053,0.052632,0.021053,0.147368,...,0.100000,0.555556,0.277778,0.055556,0.055556,0.500000,0.125000,0.187500,0.125000,0
2,0.089783,0.037152,0.034056,0.040248,0.027864,0.083591,0.012384,0.037152,0.034056,0.117647,...,0.075000,0.358974,0.282051,0.230769,0.102564,0.440000,0.320000,0.040000,0.160000,0
3,0.000000,0.024390,0.024390,0.073171,0.000000,0.170732,0.024390,0.024390,0.000000,0.024390,...,0.117647,0.444444,0.333333,0.111111,0.000000,0.200000,0.600000,0.000000,0.000000,0
4,0.017544,0.035088,0.052632,0.052632,0.000000,0.070175,0.035088,0.052632,0.035088,0.157895,...,0.055556,0.500000,0.300000,0.100000,0.000000,0.285714,0.142857,0.142857,0.285714,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1314,0.118868,0.018868,0.037736,0.037736,0.032075,0.084906,0.016981,0.013208,0.043396,0.115094,...,0.059211,0.412500,0.275000,0.187500,0.112500,0.585366,0.170732,0.073171,0.146341,0
1315,0.039216,0.013072,0.026144,0.071895,0.045752,0.078431,0.039216,0.026144,0.045752,0.032680,...,0.094340,0.227273,0.500000,0.090909,0.136364,0.312500,0.437500,0.062500,0.125000,0
1316,0.047244,0.015748,0.031496,0.062992,0.039370,0.094488,0.023622,0.023622,0.094488,0.086614,...,0.055556,0.285714,0.214286,0.250000,0.214286,0.538462,0.230769,0.153846,0.000000,0
1317,0.063898,0.015974,0.051118,0.086262,0.035144,0.092652,0.019169,0.038339,0.038339,0.115016,...,0.145455,0.484848,0.303030,0.060606,0.121212,0.340909,0.431818,0.090909,0.090909,0


In [102]:
# Display the frame built with sign "+" (every row has TARGET = 1).
feat_tr_vecs[1]

Unnamed: 0,AAC_1,AAC_2,AAC_3,AAC_4,AAC_5,AAC_6,AAC_7,AAC_8,AAC_9,AAC_10,...,RSDHP_32,RSDHP_33,RSDHP_34,RSDHP_35,RSDHP_36,RSDHP_37,RSDHP_38,RSDHP_39,RSDHP_40,TARGET
0,0.084367,0.024814,0.047146,0.047146,0.024814,0.086849,0.027295,0.024814,0.042184,0.119107,...,0.093023,0.473684,0.228070,0.175439,0.087719,0.487179,0.282051,0.102564,0.102564,1
1,0.087444,0.035874,0.033632,0.038117,0.060538,0.040359,0.015695,0.085202,0.042601,0.091928,...,0.052980,0.510638,0.191489,0.170213,0.106383,0.303030,0.424242,0.181818,0.060606,1
2,0.062500,0.034375,0.040625,0.059375,0.046875,0.059375,0.026562,0.062500,0.040625,0.096875,...,0.119149,0.338028,0.408451,0.126761,0.112676,0.461538,0.307692,0.123077,0.092308,1
3,0.094945,0.012295,0.045082,0.051230,0.018443,0.267077,0.006148,0.016393,0.038934,0.032787,...,0.137190,0.166667,0.673913,0.050725,0.101449,0.338028,0.422535,0.169014,0.063380,1
4,0.047757,0.034009,0.050651,0.076700,0.042692,0.066570,0.026773,0.047757,0.048480,0.098408,...,0.111579,0.326203,0.401070,0.149733,0.117647,0.451977,0.305085,0.090395,0.146893,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,0.048465,0.009693,0.066236,0.059774,0.033926,0.061389,0.029079,0.050081,0.059774,0.092084,...,0.118483,0.373737,0.313131,0.202020,0.090909,0.392405,0.367089,0.139241,0.088608,1
1219,0.061350,0.020450,0.050613,0.060838,0.069018,0.055726,0.015337,0.069530,0.051125,0.105828,...,0.108197,0.404545,0.368182,0.163636,0.059091,0.383562,0.324201,0.095890,0.191781,1
1220,0.101732,0.021645,0.050866,0.096320,0.041126,0.051948,0.027056,0.028139,0.041126,0.106061,...,0.118321,0.393103,0.275862,0.165517,0.158621,0.416058,0.299270,0.124088,0.153285,1
1221,0.083333,0.020833,0.069444,0.059028,0.027778,0.079861,0.027778,0.079861,0.090278,0.052083,...,0.155556,0.361702,0.340426,0.106383,0.170213,0.315789,0.368421,0.157895,0.105263,1


In [106]:
# Keep only the "RSsecond" feature columns (plus TARGET) from both frames,
# stacked row-wise with a fresh index.
df = createDataset(feat_tr_vecs, "RSsecond")

# Split into the feature matrix and the label vector.
y = df["TARGET"]
X = df.drop(columns=["TARGET"])

df

Unnamed: 0,RSsecond_1,RSsecond_2,RSsecond_3,RSsecond_4,RSsecond_5,RSsecond_6,RSsecond_7,RSsecond_8,RSsecond_9,RSsecond_10,...,RSsecond_24,RSsecond_25,RSsecond_26,RSsecond_27,RSsecond_28,RSsecond_29,RSsecond_30,RSsecond_31,RSsecond_32,TARGET
0,0.333333,0.509804,0.156863,0.000000,0.192308,0.000000,0.117647,0.115385,0.125000,0.000000,...,0.222222,0.555556,0.166667,0.407407,0.407407,0.148148,0.111111,0.555556,0.111111,0
1,0.352632,0.352632,0.294737,0.119403,0.074627,0.142857,0.104478,0.432836,0.196429,0.059701,...,0.367647,0.264706,0.352941,0.294118,0.426471,0.250000,0.368421,0.350877,0.263158,0
2,0.408669,0.241486,0.349845,0.219697,0.153846,0.097345,0.098485,0.115385,0.238938,0.030303,...,0.466165,0.187970,0.338346,0.316456,0.253165,0.417722,0.385965,0.289474,0.307018,0
3,0.390244,0.170732,0.439024,0.000000,0.142857,0.055556,0.187500,0.000000,0.388889,0.062500,...,0.352941,0.176471,0.411765,0.375000,0.125000,0.375000,0.315789,0.157895,0.421053,0
4,0.456140,0.245614,0.298246,0.038462,0.142857,0.176471,0.115385,0.000000,0.235294,0.076923,...,0.518519,0.111111,0.333333,0.266667,0.400000,0.266667,0.388889,0.277778,0.222222,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,0.424879,0.240711,0.334410,0.114068,0.040268,0.198068,0.140684,0.140940,0.183575,0.068441,...,0.477273,0.223485,0.291667,0.393333,0.253333,0.346667,0.370192,0.250000,0.375000,1
2538,0.399796,0.323108,0.277096,0.153453,0.063291,0.182657,0.152174,0.213608,0.201107,0.038363,...,0.422733,0.297573,0.278416,0.364929,0.371248,0.262243,0.403315,0.302026,0.290976,1
2539,0.515152,0.226190,0.258658,0.197479,0.095694,0.196653,0.186975,0.181818,0.200837,0.052521,...,0.553459,0.194969,0.249476,0.466667,0.233333,0.290476,0.470833,0.279167,0.245833,1
2540,0.434028,0.305556,0.260417,0.192000,0.068182,0.266667,0.136000,0.090909,0.306667,0.064000,...,0.460317,0.269841,0.261905,0.449438,0.314607,0.224719,0.355263,0.328947,0.289474,1


In [107]:
# Feature matrix: df with the TARGET column dropped.
X

Unnamed: 0,RSsecond_1,RSsecond_2,RSsecond_3,RSsecond_4,RSsecond_5,RSsecond_6,RSsecond_7,RSsecond_8,RSsecond_9,RSsecond_10,...,RSsecond_23,RSsecond_24,RSsecond_25,RSsecond_26,RSsecond_27,RSsecond_28,RSsecond_29,RSsecond_30,RSsecond_31,RSsecond_32
0,0.333333,0.509804,0.156863,0.000000,0.192308,0.000000,0.117647,0.115385,0.125000,0.000000,...,0.192308,0.222222,0.555556,0.166667,0.407407,0.407407,0.148148,0.111111,0.555556,0.111111
1,0.352632,0.352632,0.294737,0.119403,0.074627,0.142857,0.104478,0.432836,0.196429,0.059701,...,0.014925,0.367647,0.264706,0.352941,0.294118,0.426471,0.250000,0.368421,0.350877,0.263158
2,0.408669,0.241486,0.349845,0.219697,0.153846,0.097345,0.098485,0.115385,0.238938,0.030303,...,0.038462,0.466165,0.187970,0.338346,0.316456,0.253165,0.417722,0.385965,0.289474,0.307018
3,0.390244,0.170732,0.439024,0.000000,0.142857,0.055556,0.187500,0.000000,0.388889,0.062500,...,0.000000,0.352941,0.176471,0.411765,0.375000,0.125000,0.375000,0.315789,0.157895,0.421053
4,0.456140,0.245614,0.298246,0.038462,0.142857,0.176471,0.115385,0.000000,0.235294,0.076923,...,0.071429,0.518519,0.111111,0.333333,0.266667,0.400000,0.266667,0.388889,0.277778,0.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,0.424879,0.240711,0.334410,0.114068,0.040268,0.198068,0.140684,0.140940,0.183575,0.068441,...,0.147651,0.477273,0.223485,0.291667,0.393333,0.253333,0.346667,0.370192,0.250000,0.375000
2538,0.399796,0.323108,0.277096,0.153453,0.063291,0.182657,0.152174,0.213608,0.201107,0.038363,...,0.075949,0.422733,0.297573,0.278416,0.364929,0.371248,0.262243,0.403315,0.302026,0.290976
2539,0.515152,0.226190,0.258658,0.197479,0.095694,0.196653,0.186975,0.181818,0.200837,0.052521,...,0.052632,0.553459,0.194969,0.249476,0.466667,0.233333,0.290476,0.470833,0.279167,0.245833
2540,0.434028,0.305556,0.260417,0.192000,0.068182,0.266667,0.136000,0.090909,0.306667,0.064000,...,0.090909,0.460317,0.269841,0.261905,0.449438,0.314607,0.224719,0.355263,0.328947,0.289474


In [108]:
# Label vector: the TARGET column of df.
y

0       0
1       0
2       0
3       0
4       0
       ..
2537    1
2538    1
2539    1
2540    1
2541    1
Name: TARGET, Length: 2542, dtype: int64