In [40]:
import pandas as pd

from itertools import product

from tqdm import tqdm

In [41]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')

print(train.shape)
print(test.shape)

(67447, 43)
(18816, 42)


In [42]:
bases = {'A', 'C', 'G', 'N', 'T'}

In [43]:
subsequences = []
n = 5
for i in range(1,n+1):
    subsequences.extend(''.join(permutation) for permutation in product(bases, repeat=i))
print(f"Number of subsequences: {len(subsequences)}")
subsequences[:10]

Number of subsequences: 3905


['T', 'A', 'N', 'G', 'C', 'TT', 'TA', 'TN', 'TG', 'TC']

In [44]:
def get_ngram_features(data, subsequences):
    features = pd.DataFrame(index=data.index)
    features['sequence_id']=data['sequence_id']
    for subseq in tqdm(subsequences):
         features[subseq] = data.sequence.str.count(subseq)
    return features

In [45]:
# Calculate n-gram features on our training set
train_ngram_features = get_ngram_features(train, subsequences)
print(train_ngram_features.shape)
train_ngram_features.head()

100%|██████████| 3905/3905 [50:50<00:00,  1.28it/s](67447, 3906)



Unnamed: 0,sequence_id,T,A,N,G,C,TT,TA,TN,TG,...,CCCGT,CCCGA,CCCGN,CCCGG,CCCGC,CCCCT,CCCCA,CCCCN,CCCCG,CCCCC
0,9ZIMC,1586,1848,0,1794,1923,326,285,0,451,...,5,9,0,2,9,8,19,0,6,7
1,5SAQC,107,124,0,107,118,18,12,0,47,...,2,0,0,0,0,2,0,0,2,1
2,E7QRO,72,220,92,971,95,11,9,4,40,...,0,0,0,0,1,0,0,0,0,0
3,CT5FP,196,230,0,235,253,34,27,0,69,...,1,0,0,0,2,1,1,0,1,1
4,7PTD8,243,386,72,415,234,34,51,1,92,...,0,0,0,0,0,0,1,1,0,0


In [46]:
# Calculate n-gram features on our training set
test_ngram_features = get_ngram_features(test, subsequences)
print(test_ngram_features.shape)
test_ngram_features.head()

100%|██████████| 3905/3905 [15:48<00:00,  4.12it/s](18816, 3906)



Unnamed: 0,sequence_id,T,A,N,G,C,TT,TA,TN,TG,...,CCCGT,CCCGA,CCCGN,CCCGG,CCCGC,CCCCT,CCCCA,CCCCN,CCCCG,CCCCC
0,E0VFT,2358,2363,0,2310,2348,432,558,0,674,...,7,9,0,6,11,9,13,0,16,12
1,TTRK5,1435,1454,0,1960,1824,260,230,0,476,...,8,12,0,13,10,5,8,0,10,9
2,2Z7FZ,2157,2222,0,2434,2231,464,437,0,618,...,11,17,0,6,11,6,8,0,10,9
3,VJI6E,174,251,0,365,440,18,15,0,68,...,1,1,0,2,2,1,4,0,2,1
4,721FI,1659,1631,0,1513,1575,381,316,0,412,...,3,5,0,5,8,6,12,0,6,7


In [47]:
train_ngram_features.to_csv('../data/features/ngram/'+str(n)+'_ngram_train.csv',index=False)
test_ngram_features.to_csv('../data/features/ngram/'+str(n)+'_ngram_test.csv',index=False)