# Dataset creation and refinement

## Getting sequences from RepeatMasker annotaion

In [1]:
from pyfaidx import Fasta
import pandas as pd

In [None]:
# Input files
FASTA='RepeatMasker_masked_sequences.fasta'
COORD='db_te_class_beta_1.csv'
# Read fasta
tes = Fasta(FASTA)

In [2]:
df = pd.read_csv(COORD,usecols=('id','start','end'))
df

Unnamed: 0,id,start,end
0,5S,1,119
1,5S-Sauria,1,348
2,5S_CPo,122,136
3,AACOPIA1_I,1,4110
4,AACOPIA1_LTR,1,270
...,...,...,...
56662,X9c_DNA_2,5,287
56663,hAT-N39_DR_1,1,1248
56664,hAT-N39_DR_1,1346,1594
56665,nhAT1_ML_1,1,235


In [3]:
df.loc[df.duplicated()]

Unnamed: 0,id,start,end


In [62]:
df.drop_duplicates(inplace=True, ignore_index=True)

In [4]:
valores = df.values.tolist()
valores[1]

['5S-Sauria', 1, 348]

In [5]:
for item in tes.keys():
    if 'TIGGER2' in item:
        print(item)

TIGGER2
TIGGER2_1
TIGGER2_2
TIGGER2_3
TIGGER2_4


In [6]:
with open('db_te_class_beta_1.fasta','w') as fout:    
    for fcoord in valores:
        nome, comeco, fim = fcoord
        fseq = tes.get_seq(nome,comeco,fim)
        fseq = str(fseq)
        ffull = f'>{nome}:{comeco}-{fim}\n{fseq}\n'
        fout.write(ffull)
        fcoord.append(fseq)

In [7]:
print(valores[1])

['5S-Sauria', 1, 348, 'gcctacggccataccaccctgaacacgcccgatctcgtctgatctcggaagctaagcagggtcgggcctggttagtacttggatgggagaccgcctgggaataccgggtgctgtaggctttagccccagcttctgccaacctagcagttcgaaaacatgcaaatgtgagtagatcaataggtaccgctccggcgggaaggtaacggcgctccatgcagtcatgccggccacatgaccttggaggtgtctacggacaacgccggctcttcggcttagaaatggagatgagcaccaacccccagagtcggacatgactggacttaatgtcaggggaaaacctttaccttt']


In [8]:
novo_df = pd.DataFrame(valores,columns=['id', 'start', 'end', 'sequence'])
novo_df.head()

Unnamed: 0,id,start,end,sequence
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,5S_CPo,122,136,aaaaaaaaaaaaaaa
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...


In [9]:
novo_df['length'] = novo_df['sequence'].apply(len)
novo_df.head()

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
2,5S_CPo,122,136,aaaaaaaaaaaaaaa,15
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270


## Cleaning and filtering

In [None]:
# Removing unwanted characters from sequences
unwnts = 'bdhkmrsvwxy'
for unwnt in unwnts:
    novo_df['sequence']=novo_df['sequence'].str.replace(unwnt,'n')

In [11]:
novo_df.loc[novo_df['sequence'].str.contains('n')]

Unnamed: 0,id,start,end,sequence,length
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
11,ACASINE2,1,474,ggggctgcggtggcacaatgggttaaacccttgtgctgctgaactg...,474
22,ACROBAT1,864,1379,ntagtgacagaataaaattgattgtttttggtcaattttgacagct...,516
24,ACSINE1,1,214,tgcaaaggatcgttaccttgacgcggtgctggggcttgngcgcctc...,214
29,AFROSINE,1,160,tgctaaccaaaaggtcggcagttcgaaaccaccagctgctccangg...,160
...,...,...,...,...,...
56661,X9c_DNA_1,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


In [12]:
novo_df.loc[novo_df['id'].str.contains('TIGGER2')]

Unnamed: 0,id,start,end,sequence,length
45216,TIGGER2,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708
55800,TIGGER2_1,1,2718,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2718
55801,TIGGER2_2,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708
55802,TIGGER2_3,1,2718,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2718
55803,TIGGER2_4,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708


Getting the 'n' character frequency

In [13]:
print(novo_df.loc[novo_df['sequence'].str.count('n').idxmax()]) # idxmax() returns the index
contagem = novo_df["sequence"].str.count('n').max() # max() returns the max value
print(f'There is {contagem} occurences of "n" in this sequence')

id                                                 AviRTE_ZCh
start                                                     546
end                                                      3383
sequence    agcatcctgaatgaccaagcagcctttttcaaggagagcactgctt...
length                                                   2838
Name: 759, dtype: object
There is 677 occurences of "n" in this sequence


Filtering dataframe by length and 'n' frequency

In [14]:
novo_df.loc[novo_df['length'] <= 50]

Unnamed: 0,id,start,end,sequence,length
2,5S_CPo,122,136,aaaaaaaaaaaaaaa,15
40,AGM1,1,42,tcgttgctaaccaaaaccatccaacgaaaaacacattcaaat,42
495,ATMU10,3176,3214,gctctgtcccaaccgtctctgtcccaacagtcccaaaga,39
816,BARE1_HV,7079,7103,attgactctagtgcaagtgggagac,25
843,BEL-1-LTR_NVi,945,969,atttgaactcattagcttttgttca,25
...,...,...,...,...,...
56319,hAT-N18_ZM_1,1,14,tagggctgggcaaa,14
56373,CR1-1_HM_1,27,37,tcatgatcatc,11
56374,CR1-1_HM_1,82,98,tgctgccttgcaggatt,17
56380,DNA-2-32_DR_1,1,25,cccttacagaaaagtcacatgaatt,25


In [15]:
# setting sequence length threshold
novo_df=novo_df.loc[novo_df['length'] >= 50]
novo_df

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
...,...,...,...,...,...
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56664,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


In [16]:
frequencia = (novo_df['sequence'].str.count('n')/novo_df['length'])*100
print(novo_df.loc[frequencia.idxmax()])
print(frequencia.max())

id                                                 AviRTE_ZCh
start                                                    3721
end                                                      3992
sequence    tggtctaccctggcctccaatcgggaggcctggagacacaccatct...
length                                                    272
Name: 760, dtype: object
28.676470588235293


In [17]:
full_df = pd.read_csv(COORD)
full_df.loc[full_df['id'].isin(novo_df.loc[frequencia >= 15]['id'])]

Unnamed: 0,id,class,order,family,score,start,end
758,AviRTE_ZCh,Retro,LINE,RTE,1396,81,232
759,AviRTE_ZCh,Retro,LINE,RTE,16839,546,3383
760,AviRTE_ZCh,Retro,LINE,RTE,1358,3721,3992
29285,Helitron-N3_NV,DNA,dnaSC2,Helitron,6946,1,921
53000,Hoana8_1,DNA,TIR,hAT,17959,1,1928
53001,Hoana8_1,DNA,TIR,hAT,1607,2200,2371
53002,Hoana8_1,DNA,TIR,hAT,6547,2417,3123
53003,Hoana8_1,DNA,dnaSC2,Helitron,1229,2083,2333


In [20]:
# checking what sequences have 'n' frequency equal or grater than value
novo_df.loc[frequencia >= 15]

Unnamed: 0,id,start,end,sequence,length
759,AviRTE_ZCh,546,3383,agcatcctgaatgaccaagcagcctttttcaaggagagcactgctt...,2838
760,AviRTE_ZCh,3721,3992,tggtctaccctggcctccaatcgggaggcctggagacacaccatct...,272
29285,Helitron-N3_NV,1,921,ctcgggctacctgtggtagcacgagtctaggtttacctttcctata...,921
53003,Hoana8_1,2083,2333,gtgttttttaagttagagggttgggactttccacacatgttatatt...,251


In [22]:
full_df.loc[full_df['id'].isin(novo_df.loc[frequencia >= 15]['id'])]['order'].value_counts()

Series([], Name: order, dtype: int64)

In [23]:
full_df['order'].value_counts()

LTR         29635
TIR         15180
LINE         7811
dnaSC2       1505
SINE         1018
Penelope      676
DIRS          480
Crypton       362
Name: order, dtype: int64

In [21]:
novo_df=novo_df.loc[frequencia <= 15]
novo_df

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
...,...,...,...,...,...
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56664,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


## Dataset completion

In [25]:
merged = pd.merge(novo_df,full_df,on=['id','start','end'],how='inner')
merged

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119,Retro,SINE,5S,844
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348,Retro,SINE,5S,3230
2,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110,Retro,LTR,Copia,38433
3,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270,Retro,LTR,Copia,2518
4,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188,Retro,LINE,I,38902
...,...,...,...,...,...,...,...,...,...
56240,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283,DNA,TIR,hAT,2291
56241,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248,DNA,TIR,hAT,13580
56242,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249,DNA,TIR,hAT,13580
56243,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235,DNA,TIR,hAT,2167


In [28]:
merged.loc[merged['id'].str.contains('TIGGER1')]

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
44898,TIGGER1,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332
55386,TIGGER1_1,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332
55387,TIGGER1_2,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332


In [234]:
teste = pd.merge(novo_df,merged, indicator=True, how='outer')
teste_diff = teste.loc[teste['_merge'] != 'both']
teste_diff

Unnamed: 0,id,start,end,sequence,length,class,order,family,species,score,_merge


Checking for duplicated sequences

In [32]:
dup_seqs = merged.loc[merged['sequence'].duplicated()]
dup_seqs

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
135,ATCOPIA23I,1,4502,tggtatcagagcagtgattctaaaagacctaatttttttttttttt...,4502,Retro,LTR,Copia,42203
136,ATCOPIA23LTR,1,120,tataaggagttgtatatattaaggatataattgtcattagtagttg...,120,Retro,LTR,Copia,1063
322,ATDNA12T3_2,2,358,tgtaacgccccgaccgcccctaccagtgggccccacgtcctctctc...,357,Retro,LTR,Cassandra,3186
323,ATDNA12T3_2,359,470,attcacccccactaacagatcgcaacgtcctcgttgcacaccaaga...,112,Retro,LTR,Cassandra,866
356,ATGP1I,1,5977,atttggtatcagagcgattacggttctaggatgtgtagaaaaatta...,5977,Retro,LTR,Gypsy,54683
...,...,...,...,...,...,...,...,...,...
56236,X1_DNA_2,1,606,tatgcaaaggatatattcctgaagancttgcataattcaaaactta...,606,DNA,TIR,TcMar,4423
56237,X21_DNA_1,14,159,atctccttgtctgttttctgcctgcatagtcatgcacagctttctg...,146,DNA,TIR,hAT,1028
56238,X21_DNA_2,14,159,atctccttgtctgttttctgcctgcatagtcatgcacagctttctg...,146,DNA,TIR,hAT,1028
56239,X9c_DNA_1,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283,DNA,TIR,hAT,2291


In [46]:
merged.shape

(56245, 9)

Removing duplicates

In [47]:
merged_no_dup_seq = merged.loc[~merged['sequence'].duplicated()]
merged_no_dup_seq

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119,Retro,SINE,5S,844
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348,Retro,SINE,5S,3230
2,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110,Retro,LTR,Copia,38433
3,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270,Retro,LTR,Copia,2518
4,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188,Retro,LINE,I,38902
...,...,...,...,...,...,...,...,...,...
56193,UCON15_1,1,165,ttcatctaaggancaaacaccttaagctgttgttttcaagttttat...,165,DNA,TIR,hAT,1255
56241,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248,DNA,TIR,hAT,13580
56242,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249,DNA,TIR,hAT,13580
56243,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235,DNA,TIR,hAT,2167


In [48]:
merged_no_dup_seq.to_csv('db_te_classified_1.1v_nds.csv',index=False)

Getting species for TEs

In [2]:
merged = pd.read_csv('db_te_classified_2.0v.csv')
merged.head()

Unnamed: 0,id,start,end,length,class,order,family,score,sequence
0,5S,1,119,119,Retro,SINE,5S,844,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...


In [3]:
spps = pd.read_csv('tes_species.csv', names = ('id','species'))
spps.head()

Unnamed: 0,id,species
0,4.5SRNA,Rodentia
1,5S_CPo_1,Cavia porcellus
2,5S_CPo,Cavia porcellus
3,5S_DM,Drosophila melanogaster
4,5S,Homo sapiens


In [5]:
spp_merged = merged.merge(spps, on = 'id', how = 'inner', sort = True)
spp_merged

Unnamed: 0,id,start,end,length,class,order,family,score,sequence,species
0,5S,1,119,119,Retro,SINE,5S,844,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,Homo sapiens
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,Anolis carolinensis
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,Aedes aegypti
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,Aedes aegypti
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,Anopheles gambiae
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...,Oryza sativa
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...,Oryza sativa
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...,Vertebrata
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...,Felis catus


In [6]:
spp_merged.columns

Index(['id', 'start', 'end', 'length', 'class', 'order', 'family', 'score',
       'sequence', 'species'],
      dtype='object')

In [7]:
spp_merged = spp_merged[['id', 'start', 'end', 'length', 'class', 'order', 'family',
       'score', 'species', 'sequence']]
spp_merged

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S,1,119,119,Retro,SINE,5S,844,Homo sapiens,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,Oryza sativa,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,Oryza sativa,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,Vertebrata,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,Felis catus,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...


In [8]:
spp_merged.to_csv('db_te_classified_2.1v.csv',index=False)

## Data augmentation using reverse complement sequences

In [9]:
print(spp_merged['class'].value_counts())
print('====================')
print(spp_merged['order'].value_counts())
print('====================')
print(spp_merged['family'].value_counts())

Retro    35663
DNA      14960
Name: class, dtype: int64
LTR         26827
TIR         13252
LINE         7008
dnaSC2       1380
SINE          729
Penelope      653
DIRS          446
Crypton       328
Name: order, dtype: int64
Gypsy    12348
Copia     7322
hAT       3820
TcMar     2985
ERV1      2611
         ...  
Genie        3
SVA          2
Ceph         2
R4           2
TRIM         2
Name: family, Length: 64, dtype: int64


In [10]:
def reverse_complement(dna):
    complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}
    return ''.join([complement[base] for base in dna[::-1]])

Reverse complement dataset

In [11]:
copiado = spp_merged.copy()
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S,1,119,119,Retro,SINE,5S,844,Homo sapiens,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,Oryza sativa,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,Oryza sativa,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,Vertebrata,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,Felis catus,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...


In [12]:
# Replacing sequences with reverse complement sequences in copied dataframe
copiado['sequence'] = copiado['sequence'].apply(reverse_complement)
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S,1,119,119,Retro,SINE,5S,844,Homo sapiens,aaccctacagaacccggtgttcccaggctctctcccatcaggtacc...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,aaaggtaaaggttttcccctgacattaagtccagtcatgtccgact...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ctcctcctcagcggttacctgcttcactgcggatcctccagtccca...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgttagcagaaacaggcaacacagcggatcgggggattcgggagaa...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,gtgtctttccaggctgttgtgatggtttgatggttcgaagaaagaa...
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,Oryza sativa,gctcacatggcagcaacctggacgtagggtcgtgagagactaaggc...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,Oryza sativa,tgtggacaccccgagtattataccatccagtgtccaacgcgaccac...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,Vertebrata,tggcgcccaacgtggggctcgaacccacgaccctgagattaagagt...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,Felis catus,tttttaaagattttatttttaagtaatctctacacccaacgtgggg...


In [13]:
# adding reverse complement identifier
copiado['id'] = copiado['id'] + '_revcomp'
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S_revcomp,1,119,119,Retro,SINE,5S,844,Homo sapiens,aaccctacagaacccggtgttcccaggctctctcccatcaggtacc...
1,5S-Sauria_revcomp,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,aaaggtaaaggttttcccctgacattaagtccagtcatgtccgact...
2,AACOPIA1_I_revcomp,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ctcctcctcagcggttacctgcttcactgcggatcctccagtccca...
3,AACOPIA1_LTR_revcomp,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgttagcagaaacaggcaacacagcggatcgggggattcgggagaa...
4,AARA8_AG_revcomp,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,gtgtctttccaggctgttgtgatggtttgatggttcgaagaaagaa...
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR_revcomp,1,1138,1138,Retro,LTR,Gypsy,10061,Oryza sativa,gctcacatggcagcaacctggacgtagggtcgtgagagactaaggc...
50619,rn_364-201_LTR_1_revcomp,1,3868,3868,Retro,LTR,Gypsy,36247,Oryza sativa,tgtggacaccccgagtattataccatccagtgtccaacgcgaccac...
50620,tRNA-Lys-AAG_revcomp,1,76,76,Retro,LTR,Gypsy,685,Vertebrata,tggcgcccaacgtggggctcgaacccacgaccctgagattaagagt...
50621,tSINE_Fc_revcomp,1,110,110,Retro,SINE,tRNA,988,Felis catus,tttttaaagattttatttttaagtaatctctacacccaacgtgggg...


In [14]:
copiado.to_csv('db_te_classified_2.1v_revcomp.csv', index=False)

Dataframes concatenation

In [24]:
concatenado = pd.concat([spp_merged,copiado],ignore_index=True)
concatenado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S,1,119,119,Retro,SINE,5S,844,Homo sapiens,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...
...,...,...,...,...,...,...,...,...,...,...
101241,rn_364-201_LTR_revcomp,1,1138,1138,Retro,LTR,Gypsy,10061,Oryza sativa,gctcacatggcagcaacctggacgtagggtcgtgagagactaaggc...
101242,rn_364-201_LTR_1_revcomp,1,3868,3868,Retro,LTR,Gypsy,36247,Oryza sativa,tgtggacaccccgagtattataccatccagtgtccaacgcgaccac...
101243,tRNA-Lys-AAG_revcomp,1,76,76,Retro,LTR,Gypsy,685,Vertebrata,tggcgcccaacgtggggctcgaacccacgaccctgagattaagagt...
101244,tSINE_Fc_revcomp,1,110,110,Retro,SINE,tRNA,988,Felis catus,tttttaaagattttatttttaagtaatctctacacccaacgtgggg...


## Non-TE dataset

In [76]:
non_te = pd.read_csv('tds_feats_nadd_02.01.csv')
non_te

Unnamed: 0,id,label,class,family,repmask_score,species,sequence,length
0,AC9_ZM,TE,DNA,hAT,44196.0,Zea mays,CTGCAGCATTATCCATCGCGTTGCTCACACGTATGTTCCAGTAGAA...,4816
1,ACCORD2_I,TE,LTR,Gypsy,63155.0,Drosophila melanogaster,GGCGCAGCCGGTCAACGGACAAGGGATTAACAATGCCCACTACGCT...,7218
2,ACCORD2_LTR,TE,LTR,Gypsy,1984.0,Drosophila melanogaster,AGTTAACATACCCCCTCTCTCGGTCTACCCAGTACTGCAACTGAGA...,221
3,ACCORD_I,TE,LTR,Gypsy,56172.0,Drosophila melanogaster,GGCGCAGCCGGACTAGAACTAAAAGGATTAACAACGCCTTTCACGT...,6294
4,ACCORD_LTR,TE,LTR,Gypsy,5166.0,Drosophila melanogaster,AGTTACCATGCCCAGCATTAACCCCCCTCAACAACCACCTCCGCCT...,564
...,...,...,...,...,...,...,...,...
15305,ENSDART00000192815.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...,303
15306,ENSDART00000179671.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTGCTTGGAGGCT...,303
15307,ENSDART00000187802.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAAGCT...,303
15308,ENSDART00000185045.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTGCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...,303


In [77]:
non_te = non_te.loc[non_te['id'].str.contains('ENSDART')]
non_te.reset_index(drop=True,inplace=True)
non_te

Unnamed: 0,id,label,class,family,repmask_score,species,sequence,length
0,ENSDART00000190199.1 seq1,NT,NT,NT,,Danio rerio,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...,170
1,ENSDART00000183292.1 seq1,NT,NT,NT,,Danio rerio,ATACTTACCTGGCAGGGGAGACACCATGATCAGGAAGGTGGTTCAC...,171
2,ENSDART00000184887.1 seq1,NT,NT,NT,,Danio rerio,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...,170
3,ENSDART00000192680.1 seq1,NT,NT,NT,,Danio rerio,GTTTTGAAAAAAAAAAAACATATACTTGGATCGATACAGAGAATAT...,108
4,ENSDART00000191293.1 seq1,NT,NT,NT,,Danio rerio,GTGCTTGCTACGGTGGCACATATACTAAAATTGGATCGATACAGAG...,113
...,...,...,...,...,...,...,...,...
7767,ENSDART00000192815.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...,303
7768,ENSDART00000179671.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTGCTTGGAGGCT...,303
7769,ENSDART00000187802.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAAGCT...,303
7770,ENSDART00000185045.1 seq1,NT,NT,NT,,Danio rerio,GCCGGGTGCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...,303


In [78]:
non_te['sequence'][0]

'ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCACCCAGGGCGAGGCTTGGCCATTGCACTCCGGCCACGCTGACCCCTGCGAATTCCCCAAATGTGGGAATCTCAACTGCATAATTTCTGGTAGTGGGGGACTGCGTTCGCGCTCTCCCCTGNNNNNN'

In [79]:
print('# TEs dataframe columns')
print(', '.join(concatenado.columns.values))
print('')
print('# Non-TEs dataframe columns')
print(', '.join(non_te.columns.values))

# TEs dataframe columns
id, start, end, length, class, order, family, score, species, sequence

# Non-TEs dataframe columns
id, label, class, family, repmask_score, species, sequence, length


In [80]:
non_te.columns = ['id', 'class', 'order', 'family', 'score', 'species', 'sequence', 'length']
non_te = non_te[['id', 'class', 'order', 'family', 'species', 'length', 'sequence']]
non_te.head()

Unnamed: 0,id,class,order,family,species,length,sequence
0,ENSDART00000190199.1 seq1,NT,NT,NT,Danio rerio,170,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...
1,ENSDART00000183292.1 seq1,NT,NT,NT,Danio rerio,171,ATACTTACCTGGCAGGGGAGACACCATGATCAGGAAGGTGGTTCAC...
2,ENSDART00000184887.1 seq1,NT,NT,NT,Danio rerio,170,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...
3,ENSDART00000192680.1 seq1,NT,NT,NT,Danio rerio,108,GTTTTGAAAAAAAAAAAACATATACTTGGATCGATACAGAGAATAT...
4,ENSDART00000191293.1 seq1,NT,NT,NT,Danio rerio,113,GTGCTTGCTACGGTGGCACATATACTAAAATTGGATCGATACAGAG...


In [83]:
non_te = non_te.loc[non_te['length'] >= 50]
non_te

Unnamed: 0,id,class,order,family,species,length,sequence
0,ENSDART00000190199.1 seq1,NT,NT,NT,Danio rerio,170,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...
1,ENSDART00000183292.1 seq1,NT,NT,NT,Danio rerio,171,ATACTTACCTGGCAGGGGAGACACCATGATCAGGAAGGTGGTTCAC...
2,ENSDART00000184887.1 seq1,NT,NT,NT,Danio rerio,170,ATACTTACCTGGCAGGGGAGATACCATGATCAAGAAGGTGGTTCAC...
3,ENSDART00000192680.1 seq1,NT,NT,NT,Danio rerio,108,GTTTTGAAAAAAAAAAAACATATACTTGGATCGATACAGAGAATAT...
4,ENSDART00000191293.1 seq1,NT,NT,NT,Danio rerio,113,GTGCTTGCTACGGTGGCACATATACTAAAATTGGATCGATACAGAG...
...,...,...,...,...,...,...,...
7767,ENSDART00000192815.1 seq1,NT,NT,NT,Danio rerio,303,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...
7768,ENSDART00000179671.1 seq1,NT,NT,NT,Danio rerio,303,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTGCTTGGAGGCT...
7769,ENSDART00000187802.1 seq1,NT,NT,NT,Danio rerio,303,GCCGGGTTCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAAGCT...
7770,ENSDART00000185045.1 seq1,NT,NT,NT,Danio rerio,303,GCCGGGTGCAGTGGCGCGCGCCTGTAATCCAAGCTACTGGGAGGCT...


In [86]:
non_te['sequence'] = non_te['sequence'].str.lower()
non_te.head()

Unnamed: 0,id,class,order,family,species,length,sequence
0,ENSDART00000190199.1 seq1,NT,NT,NT,Danio rerio,170,atacttacctggcaggggagataccatgatcaagaaggtggttcac...
1,ENSDART00000183292.1 seq1,NT,NT,NT,Danio rerio,171,atacttacctggcaggggagacaccatgatcaggaaggtggttcac...
2,ENSDART00000184887.1 seq1,NT,NT,NT,Danio rerio,170,atacttacctggcaggggagataccatgatcaagaaggtggttcac...
3,ENSDART00000192680.1 seq1,NT,NT,NT,Danio rerio,108,gttttgaaaaaaaaaaaacatatacttggatcgatacagagaatat...
4,ENSDART00000191293.1 seq1,NT,NT,NT,Danio rerio,113,gtgcttgctacggtggcacatatactaaaattggatcgatacagag...


In [87]:
non_te.to_csv('non_te_dataset.csv', index = False)

## Stratify dataset

In [3]:
import pandas as pd

In [29]:
tes = pd.read_csv('db_te_classified_2.0v.csv', usecols = ('id','class','sequence'))
non = pd.read_csv('non_te_dataset.csv', usecols = ('id','class','sequence'))

In [30]:
tes.shape

(50623, 3)

In [32]:
tes['sequence'] = tes['sequence'] + 'nnnnnn'

In [33]:
non.shape

(15540, 3)

In [34]:
tes['class'].value_counts()

Retro    35663
DNA      14960
Name: class, dtype: int64

In [35]:
retro = tes.loc[tes['class']=='Retro']
dna = tes.loc[tes['class']=='DNA']
print(retro.shape)
print(dna.shape)

(35663, 3)
(14960, 3)


In [36]:
non = non.sample(14000)
retro = retro.sample(14000)
dna = dna.sample(14000)

In [37]:
estratificado = pd.concat([non,retro,dna],ignore_index=True)

In [38]:
estratificado['class'].value_counts()

DNA      14000
NT       14000
Retro    14000
Name: class, dtype: int64

In [43]:
estratificado.to_csv('tds_class_stratified_20210308_v1.csv', index = False)

tes dataset

In [17]:
senso = pd.read_csv('db_te_classified_v2.1.csv')
antisenso = senso = pd.read_csv('db_te_classified_v2.1_revcomp.csv')

In [42]:
as_cryptdir = antisenso.loc[(antisenso['order']=='DIRS') | (antisenso['order']=='Crypton')]
as_cryptdir

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
2934,C6_TC_revcomp,1,1433,1433,Retro,DIRS,Viper,10485,Trypanosoma cruzi,gaattccgtggaccattagcatggtccttgtcctcagttaggattt...
3667,CR1-2_HM_revcomp,23,87,65,DNA,Crypton,Crypton,289,Hydra vulgaris,atcctacaaggcagcaggacatgaagcagattgtactgggttacat...
4232,CRYPTON-CN1_revcomp,1,4004,4004,DNA,Crypton,Crypton,37794,Cryptococcus sp.,gtcccgagagcaagtgtatcggttggactcttacataatgtttatt...
4410,CcNgaro3_revcomp,1,6131,6131,Retro,DIRS,Ngaro,56720,Coprinopsis cinerea,acaagataggctcgttctctcgagagccgtgacgaagcgtgaagcc...
7881,Copia-3_ADe-I_revcomp,1,10259,10259,Retro,DIRS,Ngaro,92504,Auricularia delicata,acttatgggcacttcgagaggacgcgtaggccgacgagaacggagt...
...,...,...,...,...,...,...,...,...,...,...
45238,TE-X-6B_DR_revcomp,1,574,574,DNA,Crypton,Crypton,5388,Danio rerio,tccatggtctgttgaaattcttgattctgattggctggaaggtgtg...
45239,TE-X-6_DR_revcomp,1,1554,1554,DNA,Crypton,Crypton,13489,Danio rerio,cttattccatggtctgttgaaattcttgattctgattggctggagg...
45898,TcVIPER_revcomp,1,4390,4390,Retro,DIRS,Viper,40743,Trypanosoma cruzi,agaataatcttccgggcagctggccggatcctgaaattatctatgg...
46707,UCON62_revcomp,2,124,123,DNA,Crypton,Crypton,897,Mammalia,tcagcaataatctagttctctcactgggataaactactctgccttt...


In [43]:
juntado = pd.concat([senso,as_cryptdir],ignore_index=True)
juntado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,5S_revcomp,1,119,119,Retro,SINE,5S,844,Homo sapiens,aaccctacagaacccggtgttcccaggctctctcccatcaggtacc...
1,5S-Sauria_revcomp,1,348,348,Retro,SINE,5S,3230,Anolis carolinensis,aaaggtaaaggttttcccctgacattaagtccagtcatgtccgact...
2,AACOPIA1_I_revcomp,1,4110,4110,Retro,LTR,Copia,38433,Aedes aegypti,ctcctcctcagcggttacctgcttcactgcggatcctccagtccca...
3,AACOPIA1_LTR_revcomp,1,270,270,Retro,LTR,Copia,2518,Aedes aegypti,tgttagcagaaacaggcaacacagcggatcgggggattcgggagaa...
4,AARA8_AG_revcomp,1,4188,4188,Retro,LINE,I,38902,Anopheles gambiae,gtgtctttccaggctgttgtgatggtttgatggttcgaagaaagaa...
...,...,...,...,...,...,...,...,...,...,...
51392,TE-X-6B_DR_revcomp,1,574,574,DNA,Crypton,Crypton,5388,Danio rerio,tccatggtctgttgaaattcttgattctgattggctggaaggtgtg...
51393,TE-X-6_DR_revcomp,1,1554,1554,DNA,Crypton,Crypton,13489,Danio rerio,cttattccatggtctgttgaaattcttgattctgattggctggagg...
51394,TcVIPER_revcomp,1,4390,4390,Retro,DIRS,Viper,40743,Trypanosoma cruzi,agaataatcttccgggcagctggccggatcctgaaattatctatgg...
51395,UCON62_revcomp,2,124,123,DNA,Crypton,Crypton,897,Mammalia,tcagcaataatctagttctctcactgggataaactactctgccttt...


In [45]:
juntado['order'].value_counts()

LTR         26827
TIR         13252
LINE         7008
dnaSC2       1380
DIRS          892
SINE          729
Crypton       656
Penelope      653
Name: order, dtype: int64

In [52]:
','.join(juntado['order'].unique()).lower()

'sine,ltr,line,tir,dnasc2,penelope,dirs,crypton'

In [44]:
sine = juntado.loc[juntado['order']=='SINE']
ltr = juntado.loc[juntado['order']=='LTR']
line = juntado.loc[juntado['order']=='LINE']
tir = juntado.loc[juntado['order']=='TIR']
dnasc2 = juntado.loc[juntado['order']=='dnaSC2']
penelope = juntado.loc[juntado['order']=='Penelope']
dirs = juntado.loc[juntado['order']=='DIRS']
crypton = juntado.loc[juntado['order']=='Crypton']

In [48]:
sine = sine.sample(653)
ltr = ltr.sample(653)
line = line.sample(653)
tir = tir.sample(653)
dnasc2 = dnasc2.sample(653)
penelope = penelope.sample(653)
dirs = dirs.sample(653)
crypton = crypton.sample(653)

In [54]:
estrat_juntado = pd.concat([sine,ltr,line,tir,dnasc2,penelope,dirs,crypton],ignore_index=True)

In [55]:
estrat_juntado

Unnamed: 0,id,start,end,length,class,order,family,score,species,sequence
0,MonoRep87A_revcomp,1,523,523,Retro,SINE,tRNA,4742,Ornithorhynchus,atgatgatgatggagtctcccgtcgggaacgacggacgagcctcgg...
1,STRID3_revcomp,1,131,131,Retro,SINE,ID,1257,Ictidomys tridecemlineatus,attttttttaatatttattttttagttctcggcggacacaacatct...
2,ALU_revcomp,1,283,283,Retro,SINE,Alu,2498,Homo sapiens,tgagacagggtctcgctctgtcgcccaggctggagtgcagtggcgc...
3,CoeSINE3_revcomp,1,296,296,Retro,SINE,tRNA,2752,Latimeria chalumnae,agttaagttaagtccttcccgagcctatttaggctcatagggccgg...
4,BivaMD-SINE1_TeGr_revcomp,1,345,345,Retro,SINE,tRNA,3247,Tegillarca granosa,tgagttgagttgagtaatggtgtttaacgtcccagtcaagaatatt...
...,...,...,...,...,...,...,...,...,...,...
5219,CryptonV-3_DR_revcomp,8262,23453,15192,DNA,Crypton,Crypton,212561,Danio rerio,cacttgcgcacctatctaagaggggaccatgtcgtggttacacacc...
5220,Crypton-10N1_CGi_revcomp,1,319,319,DNA,Crypton,Crypton,2934,Crassostrea gigas,taaggaataaggaatcattctttgagtattatgaggtgataatttc...
5221,CryptonV-N1B_SSa_revcomp,187,255,69,DNA,Crypton,Crypton,1527,Salmo salar,gctttgtcttactatccttgtggggactttttgggaccaacaattg...
5222,CryptonS-2_PI_revcomp,1,3197,3197,DNA,Crypton,Crypton,29866,Phytophthora infestans,ccataccaaaatttatcggaccctcggacccatcggacctgtacct...


In [58]:
estrat_juntado.to_csv('tds_te_orders_estrat_20210308_v1.csv', index=False)