In [1]:
from pyfaidx import Fasta
import pandas as pd
# Input locations: the FASTA with the sequences and the CSV with per-element coordinates
FASTA = 'RepeatMasker_masked_sequences.fasta'
COORD = 'db_te_class_beta_1.csv'
# Open the FASTA through pyfaidx so records can be fetched by name and coordinates
tes = Fasta(FASTA)


In [2]:
# Load only the identifier and coordinate columns of the coordinates table
df = pd.read_csv(COORD, usecols=['id', 'start', 'end'])
df

Unnamed: 0,id,start,end
0,5S,1,119
1,5S-Sauria,1,348
2,5S_CPo,122,136
3,AACOPIA1_I,1,4110
4,AACOPIA1_LTR,1,270
...,...,...,...
56662,X9c_DNA_2,5,287
56663,hAT-N39_DR_1,1,1248
56664,hAT-N39_DR_1,1346,1594
56665,nhAT1_ML_1,1,235


In [3]:
# Show rows that repeat an earlier (id, start, end) combination
df[df.duplicated()]

Unnamed: 0,id,start,end


In [62]:
# Drop repeated coordinate rows and renumber the index from 0
df = df.drop_duplicates(ignore_index=True)

In [4]:
# One [id, start, end] list per coordinate row
valores = df.to_numpy().tolist()
valores[1]

['5S-Sauria', 1, 348]

In [5]:
# List every FASTA record whose name contains 'TIGGER2'
for item in (chave for chave in tes.keys() if 'TIGGER2' in chave):
    print(item)

TIGGER2
TIGGER2_1
TIGGER2_2
TIGGER2_3
TIGGER2_4


In [6]:
# Extract every coordinate interval from the FASTA, write it out as its own
# record, and keep the extracted sequence next to its coordinates in `valores`.
with open('db_te_class_beta_1.fasta', 'w') as fout:
    for fcoord in valores:
        # Only the first three fields are coordinates; an earlier run of this
        # cell may already have stored a sequence in the fourth position.
        nome, comeco, fim = fcoord[:3]
        fseq = str(tes.get_seq(nome, comeco, fim))
        fout.write(f'>{nome}:{comeco}-{fim}\n{fseq}\n')
        # Replace rather than append so re-running the cell is idempotent and
        # each record always ends up as [id, start, end, sequence].
        fcoord[3:] = [fseq]

In [7]:
# Spot-check the second record, which now carries its extracted sequence as a fourth field
print(valores[1])

['5S-Sauria', 1, 348, 'gcctacggccataccaccctgaacacgcccgatctcgtctgatctcggaagctaagcagggtcgggcctggttagtacttggatgggagaccgcctgggaataccgggtgctgtaggctttagccccagcttctgccaacctagcagttcgaaaacatgcaaatgtgagtagatcaataggtaccgctccggcgggaaggtaacggcgctccatgcagtcatgccggccacatgaccttggaggtgtctacggacaacgccggctcttcggcttagaaatggagatgagcaccaacccccagagtcggacatgactggacttaatgtcaggggaaaacctttaccttt']


In [8]:
# Turn the [id, start, end, sequence] records into a table
novo_df = pd.DataFrame(
    data=valores,
    columns=['id', 'start', 'end', 'sequence'],
)
novo_df.head(n=5)

Unnamed: 0,id,start,end,sequence
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,5S_CPo,122,136,aaaaaaaaaaaaaaa
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...


In [9]:
# Number of bases in each extracted sequence
novo_df['length'] = novo_df['sequence'].map(len)
novo_df.head(n=5)

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
2,5S_CPo,122,136,aaaaaaaaaaaaaaa,15
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270


In [10]:
# Characters to collapse into 'n' (lower-case only, as in the original list).
unwnts = 'bdhkmrsvwxy'
# One pass with an explicit character class instead of eleven full passes over
# the column; `regex=True` is spelled out because the default of
# `Series.str.replace` differs between pandas versions.
novo_df['sequence'] = novo_df['sequence'].str.replace(f'[{unwnts}]', 'n', regex=True)

In [11]:
# Rows whose sequence contains at least one 'n'
novo_df[novo_df['sequence'].str.contains('n')]

Unnamed: 0,id,start,end,sequence,length
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
11,ACASINE2,1,474,ggggctgcggtggcacaatgggttaaacccttgtgctgctgaactg...,474
22,ACROBAT1,864,1379,ntagtgacagaataaaattgattgtttttggtcaattttgacagct...,516
24,ACSINE1,1,214,tgcaaaggatcgttaccttgacgcggtgctggggcttgngcgcctc...,214
29,AFROSINE,1,160,tgctaaccaaaaggtcggcagttcgaaaccaccagctgctccangg...,160
...,...,...,...,...,...
56661,X9c_DNA_1,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


In [12]:
# Rows whose id contains 'TIGGER2'
novo_df[novo_df['id'].str.contains('TIGGER2')]

Unnamed: 0,id,start,end,sequence,length
45216,TIGGER2,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708
55800,TIGGER2_1,1,2718,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2718
55801,TIGGER2_2,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708
55802,TIGGER2_3,1,2718,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2718
55803,TIGGER2_4,1,2708,cagttgacccttgaacaacacgggtttgaactgcgcgggtccactt...,2708


In [13]:
# Count 'n' per sequence once, then report the row with the most and that count
contagem_n = novo_df['sequence'].str.count('n')
print(novo_df.loc[contagem_n.idxmax()])  # idxmax() gives the index label of the maximum
contagem = contagem_n.max()  # max() gives the maximum value itself
print(f'There is {contagem} occurences of "n" in this sequence')

id                                                 AviRTE_ZCh
start                                                     546
end                                                      3383
sequence    agcatcctgaatgaccaagcagcctttttcaaggagagcactgctt...
length                                                   2838
Name: 759, dtype: object
There is 677 occurences of "n" in this sequence


In [14]:
# Preview the sequences that the `length >= 50` filter will drop. The bound is
# strict so that sequences of exactly 50 bases, which are kept, are not listed.
novo_df.loc[novo_df['length'] < 50]

Unnamed: 0,id,start,end,sequence,length
2,5S_CPo,122,136,aaaaaaaaaaaaaaa,15
40,AGM1,1,42,tcgttgctaaccaaaaccatccaacgaaaaacacattcaaat,42
495,ATMU10,3176,3214,gctctgtcccaaccgtctctgtcccaacagtcccaaaga,39
816,BARE1_HV,7079,7103,attgactctagtgcaagtgggagac,25
843,BEL-1-LTR_NVi,945,969,atttgaactcattagcttttgttca,25
...,...,...,...,...,...
56319,hAT-N18_ZM_1,1,14,tagggctgggcaaa,14
56373,CR1-1_HM_1,27,37,tcatgatcatc,11
56374,CR1-1_HM_1,82,98,tgctgccttgcaggatt,17
56380,DNA-2-32_DR_1,1,25,cccttacagaaaagtcacatgaatt,25


In [15]:
# Keep only sequences with at least 50 bases
novo_df = novo_df[novo_df['length'].ge(50)]
novo_df

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
...,...,...,...,...,...
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56664,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


In [16]:
# Percentage of each sequence made up of 'n'
frequencia = novo_df['sequence'].str.count('n').div(novo_df['length']).mul(100)
# Row with the highest percentage, followed by that percentage
print(novo_df.loc[frequencia.idxmax()])
print(frequencia.max())

id                                                 AviRTE_ZCh
start                                                    3721
end                                                      3992
sequence    tggtctaccctggcctccaatcgggaggcctggagacacaccatct...
length                                                    272
Name: 760, dtype: object
28.676470588235293


In [17]:
# Reload the coordinates table with all of its columns
full_df = pd.read_csv(COORD)
# Matching is by id only, so every row sharing an id with a >= 15 % 'n'
# sequence is listed, not just the high-'n' intervals themselves.
full_df[full_df['id'].isin(novo_df.loc[frequencia >= 15, 'id'])]

Unnamed: 0,id,class,order,family,score,start,end
758,AviRTE_ZCh,Retro,LINE,RTE,1396,81,232
759,AviRTE_ZCh,Retro,LINE,RTE,16839,546,3383
760,AviRTE_ZCh,Retro,LINE,RTE,1358,3721,3992
29285,Helitron-N3_NV,DNA,dnaSC2,Helitron,6946,1,921
53000,Hoana8_1,DNA,TIR,hAT,17959,1,1928
53001,Hoana8_1,DNA,TIR,hAT,1607,2200,2371
53002,Hoana8_1,DNA,TIR,hAT,6547,2417,3123
53003,Hoana8_1,DNA,dnaSC2,Helitron,1229,2083,2333


In [20]:
# Sequences with at least 15 % 'n'
novo_df.loc[frequencia.ge(15)]

Unnamed: 0,id,start,end,sequence,length
759,AviRTE_ZCh,546,3383,agcatcctgaatgaccaagcagcctttttcaaggagagcactgctt...,2838
760,AviRTE_ZCh,3721,3992,tggtctaccctggcctccaatcgggaggcctggagacacaccatct...,272
29285,Helitron-N3_NV,1,921,ctcgggctacctgtggtagcacgagtctaggtttacctttcctata...,921
53003,Hoana8_1,2083,2333,gtgttttttaagttagagggttgggactttccacacatgttatatt...,251


In [22]:
# 'order' distribution of all rows sharing an id with a >= 15 % 'n' sequence.
# NOTE(review): the stored output is empty and the execution count (22) is later
# than the n-frequency filter cell's (21); presumably this was last run after
# `novo_df` had been filtered. Confirm by re-running in file order.
full_df.loc[full_df['id'].isin(novo_df.loc[frequencia >= 15, 'id']), 'order'].value_counts()

Series([], Name: order, dtype: int64)

In [23]:
# 'order' distribution over the whole coordinates table, for comparison
full_df['order'].value_counts()

LTR         29635
TIR         15180
LINE         7811
dnaSC2       1505
SINE         1018
Penelope      676
DIRS          480
Crypton       362
Name: order, dtype: int64

In [21]:
# Keep only sequences whose share of 'n' is strictly below 15 %. This is the
# exact complement of the `frequencia >= 15` rows inspected as problematic; the
# previous `<= 15` bound kept a sequence sitting exactly on the threshold.
novo_df = novo_df.loc[frequencia < 15]
novo_df

Unnamed: 0,id,start,end,sequence,length
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348
3,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110
4,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270
5,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188
...,...,...,...,...,...
56662,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283
56663,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248
56664,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249
56665,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235


In [25]:
# Attach the remaining columns of the coordinates table to each kept sequence,
# matching on the full (id, start, end) key
merged = novo_df.merge(full_df, how='inner', on=['id', 'start', 'end'])
merged

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119,Retro,SINE,5S,844
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348,Retro,SINE,5S,3230
2,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110,Retro,LTR,Copia,38433
3,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270,Retro,LTR,Copia,2518
4,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188,Retro,LINE,I,38902
...,...,...,...,...,...,...,...,...,...
56240,X9c_DNA_2,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283,DNA,TIR,hAT,2291
56241,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248,DNA,TIR,hAT,13580
56242,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249,DNA,TIR,hAT,13580
56243,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235,DNA,TIR,hAT,2167


In [28]:
# Rows whose id contains 'TIGGER1'
merged[merged['id'].str.contains('TIGGER1')]

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
44898,TIGGER1,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332
55386,TIGGER1_1,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332
55387,TIGGER1_2,1,2418,caggcatacctcgttttattgcgcttcgctttattgcgcttcgcag...,2418,DNA,TIR,TcMar,22332


In [234]:
# Sanity check: outer-join on the shared columns and keep rows present on one side only
teste = novo_df.merge(merged, how='outer', indicator=True)
teste_diff = teste[teste['_merge'] != 'both']
teste_diff

Unnamed: 0,id,start,end,sequence,length,class,order,family,species,score,_merge


In [29]:
# Save the merged table to CSV, without the index column
merged.to_csv('db_te_classified_1.0v.csv',index=False)

In [32]:
# Rows whose sequence already appeared in an earlier row
dup_seqs = merged[merged.duplicated(subset='sequence')]
dup_seqs

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
135,ATCOPIA23I,1,4502,tggtatcagagcagtgattctaaaagacctaatttttttttttttt...,4502,Retro,LTR,Copia,42203
136,ATCOPIA23LTR,1,120,tataaggagttgtatatattaaggatataattgtcattagtagttg...,120,Retro,LTR,Copia,1063
322,ATDNA12T3_2,2,358,tgtaacgccccgaccgcccctaccagtgggccccacgtcctctctc...,357,Retro,LTR,Cassandra,3186
323,ATDNA12T3_2,359,470,attcacccccactaacagatcgcaacgtcctcgttgcacaccaaga...,112,Retro,LTR,Cassandra,866
356,ATGP1I,1,5977,atttggtatcagagcgattacggttctaggatgtgtagaaaaatta...,5977,Retro,LTR,Gypsy,54683
...,...,...,...,...,...,...,...,...,...
56236,X1_DNA_2,1,606,tatgcaaaggatatattcctgaagancttgcataattcaaaactta...,606,DNA,TIR,TcMar,4423
56237,X21_DNA_1,14,159,atctccttgtctgttttctgcctgcatagtcatgcacagctttctg...,146,DNA,TIR,hAT,1028
56238,X21_DNA_2,14,159,atctccttgtctgttttctgcctgcatagtcatgcacagctttctg...,146,DNA,TIR,hAT,1028
56239,X9c_DNA_1,5,287,gaaaacagcttgtctgcttttattttcaaacttcaaagtttatttt...,283,DNA,TIR,hAT,2291


In [46]:
# (rows, columns) of the merged table before sequence de-duplication
merged.shape

(56245, 9)

In [47]:
# Keep the first row for every distinct sequence
merged_no_dup_seq = merged.drop_duplicates(subset='sequence')
merged_no_dup_seq

Unnamed: 0,id,start,end,sequence,length,class,order,family,score
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119,Retro,SINE,5S,844
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348,Retro,SINE,5S,3230
2,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110,Retro,LTR,Copia,38433
3,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270,Retro,LTR,Copia,2518
4,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188,Retro,LINE,I,38902
...,...,...,...,...,...,...,...,...,...
56193,UCON15_1,1,165,ttcatctaaggancaaacaccttaagctgttgttttcaagttttat...,165,DNA,TIR,hAT,1255
56241,hAT-N39_DR_1,1,1248,cagggccggattaaccaatgggccttatgggcacaggcccaggggc...,1248,DNA,TIR,hAT,13580
56242,hAT-N39_DR_1,1346,1594,ttttttgtagttttattagtatttttatttaatgctttgaaaaata...,249,DNA,TIR,hAT,13580
56243,nhAT1_ML_1,1,235,cagtgatggcgaacctatgacacgcgtgtcagaggtgacacgcgaa...,235,DNA,TIR,hAT,2167


In [48]:
# Save the sequence-de-duplicated table to CSV, without the index column
merged_no_dup_seq.to_csv('db_te_classified_1.1v_nds.csv',index=False)

In [52]:
# The species table is read without a header row; columns are named here
spps = pd.read_csv('tes_species.csv', header=None, names=['id', 'species'])
spps

Unnamed: 0,id,species
0,4.5SRNA,Rodentia
1,5S_CPo_1,Cavia porcellus
2,5S_CPo,Cavia porcellus
3,5S_DM,Drosophila melanogaster
4,5S,Homo sapiens
...,...,...
55887,ZOMBI,Homo sapiens
55888,Zorro,Candida albicans
55889,ZP3AR_MM,Mus musculus
55890,ZP3AR,Muridae


In [55]:
# Inner-join on id: only elements listed in the species table are kept, and the
# result is sorted by the join key.
# NOTE(review): assumes ids are unique in `spps`; a repeated id would duplicate rows. Confirm.
spp_merged = pd.merge(merged_no_dup_seq, spps, how='inner', on='id', sort=True)
spp_merged

Unnamed: 0,id,start,end,sequence,length,class,order,family,score,species
0,5S,1,119,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...,119,Retro,SINE,5S,844,Homo sapiens
1,5S-Sauria,1,348,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...,348,Retro,SINE,5S,3230,Anolis carolinensis
2,AACOPIA1_I,1,4110,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...,4110,Retro,LTR,Copia,38433,Aedes aegypti
3,AACOPIA1_LTR,1,270,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...,270,Retro,LTR,Copia,2518,Aedes aegypti
4,AARA8_AG,1,4188,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...,4188,Retro,LINE,I,38902,Anopheles gambiae
...,...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...,1138,Retro,LTR,Gypsy,10061,Oryza sativa
50619,rn_364-201_LTR_1,1,3868,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...,3868,Retro,LTR,Gypsy,36247,Oryza sativa
50620,tRNA-Lys-AAG,1,76,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...,76,Retro,LTR,Gypsy,685,Vertebrata
50621,tSINE_Fc,1,110,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...,110,Retro,SINE,tRNA,988,Felis catus


In [56]:
# Column labels of the table after the species join
spp_merged.columns

Index(['id', 'start', 'end', 'sequence', 'length', 'class', 'order', 'family',
       'score', 'species'],
      dtype='object')

In [58]:
# Reorder the columns so that the sequence comes last.
# NOTE(review): 'species' is not in this list, so the column added by the species
# join is dropped again here. Confirm that is intended.
spp_merged = spp_merged.loc[
    :,
    ['id', 'start', 'end', 'length', 'class', 'order', 'family', 'score', 'sequence'],
]
spp_merged

Unnamed: 0,id,start,end,length,class,order,family,score,sequence
0,5S,1,119,119,Retro,SINE,5S,844,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...
...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...


In [59]:
# Save the forward-strand table to CSV, without the index column
spp_merged.to_csv('db_te_classified_2.0v.csv',index=False)

In [65]:
# Print the value counts of each classification level, separated by a rule
for posicao, coluna in enumerate(('class', 'order', 'family')):
    if posicao > 0:
        print('====================')
    print(spp_merged[coluna].value_counts())

Retro    35663
DNA      14960
Name: class, dtype: int64
LTR         26827
TIR         13252
LINE         7008
dnaSC2       1380
SINE          729
Penelope      653
DIRS          446
Crypton       328
Name: order, dtype: int64
Gypsy    12348
Copia     7322
hAT       3820
TcMar     2985
ERV1      2611
         ...  
Genie        3
SVA          2
Ceph         2
R4           2
TRIM         2
Name: family, Length: 64, dtype: int64


In [66]:
# Complement table for a/c/g/t, 'n' and the IUPAC ambiguity codes, in both cases.
_COMPLEMENT_SRC = 'acgtnrykmbdhvsw'
_COMPLEMENT_DST = 'tgcanyrmkvhdbsw'
_COMPLEMENT_TABLE = str.maketrans(
    _COMPLEMENT_SRC + _COMPLEMENT_SRC.upper(),
    _COMPLEMENT_DST + _COMPLEMENT_DST.upper(),
)


def reverse_complement(dna):
    """Return the reverse complement of a nucleotide string.

    Parameters
    ----------
    dna : str
        Sequence made of a/c/g/t, 'n' or IUPAC ambiguity codes
        (r, y, k, m, b, d, h, v, s, w), in lower or upper case.

    Returns
    -------
    str
        The complemented sequence read in the opposite direction; the case of
        every base is preserved.

    Raises
    ------
    KeyError
        If `dna` contains a character outside the supported alphabet.
    """
    # Reject unknown symbols explicitly: str.translate would otherwise pass
    # them through unchanged and silently produce a wrong sequence.
    unsupported = set(dna).difference(_COMPLEMENT_SRC, _COMPLEMENT_SRC.upper())
    if unsupported:
        raise KeyError(f'unsupported base(s): {sorted(unsupported)}')
    return dna.translate(_COMPLEMENT_TABLE)[::-1]

In [68]:
# Independent copy of the forward-strand table, to be turned into its reverse-complement twin
copiado = spp_merged.copy(deep=True)
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,sequence
0,5S,1,119,119,Retro,SINE,5S,844,gtctactgccataccaccctgaacacgcccgatctcatctgatctt...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,gcctacggccataccaccctgaacacgcccgatctcgtctgatctc...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,ggtgatgggcccagcgcaaggcccccgcgggatttgtgaaagtgaa...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,tgtggagaatgcatcggtgtaccccttcactactgcacgaataccc...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,ttcactgtaaactgtcgcctacctgcactaactgtggtacccctgc...
...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,tgtaacagccctaggttcataaccaggtatttagaagtgtatttgt...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,tgtaacagccctaggttcataaccaggtatttagaagtgcttttgt...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,gcccggctagctcagtcggtagagcatgagactcttaatctcaggg...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,ggggcgcctggctggctcagtcggtagagcatgcgactcttgatct...


In [69]:
# Derive the reverse complement from the untouched forward-strand frame.
# Transforming `copiado['sequence']` in place would flip the sequences back to
# the forward strand if this cell were run a second time.
copiado['sequence'] = spp_merged['sequence'].apply(reverse_complement)
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,sequence
0,5S,1,119,119,Retro,SINE,5S,844,aaccctacagaacccggtgttcccaggctctctcccatcaggtacc...
1,5S-Sauria,1,348,348,Retro,SINE,5S,3230,aaaggtaaaggttttcccctgacattaagtccagtcatgtccgact...
2,AACOPIA1_I,1,4110,4110,Retro,LTR,Copia,38433,ctcctcctcagcggttacctgcttcactgcggatcctccagtccca...
3,AACOPIA1_LTR,1,270,270,Retro,LTR,Copia,2518,tgttagcagaaacaggcaacacagcggatcgggggattcgggagaa...
4,AARA8_AG,1,4188,4188,Retro,LINE,I,38902,gtgtctttccaggctgttgtgatggtttgatggttcgaagaaagaa...
...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR,1,1138,1138,Retro,LTR,Gypsy,10061,gctcacatggcagcaacctggacgtagggtcgtgagagactaaggc...
50619,rn_364-201_LTR_1,1,3868,3868,Retro,LTR,Gypsy,36247,tgtggacaccccgagtattataccatccagtgtccaacgcgaccac...
50620,tRNA-Lys-AAG,1,76,76,Retro,LTR,Gypsy,685,tggcgcccaacgtggggctcgaacccacgaccctgagattaagagt...
50621,tSINE_Fc,1,110,110,Retro,SINE,tRNA,988,tttttaaagattttatttttaagtaatctctacacccaacgtgggg...


In [74]:
# Build the suffixed ids from the forward-strand frame rather than from
# `copiado` itself, so re-running the cell cannot stack '_revcomp' twice.
copiado['id'] = spp_merged['id'] + '_revcomp'
copiado

Unnamed: 0,id,start,end,length,class,order,family,score,sequence
0,5S_revcomp,1,119,119,Retro,SINE,5S,844,aaccctacagaacccggtgttcccaggctctctcccatcaggtacc...
1,5S-Sauria_revcomp,1,348,348,Retro,SINE,5S,3230,aaaggtaaaggttttcccctgacattaagtccagtcatgtccgact...
2,AACOPIA1_I_revcomp,1,4110,4110,Retro,LTR,Copia,38433,ctcctcctcagcggttacctgcttcactgcggatcctccagtccca...
3,AACOPIA1_LTR_revcomp,1,270,270,Retro,LTR,Copia,2518,tgttagcagaaacaggcaacacagcggatcgggggattcgggagaa...
4,AARA8_AG_revcomp,1,4188,4188,Retro,LINE,I,38902,gtgtctttccaggctgttgtgatggtttgatggttcgaagaaagaa...
...,...,...,...,...,...,...,...,...,...
50618,rn_364-201_LTR_revcomp,1,1138,1138,Retro,LTR,Gypsy,10061,gctcacatggcagcaacctggacgtagggtcgtgagagactaaggc...
50619,rn_364-201_LTR_1_revcomp,1,3868,3868,Retro,LTR,Gypsy,36247,tgtggacaccccgagtattataccatccagtgtccaacgcgaccac...
50620,tRNA-Lys-AAG_revcomp,1,76,76,Retro,LTR,Gypsy,685,tggcgcccaacgtggggctcgaacccacgaccctgagattaagagt...
50621,tSINE_Fc_revcomp,1,110,110,Retro,SINE,tRNA,988,tttttaaagattttatttttaagtaatctctacacccaacgtgggg...


In [75]:
# Save the reverse-complement table to CSV, without the index column
copiado.to_csv('db_te_classified_2.0v_revcomp.csv', index=False)