In [1]:
# Needed modules
import pandas as pd
import os
import subprocess
from Bio import SeqIO


In [2]:
# Load the filtered negative elements (consensus + LmSIDER2A|B data set)
neg_data = pd.read_csv(
    "./data/consensus+LmSIDER2A+B/negatives_testing_elements.csv",
    sep=",",
    header=0,
)
# Shape first, then the per-column dtypes, one per line
print(neg_data.shape, neg_data.dtypes, sep="\n")
neg_data.head()

(714, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,113760,114388,plus,CAGCGCCATGCACGACATGGCCGCTGACGTCCGTAGCCCTAACTCG...
1,LinJ.01,261866,262439,plus,CGGACTTGGCAAGTGGCCGCCATCGATGAAAACGCACCATGCCTTT...
2,LinJ.01,272496,275901,plus,GTCAGAACAACGAAGCCGCCCCCCATCACTGCCTCCCTCTACGTAC...
3,LinJ.02,95421,95743,plus,TCCGCGATCCGTGCAGTTGGCGCCGGCCCCTCCTTCACTGCCGATG...
4,LinJ.02,97368,97489,plus,TCGAGCTCAGCTCTGGGGCACGGTTTGGCGGGCGTGGAAGTGGGCC...


In [3]:
# Load the matches of the LmSIDER2A|B data against the LmSIDER2A|B hallmarks
hallmark_matches = pd.read_csv(
    "./data/consensus+LmSIDER2A+B/negatives_testing_elements_matches_with_LmSIDER2A+B.csv",
    sep=",",
    header=0,
)
# Shape first, then the per-column dtypes, one per line
print(hallmark_matches.shape, hallmark_matches.dtypes, sep="\n")
hallmark_matches.head()

(85, 12)
qseqid       object
sseqid       object
pident      float64
length        int64
qstart        int64
qend          int64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen          int64
slen          int64
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
0,LmSIDER2a,Seq_646_LinJ.35,90.476,42,3,42,266,225,1.08e-09,54.7,79,677
1,LmSIDER2a,Seq_681_LinJ.36,80.0,65,14,78,200,139,6.47e-07,45.4,79,610
2,LmSIDER2a,Seq_294_LinJ.25,78.571,70,1,68,1,70,6.47e-07,45.4,79,139
3,LmSIDER2a,Seq_400_LinJ.31,79.31,58,6,63,345,292,0.000108,38.1,79,669
4,LmSIDER2a,Seq_340_LinJ.28,78.689,61,14,74,15,72,0.000108,38.1,79,686


In [4]:
# Keep only the matches with evalue <= 10^-3, ordered by ascending evalue
hallmark_matches = (
    hallmark_matches
    .loc[lambda hits: hits["evalue"] <= 10**-3]
    .sort_values(by=["evalue"])
)
print(hallmark_matches.shape, hallmark_matches.dtypes, sep="\n")
hallmark_matches.head()

(27, 12)
qseqid       object
sseqid       object
pident      float64
length        int64
qstart        int64
qend          int64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen          int64
slen          int64
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
56,LmSIDER2b,Seq_189_LinJ.16,82.54,63,1,62,352,290,1.04e-09,54.7,77,770
55,LmSIDER2b,Seq_190_LinJ.16,82.54,63,1,62,351,289,1.04e-09,54.7,77,770
0,LmSIDER2a,Seq_646_LinJ.35,90.476,42,3,42,266,225,1.08e-09,54.7,79,677
57,LmSIDER2b,Seq_281_LinJ.24,82.54,63,1,61,90,149,1.35e-08,51.0,77,155
58,LmSIDER2b,Seq_280_LinJ.24,82.54,63,1,61,90,149,1.35e-08,51.0,77,155


In [5]:
# Add an integer "index" column holding the digits found between two
# underscores in "sseqid" (e.g. "Seq_189_LinJ.16" -> 189)
hallmark_matches["index"] = (
    hallmark_matches["sseqid"]
    .str.extract(r"_(\d+)_", expand=False)  # expand=False -> a Series, not a frame
    .astype(int)
)
hallmark_matches.head()

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,index
56,LmSIDER2b,Seq_189_LinJ.16,82.54,63,1,62,352,290,1.04e-09,54.7,77,770,189
55,LmSIDER2b,Seq_190_LinJ.16,82.54,63,1,62,351,289,1.04e-09,54.7,77,770,190
0,LmSIDER2a,Seq_646_LinJ.35,90.476,42,3,42,266,225,1.08e-09,54.7,79,677,646
57,LmSIDER2b,Seq_281_LinJ.24,82.54,63,1,61,90,149,1.35e-08,51.0,77,155,281
58,LmSIDER2b,Seq_280_LinJ.24,82.54,63,1,61,90,149,1.35e-08,51.0,77,155,280


In [6]:
# Build the sorted list of distinct values found in the "index" column
index_list = (
    hallmark_matches["index"]
    .drop_duplicates()
    .sort_values()
    .tolist()
)
print(len(index_list), index_list, sep="\n")

23
[11, 13, 26, 27, 28, 29, 102, 189, 190, 205, 206, 207, 211, 279, 280, 281, 294, 340, 378, 400, 646, 671, 681]


In [7]:
# Define function to extract fasta sequences
def fasta_extractor(pathfile, outfile, extract_list):
    """Copy selected records from one FASTA file into another.

    Records are numbered by their position in ``pathfile``, starting at 0,
    and a record is written to ``outfile`` when its position appears in
    ``extract_list``. Written records keep the order they have in the input.

    Parameters
    ----------
    pathfile : str or path-like
        Path of the FASTA file to read.
    outfile : str or path-like
        Path of the FASTA file to write; it is opened in ``"w"`` mode, so an
        existing file is overwritten.
    extract_list : iterable of int
        0-based positions of the records to keep. Positions beyond the last
        record are simply never matched.

    Returns
    -------
    None
    """
    # A set gives constant-time membership tests inside the loop
    wanted = set(extract_list)
    # Both files are managed by the same "with" so the input handle is
    # closed as well, even if parsing or writing raises
    with open(pathfile) as in_file, open(outfile, "w") as out_file:
        # "enumerate" numbers the records from 0
        for count, fasta in enumerate(SeqIO.parse(in_file, "fasta")):
            if count in wanted:
                SeqIO.write(fasta, out_file, "fasta")



In [None]:
# Write the FASTA records whose position is listed in index_list
fasta_extractor(
    pathfile="./data/consensus+LmSIDER2A+B/negatives_testing_elements.fasta",
    outfile="./data/consensus+LmSIDER2A+B/neg_matched_with_LmSIDER2A+B_and_evalue_10pow-3.fasta",
    extract_list=index_list,
)

In [11]:
# Select the rows of neg_data whose row label appears in index_list
recaught_data = neg_data.loc[neg_data.index.isin(index_list)]
print(recaught_data.shape, recaught_data.dtypes, sep="\n")
recaught_data.head()

(23, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
11,LinJ.02,259043,259677,plus,CCCCATTGCGTGGCGCGAAGCGGCGCTAGACGCGCACGAAAACACC...
13,LinJ.03,237552,237702,plus,GTGCGGGGGAGCCAGGCAGCCCACCCACCCACCCATCCCCTATCCC...
26,LinJ.04,331481,332011,plus,GTCGCTCTGACCTCCCCCTCTCACGCCCTAGGCACCCTGGCCCTGC...
27,LinJ.04,362873,363525,plus,CATCCGGATGACAGGTGGGGGAGGGGGTGCGCACACCTCAGCGCGT...
28,LinJ.04,391322,391906,plus,GTCGCTCTGACCTCCCCCTCTCACGCCCTAGGCACCCTGGCCCTGC...


In [12]:
# Save the selected rows as CSV; the row labels are not written (index=False)
recaught_data.to_csv(
    "./data/consensus+LmSIDER2A+B/neg_matched_with_LmSIDER2A+B_and_evalue_10pow-3.csv",
    sep=",",
    header=True,
    index=False,
)