In [1]:
# Needed modules
import pandas as pd
import os
import subprocess

In [2]:
# Load the program results table, then report its dimensions and column dtypes
data = pd.read_csv("./data/Results_Program_LmSIDERs_30-05-54.csv", header=0, sep=",")
print(data.shape, data.dtypes, sep="\n")
data.head()

(4886, 14)
qseqid      float64
sseqid       object
pident      float64
length        int64
qstart      float64
qend        float64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen        float64
slen        float64
sstrand      object
sseq         object
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,,LinJ.01,,1000,,,1000,1,,,,,minus,GTTCTATCCATCGACCTGCACCTGCACACATGAGCTGCAAAAAGTT...
1,,LinJ.01,,784,,,24876,24093,,,,,minus,CTCCTGTCTGAGAGCGGTGTGGCGCATGGTGCCGCGCCCTTTGCGC...
2,,LinJ.01,,927,,,36297,35371,,,,,minus,GTGGGCCTCTCCGTGTCTCCGTGCCGTCTGCTTTCCCTCTTCTCAA...
3,,LinJ.01,,806,,,40595,39790,,,,,minus,CCCCGCCCCTTGGCTGGCATGGACGGAAATGGACGATGAAGACAGC...
4,,LinJ.01,,927,,,55909,54983,,,,,minus,CCTTGCACAGCGGATTTCCATAGGCTTCTCTCAGCTCAAGGAAATC...


In [3]:
# Preview "minus" strand hits: their coordinates come with start > end
data.loc[data["sstrand"].eq("minus")].head()

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,,LinJ.01,,1000,,,1000,1,,,,,minus,GTTCTATCCATCGACCTGCACCTGCACACATGAGCTGCAAAAAGTT...
1,,LinJ.01,,784,,,24876,24093,,,,,minus,CTCCTGTCTGAGAGCGGTGTGGCGCATGGTGCCGCGCCCTTTGCGC...
2,,LinJ.01,,927,,,36297,35371,,,,,minus,GTGGGCCTCTCCGTGTCTCCGTGCCGTCTGCTTTCCCTCTTCTCAA...
3,,LinJ.01,,806,,,40595,39790,,,,,minus,CCCCGCCCCTTGGCTGGCATGGACGGAAATGGACGATGAAGACAGC...
4,,LinJ.01,,927,,,55909,54983,,,,,minus,CCTTGCACAGCGGATTTCCATAGGCTTCTCTCAGCTCAAGGAAATC...


In [4]:
# Change coor so that start < end.
# Only "minus" rows that are still reversed (start > end) are swapped, so
# re-running this cell does not flip already-fixed coordinates back.
reversed_minus = (data["sstrand"] == "minus") & (data["sstart"] > data["send"])
data.loc[reversed_minus, ["sstart", "send"]] = data.loc[reversed_minus, ["send", "sstart"]].values

In [5]:
# Preview the "minus" strand hits again to confirm the coordinates were swapped
data.loc[data["sstrand"].eq("minus")].head()

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,,LinJ.01,,1000,,,1,1000,,,,,minus,GTTCTATCCATCGACCTGCACCTGCACACATGAGCTGCAAAAAGTT...
1,,LinJ.01,,784,,,24093,24876,,,,,minus,CTCCTGTCTGAGAGCGGTGTGGCGCATGGTGCCGCGCCCTTTGCGC...
2,,LinJ.01,,927,,,35371,36297,,,,,minus,GTGGGCCTCTCCGTGTCTCCGTGCCGTCTGCTTTCCCTCTTCTCAA...
3,,LinJ.01,,806,,,39790,40595,,,,,minus,CCCCGCCCCTTGGCTGGCATGGACGGAAATGGACGATGAAGACAGC...
4,,LinJ.01,,927,,,54983,55909,,,,,minus,CCTTGCACAGCGGATTTCCATAGGCTTCTCTCAGCTCAAGGAAATC...


In [6]:
# Check that every "sstart" is < "send"
## Count the rows where the comparison holds
print(f"""
From {len(data)} rows, there are {(data["sstart"] < data["send"]).sum()} rows where "sstart" < "send"
      """)


From 4886 rows, there are 4886 rows where "sstart" < "send"
      


In [7]:
# Keep an independent copy of just the three columns written to the .bed file
data_2 = data.loc[:, ["sseqid", "sstart", "send"]].copy()

In [8]:
# path to bedops files and .bed file creation
path_all_elements_bed = "./bedops_files/all_elements.bed"
# Create the output folder when it is missing; to_csv does not create directories.
os.makedirs(os.path.dirname(path_all_elements_bed), exist_ok=True)
# bedops expects its input sorted by sequence name, then start, then end
# (the order sort-bed produces). Sorting on write makes the later merge
# independent of the row order of the source table; data_2 itself is not modified.
(
    data_2
    .sort_values(["sseqid", "sstart", "send"])
    .to_csv(path_all_elements_bed, sep="\t", header=False, index=False)
)

In [9]:
# Run "bedops --merge" on the .bed file and load its tab-separated output
cmd = f"bedops --merge {path_all_elements_bed}"
merged_data = subprocess.check_output(cmd, shell=True, universal_newlines=True)
# One record per non-empty output line: sequence id, start, end
merged_df = pd.DataFrame(
    [line.split("\t") for line in merged_data.split("\n") if line],
    columns=["sseqid", "sstart", "send"],
)
# The coordinates were parsed as text; convert them back to numbers
merged_df = merged_df.assign(
    sstart=pd.to_numeric(merged_df["sstart"]),
    send=pd.to_numeric(merged_df["send"]),
)
print(f"""
From {len(data_2)} rows, there are {len(merged_df)} rows after merging
      """)


From 4886 rows, there are 2951 rows after merging
      


In [10]:
# Preview the first merged intervals
merged_df.head(n=5)

Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,24093,24876
2,LinJ.01,35371,36297
3,LinJ.01,39790,40595
4,LinJ.01,54983,55909


In [34]:
# Now let's get the sequences
def get_data_sequence(data, strand, genome_fasta):
    """Fetch the sequence of every interval in ``data`` by calling ``blastdbcmd``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns "sseqid", "sstart" and "send"; one
        ``blastdbcmd`` call is made per row.
    strand : str
        Value passed to ``blastdbcmd -strand`` and stored in the "sstrand"
        column of the result.
    genome_fasta : str
        Value passed to ``blastdbcmd -db``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns "sseqid", "sstart", "send",
        "sstrand" and "sseq" (the command output with newlines removed).
        The columns are present even when ``data`` has no rows.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastdbcmd`` exits with a non-zero status.
    """
    columns = ["sseqid", "sstart", "send", "sstrand", "sseq"]
    sequences = []
    for _, row in data.iterrows():
        sseqid = row["sseqid"]
        start = row["sstart"]
        end = row["send"]
        # Argument list instead of a shell string: values containing spaces or
        # shell metacharacters are passed to blastdbcmd verbatim.
        cmd = [
            "blastdbcmd",
            "-db", str(genome_fasta),
            "-entry", str(sseqid),
            "-range", f"{start}-{end}",
            "-strand", str(strand),
            "-outfmt", "%s",
        ]
        sequence = subprocess.check_output(cmd, universal_newlines=True).replace("\n", "")

        sequences.append({
            "sseqid": sseqid,
            "sstart": start,
            "send": end,
            "sstrand": strand,
            "sseq": sequence
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    sequences_df = pd.DataFrame(sequences, columns=columns)
    return sequences_df


In [12]:
# Prepare dict creation
def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database with ``makeblastdb``.

    Parameters
    ----------
    path_input : str
        Value passed to ``makeblastdb -in``.
    path_output : str
        Value passed to ``makeblastdb -out``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If ``makeblastdb`` exits with a non-zero status; the message carries
        the exit code and whatever the tool wrote to stderr.
    """
    # "parse_seqids" is used to keep the sequence ID in the output.
    # Argument list instead of a shell string so paths with spaces work.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",
        "-out", str(path_output),
    ]
    # stdout stays silenced; stderr is captured so a failure can be reported
    # here instead of surfacing later as a confusing blastdbcmd error.
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if result.returncode != 0:
        details = (result.stderr or "").strip()
        raise RuntimeError(
            f"makeblastdb failed (exit code {result.returncode}): {details}"
        )

In [13]:
# Build the BLAST database; the FASTA path doubles as the database name
blastn_dic(
    path_output="./data/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
    path_input="./data/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
)

In [35]:
# Retrieve the "plus" strand sequence of every merged interval
data_after_merge = get_data_sequence(
    genome_fasta="./data/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
    strand="plus",
    data=merged_df,
)
print(f"""
From {len(merged_df)} rows, there are {len(data_after_merge)} rows with sequences
      """)


From 2951 rows, there are 2951 rows with sequences
      


In [36]:
# Inspect the result: dimensions, column dtypes and the first rows
print(data_after_merge.shape, data_after_merge.dtypes, sep="\n")
data_after_merge.head()

(2951, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,24876,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [37]:
# Insert a "length" column (number of characters in "sseq") right after "sseqid"
length_column = data_after_merge["sseq"].str.len()
# DataFrame.insert raises ValueError if the column already exists, so a stale
# copy is dropped first; this keeps the cell re-runnable on a live kernel.
if "length" in data_after_merge.columns:
    data_after_merge = data_after_merge.drop(columns="length")
data_after_merge.insert(1, "length", length_column)
data_after_merge.head()

Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,784,24093,24876,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,927,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,806,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,927,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [39]:
# Save the merged intervals and their sequences as a comma-separated file, without the index
data_after_merge.to_csv("./data/1.Merged_data.csv", index=False, sep=",")