In [5]:
# Needed modules
import pandas as pd
import os
import subprocess

In [6]:
# Read the consensus data CSV (comma-separated, column names on the first row),
# then report its dimensions and column dtypes and preview the first rows.
consensus_data = pd.read_csv(
    "./data_software_output/Last_Data.csv",
    sep=",",
    header=0,
)
print(consensus_data.shape, consensus_data.dtypes, sep="\n")
consensus_data.head()

(4874, 14)
qseqid      float64
sseqid       object
pident      float64
length        int64
qstart      float64
qend        float64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen        float64
slen        float64
sstrand      object
sseq         object
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,,LinJ.01,,1000,,,1000,1,,,,,minus,GTTCTATCCATCGACCTGCACCTGCACACATGAGCTGCAAAAAGTT...
1,,LinJ.01,,882,,,25080,24199,,,,,minus,CCGGATGCCGGGTTTTTGCTGTGCGTCGATGAGGCGGTGCGGGTGG...
2,,LinJ.01,,890,,,36263,35374,,,,,minus,CCCTCTTCTCAACTCTCGCTGTCGATCATCATCAGGGTCGCCGTCA...
3,,LinJ.01,,777,,,40595,39819,,,,,minus,CCCCGCCCCTTGGCTGGCATGGACGGAAATGGACGATGAAGACAGC...
4,,LinJ.01,,890,,,55875,54986,,,,,minus,CTCAAGGAAATCGGTTGATGAGCCGCATGGCACGGCTCCTATGTCC...


In [7]:
# Count the rows whose start coordinate is greater than their end coordinate
int(consensus_data["sstart"].gt(consensus_data["send"]).sum())

2940

In [8]:
# Make it so that sstart <= send on every row.
# Row-wise min/max is driven by the coordinates themselves rather than by the
# "sstrand" label, so re-running this cell is harmless (an unconditional swap of the
# "minus" rows would flip them back on a second run). Both keyword values are computed
# from the frame as it is before the assignment, so neither sees the other's result.
consensus_data = consensus_data.assign(
    sstart=consensus_data[["sstart", "send"]].min(axis=1),
    send=consensus_data[["sstart", "send"]].max(axis=1),
)

In [9]:
# Check that it worked: count the rows that still have sstart > send
int(consensus_data["sstart"].gt(consensus_data["send"]).sum())

0

In [10]:
# Keep only the sequence id and its two coordinates (as an independent copy)
consensus_data = consensus_data.loc[:, ["sseqid", "sstart", "send"]].copy()

In [11]:
# Path for BEDOPS file creation
path_consensus = "./data_software_output/consensus_seqs.bed"
# BEDOPS set operations expect their input in sort-bed order (sequence name
# lexicographically, then start, then end). Sort explicitly before writing instead of
# relying on whatever row order the frame happens to have at this point.
consensus_data.sort_values(["sseqid", "sstart", "send"]).to_csv(
    path_consensus, sep="\t", header=False, index=False
)  # save data for BEDOPS file

In [13]:
# Bedops call: run `bedops --merge` on the BED file and parse its tab-separated output.
# The command is passed as an argument list (no shell), so the path is never
# re-interpreted by a shell even if it contains spaces or special characters.
cmd = ["bedops", "--merge", path_consensus]
bedops_output = subprocess.check_output(cmd, universal_newlines=True)

# One output line per merged interval; empty lines (e.g. the trailing one) are skipped.
consensus_data = pd.DataFrame(
    [line.split("\t") for line in bedops_output.split("\n") if line],
    columns=["sseqid", "sstart", "send"],
)
# The coordinates were parsed from text, so convert them back to numbers.
consensus_data[["sstart", "send"]] = consensus_data[["sstart", "send"]].apply(pd.to_numeric)
print(f"There are {consensus_data.shape[0]} consensus sequences after BEDOPS merge")

There are 2940 consensus sequences after BEDOPS merge


In [17]:
# Now let's get the sequences
def get_data_sequence(data, strand, genome_fasta):
    """Fetch the sequence of every interval in ``data`` by calling ``blastdbcmd``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``sseqid``, ``sstart`` and ``send``; one
        ``blastdbcmd`` call is made per row.
    strand : str
        Value passed to ``blastdbcmd -strand``; also stored in the ``sstrand``
        column of the result.
    genome_fasta : str
        Value passed to ``blastdbcmd -db``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``sseqid``, ``sstart``, ``send``,
        ``sstrand`` and ``sseq``. The columns are present even when ``data`` has
        no rows.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastdbcmd`` exits with a non-zero status.
    """
    columns = ["sseqid", "sstart", "send", "sstrand", "sseq"]
    sequences = []
    for sseqid, start, end in data[["sseqid", "sstart", "send"]].itertuples(index=False, name=None):
        # Argument list instead of a shell string: ids and paths are passed verbatim,
        # so spaces or shell metacharacters cannot break (or alter) the command.
        cmd = [
            "blastdbcmd",
            "-db", str(genome_fasta),
            "-entry", str(sseqid),
            "-range", f"{start}-{end}",
            "-strand", str(strand),
            "-outfmt", "%s",
        ]

        # The output may be wrapped over several lines; join it into one string.
        sequence = subprocess.check_output(cmd, universal_newlines=True).replace("\n", "")

        sequences.append({
            "sseqid": sseqid,
            "sstart": start,
            "send": end,
            "sstrand": strand,
            "sseq": sequence,
        })

    # Explicit columns keep the schema stable when there is nothing to fetch.
    return pd.DataFrame(sequences, columns=columns)

# Prepare dict creation
def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    Parameters
    ----------
    path_input : str
        FASTA file passed to ``makeblastdb -in``.
    path_output : str
        Database name/path passed to ``makeblastdb -out``.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If ``makeblastdb`` exits with a non-zero status; the message carries the
        tool's stderr so the failure is visible instead of being discarded.
    """
    # "parse_seqids" is used to keep the sequence ID in the output.
    # Argument list (no shell): paths containing spaces are passed through intact.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",
        "-out", str(path_output),
    ]
    # stdout stays silenced; stderr is captured only to report failures.
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"makeblastdb failed (exit code {result.returncode}): {(result.stderr or '').strip()}"
        )

In [20]:
# Make sure the output folder exists, then build the BLAST database inside it
os.makedirs("./data/L_infantum", exist_ok=True)
blastn_dic(
    path_input="../0.Data/genome/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
    path_output="./data/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
)

In [21]:
# Retrieve the plus-strand sequence of every interval in consensus_data
consensus_data_seqs = get_data_sequence(
    data=consensus_data,
    strand="plus",
    genome_fasta="./data/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta",
)
print(f"There are {consensus_data_seqs.shape[0]} consensus sequences after getting the sequences")

There are 2940 consensus sequences after getting the sequences


In [22]:
# Explore the result: dimensions, column dtypes and the first rows
print(consensus_data_seqs.shape, consensus_data_seqs.dtypes, sep="\n")
consensus_data_seqs.head()

(2940, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24199,25080,plus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
2,LinJ.01,35374,36263,plus,CCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...
3,LinJ.01,39819,40595,plus,CGCACGCACAGCCACAGCTCACCTGGCACTCTGTTGCACGGCGGCT...
4,LinJ.01,54986,55875,plus,CTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...


In [23]:
# Insert length column (sequence length in characters) right after "sseqid".
# DataFrame.insert raises if the column already exists, so on a re-run of this
# cell the existing column is refreshed in place instead of being inserted again.
length_column = consensus_data_seqs["sseq"].str.len()
if "length" in consensus_data_seqs.columns:
    consensus_data_seqs["length"] = length_column
else:
    consensus_data_seqs.insert(1, "length", length_column)
print(consensus_data_seqs.dtypes)
consensus_data_seqs.head()

sseqid     object
length      int64
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,length,sstart,send,sstrand,sseq
0,LinJ.01,1000,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,882,24199,25080,plus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
2,LinJ.01,890,35374,36263,plus,CCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...
3,LinJ.01,777,39819,40595,plus,CGCACGCACAGCCACAGCTCACCTGGCACTCTGTTGCACGGCGGCT...
4,LinJ.01,890,54986,55875,plus,CTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGCAGC...


In [24]:
# Save the sequences as a comma-separated file with a header row and no index column
# (comma separator and header row are the to_csv defaults).
consensus_data_seqs.to_csv(
    "./data_software_output/consensus_data_seqs.csv",
    index=False,
)