In [2]:
# Needed libraries
import pandas as pd
import numpy as np
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [40]:
%%script false --no-raise-error
# If needed, change the working directory.
# The cell magic above has to be the very first line of the cell: it hands the
# cell body to the `false` command, so the chdir below is skipped. Delete the
# magic line to actually change directory.
os.chdir("/home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Filter_Test_1/chr1_analysis")

In [3]:
# Display the current working directory; the relative paths used in this
# notebook are resolved against it.
os.getcwd()

'/home/viskuit/Desktop/Projects/Leishmania/Testing_Leishmania_project/Filter_Test_1/chr1_analysis'

In [41]:
path = "../filtered_data.csv"  # path to the CSV file with the filtered data
genome_path = "../../Data/genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"  # path to the genome FASTA file

We need to create a *FASTA* file containing all the filtered data.

In [42]:
# Load the filtered data. The file has no header row, so pandas labels the
# columns with integers (comma is already the default separator).
data = pd.read_csv(path, header=None)
print(data.shape)
data.head()

(6012, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,666,24093,24758,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,LinJ.01,618,35316,35933,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
2,LinJ.01,884,39698,40581,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
3,LinJ.01,645,54885,55529,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...
4,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...


In [43]:
# Keep only the rows whose first column is chromosome 1 ("LinJ.01");
# the original row labels are preserved.
data_sub1 = data.loc[data[0].eq("LinJ.01")]
print(data_sub1.shape)
data_sub1.head()

(33, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,666,24093,24758,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,LinJ.01,618,35316,35933,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
2,LinJ.01,884,39698,40581,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
3,LinJ.01,645,54885,55529,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...
4,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...


In [44]:
# FASTA creator function
def fasta_creator(data_df, fasta_output_path):
    """Write one FASTA record per row of ``data_df`` to ``fasta_output_path``.

    For every row, positional column 5 is used as the sequence, and positional
    columns 0 and 4 are combined with the row label to build the record id
    (``Seq_<label>_<col0>_<col4>``). Column 4 is also appended to the record
    description.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table to export; columns 0, 4 and 5 must hold strings.
    fasta_output_path : str
        Destination of the FASTA file.
    """
    records = []
    for label, entry in data_df.iterrows():
        # Pull out the three fields once so the record construction stays readable.
        first_field = entry.iloc[0]
        fifth_field = entry.iloc[4]
        sequence = entry.iloc[5]

        record_id = "Seq_" + str(label) + "_" + first_field + "_" + fifth_field
        record_description = "Leishmania infantum " + fifth_field

        records.append(
            SeqRecord(Seq(sequence), id=record_id, description=record_description)
        )

    SeqIO.write(records, fasta_output_path, "fasta")
    print("\nFasta created at:", fasta_output_path)

In [45]:
# Write the chromosome 1 subset (data_sub1) to a FASTA file
fasta_creator(data_sub1, "./blastn_dict/LinJ01_SIDERs.fasta")


Fasta created at: LinJ01_SIDERs.fasta


In [46]:
# Let's define the BLASTn dictionary function
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely, and a failing
    ``makeblastdb`` run raises instead of being reported as a success.

    Parameters
    ----------
    path_input : str or os.PathLike
        FASTA file to index; the database files are created next to it.

    Raises
    ------
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status.
    """
    import subprocess  # local import so the function is self-contained

    command = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",
    ]
    # check=True: only print the confirmation when the tool really succeeded.
    subprocess.run(command, check=True)
    print("\nBlast Dictionary created in", path_input)

In [47]:
# Build the nucleotide BLAST database from the chromosome 1 FASTA file
blastn_dic("./blastn_dict/LinJ01_SIDERs.fasta")



Building a new DB, current time: 04/03/2024 11:09:08
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Filter_Test_1/chr1_analysis/LinJ01_SIDERs.fasta
New DB title:  ./LinJ01_SIDERs.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Filter_Test_1/chr1_analysis/LinJ01_SIDERs.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 33 sequences in 0.00799203 seconds.



Blast Dictionary created in ./LinJ01_SIDERs.fasta


In [48]:
# Create the main BLASTn function:
def blastn_blaster(query_path, dict_path, outfile_path, perc_identity):
    """Run ``blastn`` (word size 11) and write the hits as CSV.

    The output uses ``-outfmt 10`` with the columns, in order: qseqid, sseqid,
    pident, length, qlen, slen, mismatch, gapopen, qstart, qend, sstart, send,
    evalue, bitscore, qcovhsp, sstrand, sseq.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely, and a failing
    ``blastn`` run raises instead of being reported as a success.

    Parameters
    ----------
    query_path : str or os.PathLike
        FASTA file with the query sequences.
    dict_path : str or os.PathLike
        BLAST database to search against.
    outfile_path : str or os.PathLike
        Where the CSV result is written.
    perc_identity : int or float
        Value passed to ``-perc_identity``.

    Raises
    ------
    FileNotFoundError
        If the ``blastn`` executable cannot be found.
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    import subprocess  # local import so the function is self-contained

    output_format = (
        "10 qseqid sseqid pident length qlen slen mismatch gapopen "
        "qstart qend sstart send evalue bitscore qcovhsp sstrand sseq"
    )
    command = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-out", str(outfile_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", output_format,  # one argument, as the quoted string was in the shell form
    ]

    print("BLASTn searcher initiated")
    # check=True: only print the success message when blastn really succeeded.
    subprocess.run(command, check=True)
    print("\nBlaster successful", outfile_path, "created.")


In [34]:
# BLAST the chromosome 1 sequences against their own database (query and db are
# the same FASTA file), passing 85 as -perc_identity
blastn_blaster("./blastn_dict/LinJ01_SIDERs.fasta", "./blastn_dict/LinJ01_SIDERs.fasta", "./results/output.csv", 85)

BLASTn searcher initiated



Blaster successful output.csv created.


In [54]:
# Filter alignment length > 100
# The BLAST CSV has no header; columns follow the -outfmt order used by
# blastn_blaster, so column 0 is qseqid and column 3 is the alignment length.
blast_data = pd.read_csv("./results/output.csv", sep=",", header=None)  # read data
print(f"Shape before filtering: {blast_data.shape}")  # print shape
print("Number of unique values in column 0 before filtering:", blast_data[0].nunique())  # print unique values in column 0

# Now filter it
print("="*50)
blast_data = blast_data.loc[blast_data[3] > 100]  # keep alignment length > 100
print(f"Shape after filtering: {blast_data.shape}")  # print shape
print("Number of unique values in column 0 after filtering:", blast_data[0].nunique())  # print unique values in column 0

# Save data with alignment length > 100 (explicitly without header row or index column)
blast_data.to_csv("./results/output2.csv", sep=",", header=False, index=False)
blast_data.head()  # only the last expression of a cell is displayed

Shape before filtering: (516, 17)
Number of unique values in column 0 before filtering: 33
Shape after filtering: (335, 17)
Number of unique values in column 0 after filtering: 33


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Seq_0_LinJ.01_plus,Seq_0_LinJ.01_plus,100.0,666,666,666,0,0,1,666,1,666,0.0,1230.0,100,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,Seq_0_LinJ.01_plus,Seq_3_LinJ.01_plus,99.534,644,666,645,3,0,23,666,644,1,0.0,1173.0,97,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...
2,Seq_0_LinJ.01_plus,Seq_1_LinJ.01_plus,99.192,619,666,618,3,2,7,624,618,1,0.0,1114.0,93,minus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...
3,Seq_0_LinJ.01_plus,Seq_2669_LinJ.01_minus,100.0,560,666,593,0,0,107,666,593,34,0.0,1035.0,84,minus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
5,Seq_0_LinJ.01_plus,Seq_2670_LinJ.01_minus,99.462,558,666,787,2,1,7,563,228,785,0.0,1013.0,84,plus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...


In [56]:
# Keep only the hits whose qcovhsp (column 14 of the BLAST output) is at least 80,
# then store the surviving rows without header or index.
blast_data = blast_data.loc[blast_data[14].ge(80)]
blast_data.to_csv("./results/output3.csv", header=None, index=None)
print(blast_data.shape)
blast_data.head()

(167, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Seq_0_LinJ.01_plus,Seq_0_LinJ.01_plus,100.0,666,666,666,0,0,1,666,1,666,0.0,1230.0,100,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,Seq_0_LinJ.01_plus,Seq_3_LinJ.01_plus,99.534,644,666,645,3,0,23,666,644,1,0.0,1173.0,97,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...
2,Seq_0_LinJ.01_plus,Seq_1_LinJ.01_plus,99.192,619,666,618,3,2,7,624,618,1,0.0,1114.0,93,minus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...
3,Seq_0_LinJ.01_plus,Seq_2669_LinJ.01_minus,100.0,560,666,593,0,0,107,666,593,34,0.0,1035.0,84,minus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
5,Seq_0_LinJ.01_plus,Seq_2670_LinJ.01_minus,99.462,558,666,787,2,1,7,563,228,785,0.0,1013.0,84,plus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...


In [78]:
%%script false --no-raise-error
# Disabled cell: the magic on the first line hands the cell body to the `false`
# command, so none of the lines below run as Python. Remove that line to enable it.
# To save session variables
import dill

# Saving the session: serialise the current session into session.pkl
with open('session.pkl', 'wb') as f:
    dill.dump_session(f)

# And loading it back later: restore the session from session.pkl
with open('session.pkl', 'rb') as f:
    dill.load_session(f)