In [43]:
import os
import subprocess

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [45]:
# Switch the working directory so the relative paths used by other cells
# resolve against the chr2 analysis folder.
# NOTE(review): this is a hard-coded, machine-specific absolute path; the
# notebook will not run elsewhere without editing it.
os.chdir("/home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Filter_Test_1/chr2_analysis")


In [36]:
# Input locations, written relative to the current working directory.
# `path` is the CSV table loaded with pd.read_csv in a later cell.
path = "../filtered_data.csv"
# NOTE(review): `genome_path` is not referenced by any of the cells shown here.
genome_path = "../../Data/genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

Next, we need to create a *FASTA* file from the filtered data.

In [37]:
# Load the filtered table. header=None means the file's first line is treated
# as data and the columns get integer labels (0, 1, 2, ...).
data = pd.read_csv(path, sep=",", header=None)
print(data.shape)
# Last expression of the cell: rendered as the cell output.
data.head()

(6012, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,666,24093,24758,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,LinJ.01,618,35316,35933,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
2,LinJ.01,884,39698,40581,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
3,LinJ.01,645,54885,55529,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...
4,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...


In [38]:
# Subset chromosome 2: keep the rows whose column 0 equals "LinJ.02".
# NOTE(review): the variable is still called `data_sub1` although it holds the
# "LinJ.02" rows; the name is kept because a later cell passes it to fasta_creator.
data_sub1 = data[data[0] == "LinJ.02"]
print(data_sub1.shape)
data_sub1.head()

(57, 6)


Unnamed: 0,0,1,2,3,4,5
21,LinJ.02,436,16511,16946,plus,CCGGGGGGGTGCGGAGGACACACATACGCCTCAGGGCGCGGTATCG...
22,LinJ.02,456,27338,27793,plus,GACACACATACGCCTCAGGGCGCGGTATCGCAGGGCCCGGTGCACT...
23,LinJ.02,478,73790,74267,plus,TCGGTACCCCAAAACGTGGTCGTGCTCCGCACGCGAGCCGCGCAAT...
24,LinJ.02,1000,89374,90373,plus,CAAGAGGCGGAGAAAGAGCTGGATGCCGATGTGCAGGGAATGGATA...
25,LinJ.02,1104,94261,95364,plus,AGCCTCAGGCCAGGGCCCGCTGAGACCCTGCACATCCTGTCTGTGT...


In [39]:
# Let's define  the fasta function
def fasta_creator(data_df, fasta_output_path):
    """Write every row of ``data_df`` to a FASTA file, one record per row.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table read positionally. Columns 0 and 4 are joined into the record
        id (so both must be strings), column 4 is also appended to the
        description, and column 5 supplies the sequence.
    fasta_output_path : str
        Destination handed to ``SeqIO.write``.

    Notes
    -----
    Record ids have the form ``Seq_<row index>_<column 0>_<column 4>``.
    A confirmation line is printed once the file has been written.
    """

    def _row_to_record(row_label, row_values):
        # One SeqRecord per table row; the row's index label becomes part of the id.
        col4_label = row_values.iloc[4]  # "plus" in the sample rows — presumably the strand
        return SeqRecord(
            Seq(row_values.iloc[5]),
            id="_".join(("Seq", str(row_label), row_values.iloc[0], col4_label)),
            description="Leishmania infantum " + col4_label,
        )

    records = [
        _row_to_record(row_label, row_values)
        for row_label, row_values in data_df.iterrows()
    ]

    SeqIO.write(records, fasta_output_path, "fasta")
    print("\nFasta created at:", fasta_output_path)

In [46]:
# Let's create the fasta file from the "LinJ.02" subset; it is written to
# the current working directory as LinJ02_SIDERs.fasta.
fasta_creator(data_sub1, "LinJ02_SIDERs.fasta")


Fasta created at: LinJ02_SIDERs.fasta


In [47]:
# Let's define the BLASTn dictionary function
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    Parameters
    ----------
    path_input : str or os.PathLike
        FASTA file to index; passed to ``makeblastdb -in``.

    Raises
    ------
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. check=True prevents the success message below
    # from being printed after a failed run.
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,
    )
    print("\nBlast Dictionary created in", path_input)

In [48]:
# Launch BLASTn dictionary function on the FASTA file written by fasta_creator,
# so that it can be used as the BLAST database in the search below.
blastn_dic("./LinJ02_SIDERs.fasta")



Building a new DB, current time: 04/02/2024 16:36:46
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Filter_Test_1/chr2_analysis/LinJ02_SIDERs.fasta
New DB title:  ./LinJ02_SIDERs.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 57 sequences in 0.00146389 seconds.



Blast Dictionary created in ./LinJ02_SIDERs.fasta


In [49]:
# Create the main BLASTn function:
def blastn_blaster(query_path, dict_path, outfile_path, perc_identity, word_size=11):
    """Run ``blastn`` for a query FASTA against a BLAST database, writing CSV output.

    Parameters
    ----------
    query_path : str or os.PathLike
        FASTA file passed to ``-query``.
    dict_path : str or os.PathLike
        BLAST database passed to ``-db``.
    outfile_path : str or os.PathLike
        File passed to ``-out``.
    perc_identity : int or float
        Value passed to ``-perc_identity``.
    word_size : int, optional
        Value passed to ``-word_size``. Defaults to 11, the value that was
        previously hard-coded.

    Raises
    ------
    FileNotFoundError
        If the ``blastn`` executable cannot be found.
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.

    Notes
    -----
    Output format 10 (comma-separated) is requested with these columns, in order:
    qseqid, sseqid, pident, length, qlen, slen, mismatch, gapopen, qstart,
    qend, sstart, send, evalue, bitscore, qcovhsp, sstrand, sseq.
    """
    print("BLASTn searcher initiated")
    outfmt = (
        "10 qseqid sseqid pident length qlen slen mismatch gapopen "
        "qstart qend sstart send evalue bitscore qcovhsp sstrand sseq"
    )
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; the -outfmt specification travels as a single
    # argument, so it needs no quoting.
    command = [
        "blastn",
        "-word_size", str(word_size),
        "-query", str(query_path),
        "-db", str(dict_path),
        "-out", str(outfile_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", outfmt,
    ]
    # check=True: do not report success when blastn actually failed.
    subprocess.run(command, check=True)
    print("\nBlaster successful", outfile_path, "created.")


In [50]:
# Search the LinJ01 query FASTA against the LinJ02 database with an 85 %
# identity cutoff; results are written to output.csv.
# NOTE(review): no visible cell creates "LinJ01_SIDERs.fasta" (only
# LinJ02_SIDERs.fasta is written above) — confirm the query file exists in the
# working directory and that LinJ01 is the intended query.
blastn_blaster("LinJ01_SIDERs.fasta", "./LinJ02_SIDERs.fasta", "output.csv", 85)

BLASTn searcher initiated

Blaster successful output.csv created.


In [29]:
# Filter alignment length > 100
# Column 3 is `length` in the -outfmt field list used by blastn_blaster.
MIN_ALIGNMENT_LENGTH = 100

# Keep the unfiltered table under its own name so the lineage stays explicit.
blast_raw = pd.read_csv("output.csv", sep=",", header=None)
print(f"Shape before filtering: {blast_raw.shape}")
print("Number of unique values in column 0 before filtering:", blast_raw[0].nunique())

# Now filter it
print("="*50)
blast_data = blast_raw[blast_raw[3] > MIN_ALIGNMENT_LENGTH]
print(f"Shape after filtering: {blast_data.shape}")
print("Number of unique values in column 0 after filtering:", blast_data[0].nunique())

# Save without header row or index column (explicit booleans, as documented
# for DataFrame.to_csv).
blast_data.to_csv("output2.csv", sep=",", header=False, index=False)


Shape before filtering: (516, 17)
Number of unique values in column 0 before filtering: 33
Shape after filtering: (335, 17)
Number of unique values in column 0 after filtering: 33


In [31]:
# Preview the first rows of the filtered BLAST hits.
blast_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Seq_0_LinJ.01_plus,Seq_0_LinJ.01_plus,100.0,666,666,666,0,0,1,666,1,666,0.0,1230.0,100,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,Seq_0_LinJ.01_plus,Seq_3_LinJ.01_plus,99.534,644,666,645,3,0,23,666,644,1,0.0,1173.0,97,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...
2,Seq_0_LinJ.01_plus,Seq_1_LinJ.01_plus,99.192,619,666,618,3,2,7,624,618,1,0.0,1114.0,93,minus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...
3,Seq_0_LinJ.01_plus,Seq_2669_LinJ.01_minus,100.0,560,666,593,0,0,107,666,593,34,0.0,1035.0,84,minus,CCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCT...
5,Seq_0_LinJ.01_plus,Seq_2670_LinJ.01_minus,99.462,558,666,787,2,1,7,563,228,785,0.0,1013.0,84,plus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...


In [33]:
# Keep the hits whose column 14 (`qcovhsp` in the -outfmt field list used by
# blastn_blaster) is at least 80, then save them without header or index.
blast_data_test = blast_data.loc[blast_data[14].ge(80)]
blast_data_test.to_csv("output_test.csv", index=None, header=None, sep=",")

In [19]:
# Collect each distinct value of column 0 (`qseqid`) once, in order of first
# appearance, and start one single-member group per value.
#
# The previous loop tested `row[0] not in groups`, but `groups` holds lists,
# so a string id was never found in it and every row was appended, duplicates
# included.
catcher = blast_data[0].drop_duplicates().tolist()
groups = [[query_id] for query_id in catcher]  # Created group
    

