Needed libraries

In [32]:
import numpy as np
import pandas as pd
import subprocess
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Needed functions

In [33]:
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file.

    Runs ``makeblastdb -in <path_input> -dbtype nucl -parse_seqids``; the
    database files are created by ``makeblastdb`` next to ``path_input``.

    Parameters
    ----------
    path_input : str
        Path to the FASTA file to index.

    Raises
    ------
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status.
    """
    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handed to makeblastdb untouched.
    cmd = ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"]
    # check=True: do not report success below if the database was not built.
    subprocess.run(cmd, check=True)
    print("\nBlast Dictionary created in", path_input)
    
def blastn_blaster(query, db, perc_indentity):
    """Run ``blastn`` and return the hits as a DataFrame.

    Parameters
    ----------
    query : str
        Path to the FASTA file with the query sequences.
    db : str
        Path (name) of the BLAST database to search.
    perc_indentity : int or float
        Value passed to ``-perc_identity``. (The parameter name keeps its
        original spelling so existing keyword callers keep working.)

    Returns
    -------
    pandas.DataFrame
        One row per line of BLAST output, with the columns listed in
        ``columns`` below. Every value is kept as a string, exactly as
        parsed from the comma-separated output. When BLAST reports no
        hits the frame is empty but still has all the columns.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    # Single source of truth for both the -outfmt specifier and the header.
    columns = ["qseqid", "sseqid", "pident", "length", "qlen", "slen",
               "mismatch", "gapopen", "qstart", "qend", "sstart", "send",
               "evalue", "bitscore", "qcovhsp", "sstrand", "sseq"]

    # Argument list instead of a shell string: no quoting problems with
    # paths, and the whole format specifier travels as one argument.
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query),
        "-db", str(db),
        "-perc_identity", str(perc_indentity),
        "-outfmt", "10 " + " ".join(columns),  # 10 = comma-separated values
    ]
    output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in output.split("\n") if line]

    # Passing columns= to the constructor (rather than assigning .columns
    # afterwards) keeps an empty result valid instead of raising ValueError.
    return pd.DataFrame(rows, columns=columns)

# Create a FASTA file from the rows of a DataFrame
def fasta_creator(data, fasta_output_path):
    """Write every row of ``data`` as one record of a FASTA file.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose column ``5`` holds the nucleotide sequence and whose
        column ``0`` is appended to the record id.
    fasta_output_path : str
        Destination path of the FASTA file.

    Notes
    -----
    Each record id is ``Seq_<row index label>_<value of column 0>`` and the
    description is always "Leishmania infantum".
    """
    records = [
        SeqRecord(
            Seq(row[5]),  # column 5 carries the sequence string
            id=f"Seq_{row_label}_{row[0]}",
            description="Leishmania infantum",
        )
        for row_label, row in data.iterrows()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

Import data

In [16]:
# Load the table of positive elements. header=None tells pandas the file has
# no header row, so the columns are labelled with the integers 0-5.
data = pd.read_csv("./dict/positives_testing_elements.csv", header=None)
print(data.shape)  # (number of rows, number of columns)
data.head()  # last expression of the cell: the notebook renders the first rows

(1951, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...


Group by chromosomes in column 0

In [17]:
# Group the rows by column 0 (values such as "LinJ.01" in the preview above),
# giving one group per chromosome.
data_grouped = data.groupby(0)

In [35]:
def save_sequences_to_csv_pandas(data, filename):
    """Write a list of rows to ``filename`` as CSV, without header or index.

    Parameters
    ----------
    data : list of lists
        Rows to write; anything accepted by ``pd.DataFrame`` works. Rows
        shorter than the longest one are padded by pandas with missing
        values, which end up as empty trailing fields in the file.
    filename : str
        Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(filename, index=False, header=False)

In [39]:
# Thresholds used for every chromosome.
PERC_IDENTITY = 85          # value passed to blastn -perc_identity
MIN_ALIGNMENT_LENGTH = 100  # hits must be strictly longer than this

for index, group in data_grouped:
    # One working folder per chromosome; exist_ok keeps the cell re-runnable.
    chromosome_dir = f"./dict/chromosomes/{index}"
    os.makedirs(chromosome_dir, exist_ok=True)
    print(f"Chromosome {index} folder created")

    fasta_path = f"{chromosome_dir}/positives_{index}.fasta"
    fasta_creator(group, fasta_path)  # FASTA file with this chromosome's sequences
    blastn_dic(fasta_path)            # BLASTN database built from that FASTA file

    # BLAST the chromosome's sequences against themselves. The result gets its
    # own name so the global `data` loaded in the "Import data" cell is not
    # overwritten (re-running the groupby cell would otherwise fail).
    blast_hits = blastn_blaster(fasta_path, fasta_path, PERC_IDENTITY)
    blast_hits = blast_hits[blast_hits["length"].astype(int) > MIN_ALIGNMENT_LENGTH]

    # For every query, collect the subjects it hits. Ids look like
    # "Seq_<n>_<chromosome>", so members are ordered by the number <n>.
    # set() drops repeated subjects (several HSPs for the same pair), which
    # would otherwise make identical families look different below.
    families_by_query = {}
    for query_id in blast_hits["qseqid"].unique():
        subject_ids = blast_hits.loc[blast_hits["qseqid"] == query_id, "sseqid"]
        families_by_query[query_id] = sorted(
            set(subject_ids), key=lambda seq_id: int(seq_id.split("_")[1])
        )

    # Keep each distinct family once, in order of first appearance.
    dataset = []
    for family in families_by_query.values():
        if family not in dataset:
            dataset.append(family)

    save_sequences_to_csv_pandas(dataset, f"{chromosome_dir}/families_{index}.csv")



Chromosome LinJ.01 folder created


Building a new DB, current time: 05/03/2024 14:35:53
New DB name:   /home/rfpacheco/Documents/CBM_Work/Testing_Leishmania_project/9.subfamilies_global/dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
New DB title:  ./dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 12 sequences in 0.000677109 seconds.



Blast Dictionary created in ./dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
Chromosome LinJ.02 folder created


Building a new DB, current time: 05/03/2024 14:35:54
New DB name:   /home/rfpacheco/Documents/CBM_Work/Testing_Leishmania_project/9.subfamilies_global/dict/chromosomes/LinJ.02/positives_LinJ.02.fasta
New DB title:  ./dict/chromosomes/LinJ.02/positives_LinJ.02.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 17 sequences in 0.00135207 seconds.



Blast Dictionary created i

In [36]:
# Hand-written sample of family lists with different lengths (ragged rows);
# it is passed to save_sequences_to_csv_pandas in a later cell.
test = [['Seq_1'],
 ['Seq_2', 'Seq_3', 'Seq_5', 'Seq_6', 'Seq_8', 'Seq_9', 'Seq_11'],
 ['Seq_4', 'Seq_7'],
 ['Seq_10'],
 ['Seq_12']]

In [37]:
test  # bare expression: the notebook echoes the sample list

[['Seq_1'],
 ['Seq_2', 'Seq_3', 'Seq_5', 'Seq_6', 'Seq_8', 'Seq_9', 'Seq_11'],
 ['Seq_4', 'Seq_7'],
 ['Seq_10'],
 ['Seq_12']]

In [38]:
# Try the CSV writer on the ragged sample; the output goes to "test.csv" in
# the current working directory.
save_sequences_to_csv_pandas(test, "test.csv")