Required libraries

In [7]:
import numpy as np
import pandas as pd
import subprocess
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Helper functions

In [33]:
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file.

    Runs ``makeblastdb -in <path_input> -dbtype nucl -parse_seqids``; the
    database is named after ``path_input`` itself.

    Parameters
    ----------
    path_input : str
        Path to the FASTA file to index.

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status.
    FileNotFoundError
        If the ``makeblastdb`` executable is not on the PATH.
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact, and check=True stops the success message below
    # from being printed after a failed run.
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,
    )
    print("\nBlast Dictionary created in", path_input)
    
def blastn_blaster(query, db, perc_indentity):
    """Run BLASTN and return the hits as a DataFrame.

    Parameters
    ----------
    query : str
        Path to the FASTA file with the query sequences.
    db : str
        Path/name of the BLAST database to search.
    perc_indentity : int or float
        Minimum percent identity, passed to ``-perc_identity``.

    Returns
    -------
    pandas.DataFrame
        One row per reported hit, with the columns requested in ``-outfmt``
        (``qseqid`` ... ``sseq``). All values are left as strings, exactly as
        parsed from the CSV output. When BLAST reports no hits the frame is
        empty but still carries the column names.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    columns = [
        "qseqid", "sseqid", "pident", "length", "qlen", "slen", "mismatch",
        "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore",
        "qcovhsp", "sstrand", "sseq",
    ]
    # Argument list instead of a shell string: no quoting problems with paths.
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query),
        "-db", str(db),
        "-perc_identity", str(perc_indentity),
        "-outfmt", "10 " + " ".join(columns),  # 10 = comma-separated values
    ]
    raw_output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw_output.split("\n") if line]

    # Passing the names to the constructor (rather than assigning
    # ``data.columns`` afterwards) also works when there are zero hits.
    return pd.DataFrame(rows, columns=columns)

# Write the rows of a sequence table to a FASTA file
def fasta_creator(data, fasta_output_path):
    """Write every row of ``data`` as one record of a FASTA file.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose column ``5`` holds the nucleotide sequence. Column ``0``
        is embedded in the record id (presumably the chromosome name, as in
        the CSV loaded by this notebook; verify for other inputs).
    fasta_output_path : str
        Destination path of the FASTA file.
    """
    records = [
        SeqRecord(
            Seq(row[5]),  # column 5 holds the sequence
            id=f"Seq_{row_label}_{row[0]}",
            description="Leishmania infantum",
        )
        for row_label, row in data.iterrows()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

Import data

In [16]:
# Load the table of positive elements. The CSV has no header row, so pandas
# labels the columns with the integers 0..5.
data = pd.read_csv("./dict/positives_testing_elements.csv", header=None)
print(data.shape)  # (rows, columns) as a quick sanity check
# Last expression of the cell: preview the first rows.
data.head()

(1951, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...


Group the sequences by chromosome (column 0)

In [17]:
# Column 0 holds the chromosome name (e.g. "LinJ.01"), so every group
# contains the rows of a single chromosome.
data_grouped = data.groupby(0)

In [65]:
def save_sequences_to_csv_pandas(data, filename):
    """Write a list of rows to ``filename`` as CSV, without header or index.

    ``data`` is handed to ``pd.DataFrame`` as-is, so rows of different
    lengths are padded with empty cells up to the longest row.
    """
    pd.DataFrame(data).to_csv(filename, index=False, header=False)

def find_maximal_sets(sequences):
    """Return the entries of ``sequences`` that are maximal as sets.

    An entry is dropped when its set of elements is a proper subset of
    another entry's set. When several entries have exactly the same set of
    elements (for example the same members in a different order, or with
    repeated members), only the first one is kept. Previously such entries
    eliminated each other and the whole group was lost.

    Parameters
    ----------
    sequences : list of list
        Candidate groups; the elements must be hashable.

    Returns
    -------
    list
        The surviving entries, unchanged and in their original order.
    """
    # Build each set once instead of on every inner-loop comparison.
    as_sets = [set(seq) for seq in sequences]

    maximal_sets = []
    for i, current in enumerate(as_sets):
        is_covered = False
        for j, other in enumerate(as_sets):
            if i == j:
                continue
            # Proper subset of another entry, or a duplicate of an earlier one.
            if current < other or (current == other and j < i):
                is_covered = True
                break
        if not is_covered:
            maximal_sets.append(sequences[i])
    return maximal_sets

def count_sequences(sequences):
    """Return the items that occur more than once across all sub-lists.

    Parameters
    ----------
    sequences : iterable of iterables
        Groups of hashable items.

    Returns
    -------
    dict
        ``{item: total_count}`` for every item seen at least twice, in order
        of first appearance.
    """
    from collections import Counter

    tally = Counter()
    for group in sequences:
        for item in group:
            tally[item] += 1

    repeated = {}
    for item, occurrences in tally.items():
        if occurrences > 1:
            repeated[item] = occurrences
    return repeated

In [82]:
# For every chromosome: write its sequences to FASTA, BLAST them against
# themselves, group the hits into families and save the families to CSV.
# Local names are chosen so that the notebook-level `data` frame and the
# builtin `dict` are no longer overwritten by this cell.
repeated_chr = []  # chromosomes where some sequence appears in more than one family
for chromosome, group in data_grouped:
    chromosome_dir = f"./dict/chromosomes/{chromosome}"
    os.makedirs(chromosome_dir, exist_ok=True)  # same effect as `mkdir -p`
    print(f"Chromosome {chromosome} folder created")

    fasta_path = f"{chromosome_dir}/positives_{chromosome}.fasta"
    fasta_creator(group, fasta_path)  # creating a FASTA file for each chromosome
    blastn_dic(fasta_path)  # creating a BLASTN database for each chromosome

    # All-vs-all BLASTN inside the chromosome (query and database are the same file)
    blast_hits = blastn_blaster(fasta_path, fasta_path, 85)
    blast_hits = blast_hits[blast_hits["length"].astype(int) > 100]  # filtering by length
    # blast_hits = blast_hits.query("qseqid != sseqid")  # filtering by qseqid != sseqid

    # Map each query to the subjects it hits. Ids look like "Seq_<n>_<chr>";
    # the list is ordered by <n>, with ties kept in plain string order.
    # NOTE(review): a subject hit through several HSPs is listed several times
    # and is then reported by count_sequences below; confirm this is intended.
    hits_by_query = {}
    for query_id in blast_hits["qseqid"].unique():
        subject_ids = blast_hits.loc[blast_hits["qseqid"] == query_id, "sseqid"].tolist()
        hits_by_query[query_id] = sorted(
            sorted(subject_ids),
            key=lambda seq_id: int(seq_id.split("_")[1]),
        )

    # Keep each distinct hit list once, in order of first appearance.
    dataset = []
    for family in hits_by_query.values():
        if family not in dataset:
            dataset.append(family)

    dataset = find_maximal_sets(dataset)  # Find the maximal sets

    counters = count_sequences(dataset)  # Sequences that appear more than once
    if counters:
        repeated_chr.append(chromosome)
        # Wrapped in a list: a bare string row would be split by pd.DataFrame
        # into one character per column when the CSV is written.
        dataset.append(["Repeated values:"])
        # Flattened as id, count, id, count, ...
        dataset.append([item for pair in counters.items() for item in pair])

    save_sequences_to_csv_pandas(dataset, f"{chromosome_dir}/families_{chromosome}.csv")  # saving the dataset

Chromosome LinJ.01 folder created


Building a new DB, current time: 05/03/2024 15:19:36
New DB name:   /home/rfpacheco/Documents/CBM_Work/Testing_Leishmania_project/9.subfamilies_global/dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
New DB title:  ./dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Documents/CBM_Work/Testing_Leishmania_project/9.subfamilies_global/dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 12 sequences in 0.00178313 seconds.



Blast Dictionary created in ./dict/chromosomes/LinJ.01/positives_LinJ.01.fasta
Chromosome LinJ.02 folder created


Building a new DB, current time: 05/03/2024 15:19:37
New DB name:   /home/rfpacheco/Documents/CBM_Work/Testing_Leishmania_project/9.subfamilies_global/dict/chromosomes/LinJ.02/positives_LinJ.02.fasta
New DB title:  ./dict/chromosomes/LinJ.02/positives_LinJ.

In [85]:
# Number of chromosomes flagged by the loop above (those where
# count_sequences found repeated values), followed by their names.
print(len(repeated_chr))
repeated_chr

13


['LinJ.18',
 'LinJ.19',
 'LinJ.21',
 'LinJ.22',
 'LinJ.23',
 'LinJ.24',
 'LinJ.29',
 'LinJ.30',
 'LinJ.31',
 'LinJ.33',
 'LinJ.34',
 'LinJ.35',
 'LinJ.36']

In [49]:
# Scratch data for the membership experiments in the following cells.
# Alternative test1 that shares values with test2:
# test1 = [807, 808, 809]
test1 = [10, 20, 30]
test2 = [807, 809, 810]

In [50]:
# Wrap both lists in pandas Series.
test1_series = pd.Series(test1)
test2_series = pd.Series(test2)

# Element-wise membership: True where a value of test1 is also in the test2 list.
test1_series.apply(lambda x: x in test2)
# The same call on the plain list fails, because lists have no .apply:
# test1.apply(lambda x: x in test2)

0    False
1    False
2    False
dtype: bool

In [51]:
# Membership result for the element with index label 1.
test1_series.apply(lambda x: x in test2)[1]

False

In [53]:
# value_counts() sorts by frequency (descending), so .iloc[0] is the count of
# the most common outcome, whichever of True/False that is.
test1_series.apply(lambda x: x in test2).value_counts().iloc[0]


3

In [58]:
# Print "True" if at least one element of test1_series is in test2, else "False".
try:
    test1_series.apply(lambda x: x in test2).value_counts().loc[True]
    print("True")
except KeyError:  # no element matched, so True is missing from the value_counts index
    print("False")


False


In [11]:
# Show both Series, one after the other.
print(test1_series, test2_series, sep="\n")

0    807
1    808
2    809
dtype: int64
0    807
1    809
2    810
dtype: int64


In [37]:
# Empty dict for the list-in-dict experiment in the following cells.
dict_test = {}

In [42]:
# Store the test1 list itself (not a copy) under the key "test1".
dict_test["test1"] = test1
dict_test

{'test1': [807, 808, 809]}

In [43]:
# Appending through the dict entry mutates the list object stored under the key.
dict_test["test1"].append(10)

In [44]:
# Display the dict after the append.
dict_test

{'test1': [807, 808, 809, 10]}

In [45]:
# Second empty dict, used to build a list value from scratch.
dict_test2 = {}

In [48]:
# Create the key with an empty list, then append to it in place.
dict_test2["test1"] = []
dict_test2["test1"].append(1231)
dict_test2

{'test1': [1231]}

In [59]:
# Look up the list stored under "test1".
dict_test2["test1"]

[1231]

In [7]:
# Minimal check of which branch runs for a False flag: prints "1" when the
# flag is falsy and "2" otherwise.
is_subset = False
print("2" if is_subset else "1")

1
