In [5]:
import numpy as np
import pandas as pd
import subprocess
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [6]:
# header=None tells pandas not to use the first row as a header, so the columns
# are labelled with integers; fasta_creator later reads the sequence from column 5.
data_positive = pd.read_csv("./positives_testing_elements.csv", sep="," , header=None)
data_positive.head()

Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...


In [8]:
# Define the fasta_creator function
def fasta_creator(data, fasta_output_path):
    """Write every row of ``data`` as one record of a FASTA file.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose column labelled ``5`` holds the sequence string of each row.
    fasta_output_path : str
        Destination handed to ``SeqIO.write``.

    Notes
    -----
    Each record is named ``Seq_<row index>``, using the DataFrame index exactly
    as it is (no +1 shift), and gets the description "Leishmania infantum".
    """
    records = [
        SeqRecord(
            Seq(row[5]),  # column 5 holds the sequence
            id=f"Seq_{row_label}",
            description="Leishmania infantum",
        )
        for row_label, row in data.iterrows()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

In [7]:
 # Create fasta: one record per row of data_positive, written to ./blaster_pos/positives.fasta
 # NOTE(review): presumably the ./blaster_pos/ directory must already exist - confirm
fasta_creator(data_positive, "./blaster_pos/positives.fasta")

Now let's launch BLASTN.

First, let's prepare the "ingi" sequence as a FASTA file.

In [21]:
# Query sequence used as the "ingi" probe; a later cell writes it to ./blaster_pos/ingi.fasta
# with the description "Trypanosoma brucei ingi element".
ingi = "CCCTGGCGATGCCGGCCACCTCAACGTGGTGCCAGGGTCCAGTACCCCGTATCATCGGGGGAAGCCAAGAGCCAGCAGC"
len(ingi)

79

In [6]:
# Prepare functions
# Let's define the BLASTn dictionary function
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    Parameters
    ----------
    path_input : str or os.PathLike
        FASTA file passed to ``makeblastdb -in``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status. The success message is
        then not printed (previously it was printed regardless of the outcome).
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and avoids command injection through the path.
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,
    )
    print("\nBlast Dictionary created in", path_input)

#And the blaster
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run ``blastn`` for a query against a BLAST database and return the hits.

    Parameters
    ----------
    query_path : str or os.PathLike
        FASTA file with the query sequence(s) (``-query``).
    dict_path : str or os.PathLike
        BLAST database name as given to ``makeblastdb`` (``-db``).
    perc_identity : int or float
        Value forwarded to ``-perc_identity``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the comma-separated ``-outfmt 10`` output, with
        integer column labels and every field kept as a string. When blastn
        reports no hits, an empty frame with columns 0-11 is returned so that
        callers can still index columns such as ``data[1]``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    # No -evalue is passed, so blastn's default E-value threshold applies.
    # The command is an argument list (no shell) so paths are passed verbatim.
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10",
    ]
    raw_output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw_output.splitlines() if line]
    if not rows:
        # The default -outfmt 10 layout has 12 fields per hit.
        return pd.DataFrame(columns=range(12))
    return pd.DataFrame(rows)

In [62]:
# Let's make the dict
# NOTE(review): the output recorded below names ./blaster/positives.fasta rather than
# ./blaster_pos/positives.fasta, so it appears to come from an earlier run - re-run to refresh it.
blastn_dic("./blaster_pos/positives.fasta")



Building a new DB, current time: 04/17/2024 13:57:12
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/BEDOPS_join_strands/filtered_data/blaster/positives.fasta
New DB title:  ./blaster/positives.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/BEDOPS_join_strands/filtered_data/blaster/positives.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1951 sequences in 0.137699 seconds.



Blast Dictionary created in ./blaster/positives.fasta


In [7]:
# Let's make a fasta for a single sequence (first used for ingi):
def fasta_creator2(sequence, index, fasta_output_path, description_name, id_prefix="ingi"):
    """Write one sequence to a FASTA file as a single record.

    Parameters
    ----------
    sequence : str
        Sequence to wrap in a ``Seq``.
    index : int or str
        Suffix of the record id.
    fasta_output_path : str
        Destination handed to ``SeqIO.write``.
    description_name : str
        Free-text description stored in the record header.
    id_prefix : str, optional
        Prefix of the record id; the id is ``<id_prefix>_<index>``. Defaults to
        "ingi", which keeps the ids produced by existing calls unchanged, while
        letting other queries (e.g. the LmSIDER2 hallmarks) get their own name.
    """
    rec = SeqRecord(Seq(sequence),
                    id=f"{id_prefix}_{index}",
                    description=description_name
                    )
    SeqIO.write(rec, fasta_output_path, "fasta")

In [64]:
# Write the ingi query as a single-record FASTA file (record id "ingi_1")
fasta_creator2(ingi, 1, "./blaster_pos/ingi.fasta", "Trypanosoma brucei ingi element")

In [9]:
# Launch ingi element vs my positives
# perc_identity is 0 here, which presumably leaves hits unfiltered by percent identity - confirm in the BLAST manual
blastn_data = blastn_blaster("./blaster_pos/ingi.fasta", "./blaster_pos/positives.fasta", 0)

In [5]:
# Columns are the fields of blastn's -outfmt 10 output, presumably qseqid, sseqid, pident, length,
# mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore (confirm against the BLAST manual).
# Column 1 holds the subject id ("Seq_<n>") that the counts below are based on.
blastn_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ingi_1,Seq_1028,83.333,60,6,4,17,73,529,471,1.04e-08,52.8
1,ingi_1,Seq_1948,86.957,46,3,3,1,44,17,61,1.35e-07,49.1
2,ingi_1,Seq_1943,86.957,46,3,3,1,44,17,61,1.35e-07,49.1
3,ingi_1,Seq_701,87.805,41,5,0,2,42,792,832,1.35e-07,49.1
4,ingi_1,Seq_688,87.805,41,5,0,2,42,706,666,1.35e-07,49.1


In [10]:
# Share of positive elements with at least one ingi hit (column 1 holds the subject ids).
# The labels say "positive": data_positive and the positives.fasta database are what was searched.
print(f"Total elements in the positive list: {len(data_positive)}",
      f"Total elements in the positive list with ingi: {blastn_data[1].nunique()}",
      f"Percentage of elements with ingi: {blastn_data[1].nunique()/len(data_positive)*100:.2f}%",
      sep="\n"  # one item per line, without the stray trailing space the default separator added
      )

Total elements in the negative list: 1951
Total elements in the negative list with ingi: 204 
Percentage of elements with ingi: 10.46%


In [7]:
# Number of distinct subject ids (column 1) with at least one ingi hit
blastn_data[1].nunique()

204

In [17]:
# Same percentage as in the summary above, rounded to two decimals
print(f"{round(blastn_data[1].nunique() / len(data_positive) * 100, 2)} %")

10.46 %


## Now let's compare against the hallmark sequences of LmSIDER2a and LmSIDER2b from Bringaud (2007)

In [24]:
# Hallmark sequences of the two LmSIDER2 subfamilies, used as BLAST queries below
LmSIDER2a = "CCCTGATGACGAGGGACACCTCAGCGTGGTATCAGGGTCCAGTACACCCCACTCTGTGAGGAAGCCGAGCAGCTCCCTC"
LmSIDER2b = "CCCTGCCAATGCCGAACCACTTCTGGTGGTGACAGGGTCCAGTGCCTACTACGTAGGGGAGGTCAGAGCGATGCATC"
# Report both lengths
print(
    "LmSIDER2a Hallmark is {} nt long".format(len(LmSIDER2a)),
    "\nLmSIDER2b Hallmark is {} nt long".format(len(LmSIDER2b)),
)

LmSIDER2a Hallmark is 79 nt long 
LmSIDER2b Hallmark is 77 nt long


In [38]:
# Make a fasta for the Hallmarks of LmSIDER2
# NOTE(review): as called here, both records get the id "ingi_1" (the default id prefix of
# fasta_creator2); only the description and the file name tell the two hallmarks apart.
fasta_creator2(LmSIDER2a, 1, "./blaster_pos/LmSIDER2a.fasta", "LmSIDER2a hallmark")
fasta_creator2(LmSIDER2b, 1, "./blaster_pos/LmSIDER2b.fasta", "LmSIDER2b hallmark")

In [12]:
# Search both hallmarks against the positives, first with a 60 % identity threshold
LmSIDER2a_data60 = blastn_blaster(
    "./blaster_pos/LmSIDER2a.fasta", "./blaster_pos/positives.fasta", 60
)
LmSIDER2b_data60 = blastn_blaster(
    "./blaster_pos/LmSIDER2b.fasta", "./blaster_pos/positives.fasta", 60
)
# len() counts the rows of the hit table; nunique() on column 1 counts distinct subject ids
print(" ".join([
    f"Total elements with LmSIDER2a in the positive list: {len(LmSIDER2a_data60)}",
    f"\n\tUnique elements: {LmSIDER2a_data60[1].nunique()}",
    f"\nTotal elements with LmSIDER2b in the positive list: {len(LmSIDER2b_data60)}",
    f"\n\tUnique elements: {LmSIDER2b_data60[1].nunique()}",
]))


Total elements with LmSIDER2a in the positive list: 339 
	Unique elements: 331 
Total elements with LmSIDER2b in the positive list: 538 
	Unique elements: 500


In [15]:
# Keep only the first row for each subject id (column 1), then show the resulting shapes
LmSIDER2a_data60_unique = LmSIDER2a_data60.loc[~LmSIDER2a_data60[1].duplicated()]
print(LmSIDER2a_data60_unique.shape)
LmSIDER2b_data60_unique = LmSIDER2b_data60.loc[~LmSIDER2b_data60[1].duplicated()]
print(LmSIDER2b_data60_unique.shape)

(331, 12)
(500, 12)


In [41]:
# Collect the subject ids (column 1) of each de-duplicated table,
# ordered by the number that follows "Seq_"
LmSIDER2a_list = sorted(
    LmSIDER2a_data60_unique.iloc[:, 1].tolist(),
    key=lambda seq_id: int(seq_id.split("_")[1]),
)

LmSIDER2b_list = sorted(
    LmSIDER2b_data60_unique.iloc[:, 1].tolist(),
    key=lambda seq_id: int(seq_id.split("_")[1]),
)


In [44]:
# Number of distinct elements hit by LmSIDER2a, followed by the full list of their ids
print(len(LmSIDER2a_list), LmSIDER2a_list, sep="\n")

331
['Seq_6', 'Seq_40', 'Seq_46', 'Seq_47', 'Seq_56', 'Seq_63', 'Seq_64', 'Seq_66', 'Seq_68', 'Seq_69', 'Seq_70', 'Seq_81', 'Seq_82', 'Seq_88', 'Seq_115', 'Seq_116', 'Seq_125', 'Seq_145', 'Seq_165', 'Seq_180', 'Seq_184', 'Seq_198', 'Seq_199', 'Seq_200', 'Seq_201', 'Seq_203', 'Seq_217', 'Seq_218', 'Seq_219', 'Seq_220', 'Seq_245', 'Seq_246', 'Seq_250', 'Seq_251', 'Seq_252', 'Seq_268', 'Seq_269', 'Seq_272', 'Seq_300', 'Seq_316', 'Seq_322', 'Seq_339', 'Seq_340', 'Seq_341', 'Seq_342', 'Seq_343', 'Seq_344', 'Seq_346', 'Seq_351', 'Seq_355', 'Seq_362', 'Seq_380', 'Seq_394', 'Seq_419', 'Seq_439', 'Seq_440', 'Seq_441', 'Seq_442', 'Seq_443', 'Seq_444', 'Seq_445', 'Seq_446', 'Seq_447', 'Seq_474', 'Seq_479', 'Seq_482', 'Seq_497', 'Seq_498', 'Seq_521', 'Seq_522', 'Seq_523', 'Seq_524', 'Seq_531', 'Seq_534', 'Seq_544', 'Seq_546', 'Seq_547', 'Seq_553', 'Seq_554', 'Seq_558', 'Seq_559', 'Seq_560', 'Seq_561', 'Seq_585', 'Seq_592', 'Seq_594', 'Seq_595', 'Seq_597', 'Seq_598', 'Seq_627', 'Seq_631', 'Seq_632'

In [45]:
# Number of distinct elements hit by LmSIDER2b, followed by the full list of their ids
print(len(LmSIDER2b_list), LmSIDER2b_list, sep="\n")

500
['Seq_2', 'Seq_3', 'Seq_5', 'Seq_6', 'Seq_27', 'Seq_29', 'Seq_31', 'Seq_39', 'Seq_40', 'Seq_50', 'Seq_51', 'Seq_52', 'Seq_53', 'Seq_54', 'Seq_68', 'Seq_70', 'Seq_77', 'Seq_104', 'Seq_143', 'Seq_144', 'Seq_149', 'Seq_150', 'Seq_151', 'Seq_152', 'Seq_157', 'Seq_174', 'Seq_184', 'Seq_185', 'Seq_186', 'Seq_190', 'Seq_198', 'Seq_199', 'Seq_200', 'Seq_201', 'Seq_203', 'Seq_217', 'Seq_218', 'Seq_219', 'Seq_220', 'Seq_222', 'Seq_223', 'Seq_229', 'Seq_230', 'Seq_231', 'Seq_232', 'Seq_258', 'Seq_263', 'Seq_264', 'Seq_268', 'Seq_269', 'Seq_272', 'Seq_286', 'Seq_300', 'Seq_306', 'Seq_308', 'Seq_310', 'Seq_311', 'Seq_312', 'Seq_313', 'Seq_316', 'Seq_317', 'Seq_318', 'Seq_319', 'Seq_321', 'Seq_325', 'Seq_326', 'Seq_328', 'Seq_329', 'Seq_330', 'Seq_331', 'Seq_332', 'Seq_333', 'Seq_334', 'Seq_335', 'Seq_336', 'Seq_337', 'Seq_356', 'Seq_357', 'Seq_358', 'Seq_359', 'Seq_360', 'Seq_362', 'Seq_363', 'Seq_367', 'Seq_369', 'Seq_371', 'Seq_372', 'Seq_385', 'Seq_386', 'Seq_388', 'Seq_392', 'Seq_393', 'Seq

In [52]:
# Now let's count elements in LmSIDER2a which are in LmSIDER2b
# Membership is tested against sets (constant time per lookup) instead of scanning a list
# for every element; the lists are still the ones iterated, so their order is preserved.
_LmSIDER2a_ids = set(LmSIDER2a_list)
_LmSIDER2b_ids = set(LmSIDER2b_list)
LmSIDER2a_in_LmSIDER2b = [x for x in LmSIDER2a_list if x in _LmSIDER2b_ids]  # Elements in both lists
LmSIDER2a_notin_LMSIDER2b = [x for x in LmSIDER2a_list if x not in _LmSIDER2b_ids]  # Elements only in LmSIDER2a
LmSIDER2b_notin_LMSIDER2a = [x for x in LmSIDER2b_list if x not in _LmSIDER2a_ids]  # Elements only in LmSIDER2b
# The three groups are disjoint, so their sizes add up to the number of elements hit by either hallmark
Total_elements = len(LmSIDER2a_in_LmSIDER2b) + len(LmSIDER2a_notin_LMSIDER2b) + len(LmSIDER2b_notin_LMSIDER2a)
# Percentages are relative to the number of rows in data_positive
print(f"There are {len(LmSIDER2a_in_LmSIDER2b)} elements in both lists",
      f"\n\tPercent of all elements: {len(LmSIDER2a_in_LmSIDER2b)/data_positive.shape[0]*100:.2f}%")
print(f"There are {len(LmSIDER2a_notin_LMSIDER2b)} elements only in LmSIDER2a",
      f"\n\tPercent of all elements: {len(LmSIDER2a_notin_LMSIDER2b)/data_positive.shape[0]*100:.2f}%")
print(f"There are {len(LmSIDER2b_notin_LMSIDER2a)} elements only in LmSIDER2b",
      f"\n\tPercent of all elements: {len(LmSIDER2b_notin_LMSIDER2a)/data_positive.shape[0]*100:.2f}%")
print(f"Total elements: {Total_elements}",
      f"\n\tPercent of all elements: {Total_elements/data_positive.shape[0]*100:.2f}%")

There are 145 elements in both lists 
	Percent of all elements: 7.43%
There are 186 elements only in LmSIDER2a 
	Percent of all elements: 9.53%
There are 355 elements only in LmSIDER2b 
	Percent of all elements: 18.20%
Total elements: 686 
	Percent of all elements: 35.16%


In [55]:
# Repeat both searches with the identity threshold set to 0
LmSIDER2a_data0 = blastn_blaster(
    "./blaster_pos/LmSIDER2a.fasta", "./blaster_pos/positives.fasta", 0
)
LmSIDER2b_data0 = blastn_blaster(
    "./blaster_pos/LmSIDER2b.fasta", "./blaster_pos/positives.fasta", 0
)
# len() counts the rows of the hit table; nunique() on column 1 counts distinct subject ids
print(" ".join([
    f"Total elements with LmSIDER2a in the positive list: {len(LmSIDER2a_data0)}",
    f"\n\tUnique elements: {LmSIDER2a_data0[1].nunique()}",
    f"\nTotal elements with LmSIDER2b in the positive list: {len(LmSIDER2b_data0)}",
    f"\n\tUnique elements: {LmSIDER2b_data0[1].nunique()}",
]))

Total elements with LmSIDER2a in the positive list: 339 
	Unique elements: 331 
Total elements with LmSIDER2b in the positive list: 538 
	Unique elements: 500


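You can see the counts are exactly the same as with the 60% identity threshold, which means that none of the reported hits has an identity below 60%.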