# Code

## Needed modules

In [26]:
# Needed modules
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Load data

In [27]:
# Load all the data
df_main = pd.read_csv("./data/consensus+LmSIDER2A+B/AL_DATA.csv", sep=",", header=0)
print(df_main.shape)
print(df_main.dtypes)
df_main.head()

(2092, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [28]:
# Add a column holding the number of characters of each sseq string
df_main["length"] = df_main["sseq"].map(len)

In [29]:
# Check df
df_main.head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...,806
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927


In [30]:
# check dtypes
df_main.dtypes

sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
length      int64
dtype: object

In [31]:
# describe length column
df_main["length"].describe()

count     2092.000000
mean       827.937859
std        467.971245
min        101.000000
25%        649.000000
50%        770.000000
75%        921.000000
max      10546.000000
Name: length, dtype: float64

In [32]:
# Let's look at the elements whose length is above the third quartile (Q3).
# The quartile is truncated to an integer before the comparison.
long_seqs = df_main.loc[
    df_main["length"] > int(df_main["length"].quantile(0.75))
].copy()
long_seqs

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
10,LinJ.01,145322,146653,plus,ATCCCCACGGGGTGGTGAGGCGAGAAGTCAGGGGTCGGGCACGCGC...,1332
...,...,...,...,...,...,...
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
2075,LinJ.36,2504116,2505057,plus,TGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCTGCC...,942
2076,LinJ.36,2533606,2534550,plus,CAACGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCT...,945
2081,LinJ.36,2600817,2601884,plus,GCGCATGCCGAGCACCGCTGGCATGTGGTGTGCCGCATCCGACCGA...,1068


In [33]:
# Count how many long sequences share each length (one-column DataFrame)
long_seqs_values = long_seqs["length"].value_counts(sort=False).to_frame()

In [34]:
# Order the counts from the longest length to the shortest
long_seqs_values = long_seqs_values.sort_index(ascending=False)
long_seqs_values

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
10546,1
5635,1
4239,1
4160,1
4096,1
...,...
927,5
926,2
924,1
923,3


Now I have two DataFrames:
- `df_main` ==> all 2092 elements
- `long_seqs` ==> subset of `df_main` containing the elements whose length is above Q3 (the third quartile)

## Main code

### Prepare functions

In [35]:
# Prepare functions
# Needed functions
def fasta_creator(csv_input, output_path):
    """Write every row of a table of sequences to one multi-FASTA file.

    Parameters
    ----------
    csv_input : pandas.DataFrame
        Table with at least the columns ``sseq`` (the sequence string) and
        ``sseqid``. Each record ID is built as ``Seq_<row index>_<sseqid>``.
    output_path : str
        Path of the FASTA file to write.
    """
    records = [
        SeqRecord(
            Seq(row["sseq"]),
            id=f"Seq_{row_label}_{row['sseqid']}",
            description="Leishmania infantum",
        )
        for row_label, row in csv_input.iterrows()
    ]
    SeqIO.write(records, output_path, "fasta")

def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    Parameters
    ----------
    path_input : str or os.PathLike
        FASTA file to index (passed to ``-in``).
    path_output : str or os.PathLike
        Prefix of the database files to create (passed to ``-out``).

    Raises
    ------
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status; its error output is
        available on the exception's ``stderr`` attribute.
    """
    # The command is an argument list executed without a shell, so paths that
    # contain spaces or shell metacharacters reach makeblastdb unchanged.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",  # keep the sequence IDs in the database
        "-out", str(path_output),
    ]
    # stdout is still discarded to keep the notebook quiet; stderr is captured
    # (not printed) so that a failure is reported instead of silently ignored.
    subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        check=True,
    )

def blastn_blaster(query_path, dict_path):
    cmd = "blastn -word_size 11" \
        + " -query " + query_path \
        + " -db " + dict_path \
        + " -outfmt '10 qseqid sseqid sstrand pident qstart qend sstart send evalue bitscore length qlen qcovs slen mismatch gapopen gaps'"
    data = subprocess.run(cmd, shell=True, capture_output=True, text=True, universal_newlines=True, executable='/usr/bin/bash')  # Important the E value
    data = data.stdout
    data = pd.DataFrame([x.split(",") for x in data.split("\n") if x])
    if not data.empty:  # If the dataframe is not empty
        data.columns = ["qseqid", "sseqid", "sstrand", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen", "mismatch", "gapopen", "gaps"]
        data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen', 'mismatch', 'gapopen', 'gaps']] = data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen', 'mismatch', 'gapopen', 'gaps']].apply(pd.to_numeric)
    else:  # If the dataframe is empty
        data = pd.DataFrame(columns=["qseqid", "sseqid", "sstrand", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen", "mismatch", "gapopen", "gaps"])  # Create an empty dataframe
    return data  

### Prepare paths

In [37]:
# Prepare paths
path_genome = "../0.Data/genome/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

### Prepare blastn dicts

In [None]:
# Build the BLAST database; the genome FASTA path doubles as the database prefix (-out)
blastn_dic(path_input=path_genome, path_output=path_genome)

### Test normal sequence

In [41]:
df_main.iloc[1,:]

sseqid                                               LinJ.01
sstart                                                 24093
send                                                   25080
sstrand                                                 plus
sseq       GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
length                                                   988
Name: 1, dtype: object

In [49]:
# One-entry mapping {sequence name: sequence} for row 1 of df_main
test_seq_1 = {
    f"Seq_{df_main.loc[1,'sseqid']}_{df_main.loc[1,'sstrand']}_{df_main.loc[1,'sstart']}-{df_main.loc[1,'send']}": df_main.loc[1, "sseq"],
}
test_seq_1

{'Seq_LinJ.01_plus_24093-25080': 'GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCCCTCCACCCCCGCTCTCTCTGTGTACGGAAGCCCGGCAGCCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGGCTGGGCCCTTACGGCGTTGGGCGAGGTCGGCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCGCTGCGTCCGAGCCACCCGCGACAGTGAGCACAGGCTTGCACGGTCCATGCGATGGGCGGAGCGTGTCAGCGCGGCTCGAGCGTGTCGCAGCCGGCCCCTCACACTGGCCCACGGCGAGGGGTGCGGGGGCCTGAGTGTCACCGCGACGGGGAGACGCACCCAGGGAGGAGGTGGGGGGAGTGGGGACCGGCATGATGGAGGGGCGGCTGTGTGGCGATGTGTGTGTATGTGGGTGTCGTCGCGTTTGAGGCAGGAGCCGTGCTGCGACGACCGAGCCGGCGCACTGCTGCAGCGCGCGTGTGTCTTGCGCTGCTTCGCACCAGGCGATGAGAGTGGGGTGGGGTGCCTGCAGCACTCGGCGGCGGGGGGGTGCAGAGCGGCCTCCACCTCGCAGTGTGCGGCAGCGAAAACGGACACGCGGAAGAGCAAACACCAAGCCCGCACCCCTCTCCTCAGCCTCAGGAACGGGCTCCTAAAACGGCTCGATGCTATCGAGCTCCGCCTGCGCAAAGGGCGCGGCACCATGCGCCACACCGCTCTCAGACAGGAGTCGCAAGATGGAGTTCACAGCAACGCGATACGACGGCGCGGCTACCTCTGTCAGCATATTGGTGCGGCGGTACGCCTCCATCGCGACGCGGCGCATCTCACTGTGCGAAGAGAGGCGCCGCTCCTCTGCGTCCTCAGGAGCACGCCTGCGGATACGCGCCACCCGCACCGCCTCATCGACGC

#### Test GFF and coor

In [102]:
def fasta_creator2(sequence, fasta_output_path):
    """Write a single sequence to a FASTA file under the fixed record ID ``test``.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence to write.
    fasta_output_path : str
        Path of the FASTA file to create.
    """
    # Without an explicit description Biopython falls back to the placeholder
    # "<unknown description>" and prints it in the FASTA header. An empty
    # description keeps the header as a plain ">test".
    record = SeqRecord(Seq(sequence), id="test", description="")
    SeqIO.write(record, fasta_output_path, "fasta")

def gff_creator(data_input, outh_path):
    """Export the query coordinates of BLAST hits as a headerless, tab-separated GFF-like file.

    Each hit becomes one line with nine fields: the fixed seqname ``test``,
    source ``CBM-302`` and feature ``SIDER_test``, the hit's ``qstart`` and
    ``qend`` as start and end, and ``.`` for score, strand, frame and
    attribute.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain the columns ``qseqid``, ``qstart`` and ``qend``.
    outh_path : str
        Path of the file to write.
    """
    field_order = [
        "seqname", "source", "feature", "start", "end",
        "score", "strand", "frame", "attribute",
    ]
    placeholder = "."  # used for the fields with no data

    # Work on a copy of the needed columns only
    coords = data_input[["qseqid", "qstart", "qend"]].copy()
    gff_table = (
        coords[["qstart", "qend"]]
        .rename(columns={"qstart": "start", "qend": "end"})
        .assign(
            seqname="test",
            source="CBM-302",
            feature="SIDER_test",
            score=placeholder,
            strand=placeholder,
            frame=placeholder,
            attribute=placeholder,
        )
        .loc[:, field_order]
    )

    # No header and no index: plain GFF rows
    gff_table.to_csv(outh_path, sep="\t", header=False, index=False)

def df_blasting_gff(dict, genome_path):  # version with csv
    """BLAST the first entry of a ``{name: sequence}`` mapping against a database.

    Parameters
    ----------
    dict : dict
        Mapping of sequence name to sequence; only the first item is used.
    genome_path : str
        BLAST database prefix forwarded to ``blastn_blaster``.

    Returns
    -------
    pandas.DataFrame
        The hits returned by ``blastn_blaster`` whose ``length`` lies between
        100 and 1500 (both inclusive).
    """
    first_name, first_seq = list(dict.items())[0]
    # Bash process substitution: the FASTA record is generated on the fly
    # instead of being read from a file on disk.
    inline_query = f"<(echo -e '>{first_name}\n{first_seq}')"
    hits = blastn_blaster(query_path=inline_query, dict_path=genome_path)
    length_in_range = hits["length"].between(100, 1500)  # Filter by length
    return hits[length_in_range]

def gff_viewer_repeated(main_df, slice_num, path_folder, path_genome, repeat_num):
    """BLAST one sequence against the genome and export progressively stricter hit sets.

    A folder named after the sequence is created inside ``path_folder``. It
    receives ``test.fasta`` (the query), ``test_None.gff`` (all hits returned
    by ``df_blasting_gff``) and one ``test_Round_<i>-<median>.gff`` per round.
    In every round only the hits whose ``evalue`` is less than or equal to the
    median E-value of the previous hit set are kept.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Table with the columns ``sseqid``, ``sstrand``, ``sstart``, ``send``
        and ``sseq``.
    slice_num : index label
        Label (used with ``.loc``) of the row to analyse.
    path_folder : str
        Parent folder for the output folder.
    path_genome : str
        BLAST database prefix.
    repeat_num : int
        Number of median-filtering rounds run after the unfiltered export.
    """
    sseqid, sstrand, sstart, send, sequence = (
        main_df.loc[slice_num, column]
        for column in ("sseqid", "sstrand", "sstart", "send", "sseq")
    )
    # Built once and reused as the query name and as the output folder name
    seq_label = f"Seq_{sseqid}_{sstrand}_{sstart}-{send}"
    print(f"Analyzing sequence {slice_num} with length {len(sequence)}")

    folder_name = os.path.join(path_folder, seq_label)
    print(f"Creating folder {folder_name}")
    os.makedirs(folder_name, exist_ok=True)
    fasta_creator2(sequence=sequence,
                   fasta_output_path=os.path.join(folder_name, "test.fasta"))

    blastn_df = df_blasting_gff(dict={seq_label: sequence},
                                genome_path=path_genome)

    # Unfiltered hit set
    gff_creator(blastn_df, os.path.join(folder_name, "test_None.gff"))

    # Without hits there is no median to filter on: the E-value column of an
    # empty result is not numeric, so the median lookup used to fail, and an
    # all-filtered result produced a NaN threshold and empty "nan" files.
    if blastn_df.empty:
        print("No BLAST hits to filter; skipping the E-value rounds")
        return

    q2_evalue = blastn_df["evalue"].quantile(0.5)  # Median
    for round_num in range(1, repeat_num + 1):
        print(f"Round {round_num} with Q2 = {q2_evalue}")
        blastn_df = blastn_df[blastn_df["evalue"] <= q2_evalue]
        path_gff = os.path.join(folder_name, f"test_Round_{round_num}-{q2_evalue}.gff")
        gff_creator(blastn_df, path_gff)
        # Median of the surviving hits becomes the threshold of the next round
        q2_evalue = blastn_df["evalue"].quantile(0.5)


In [88]:
df_main.head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...,806
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927


In [93]:
gff_viewer_repeated(main_df=df_main, 
                    slice_num=1, 
                    path_folder=".tests/", 
                    path_genome=path_genome, 
                    repeat_num=10)  # writes the unfiltered GFF ("None") plus 10 filtering rounds

Analyzing sequence 1 with length 988
Round 1 with Q2 = 1.83e-09
Round 2 with Q2 = 6.53e-14
Round 3 with Q2 = 2.33e-18
Round 4 with Q2 = 1.39e-20
Round 5 with Q2 = 1.79e-24
Round 6 with Q2 = 1.1435e-24
Round 7 with Q2 = 3.74e-46
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


Let's check it with a small sequence.

In [104]:
df_main[df_main["length"] < 200]

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
31,LinJ.03,237552,237702,plus,GTGCGGGGGAGCCAGGCAGCCCACCCACCCACCCATCCCCTATCCC...,151
77,LinJ.06,102999,103131,plus,GCTCCCGGTGTCCGCAGGACAGGTCTGGACTGGCGGTATGCCGAAG...,133
95,LinJ.06,513017,513205,plus,CCCCACCCGGCCTGTAGCACGCCCCATCGGCTGCTGCAAAGCAGCC...,189
120,LinJ.07,464303,464453,plus,GGGGGAGGCCAAGCGGCCTGCAGCCGGCCCTTGGGCGCGACTGCGG...,151
130,LinJ.07,592720,592866,plus,TGGGGAAGGGTTTTTCTCTCCACTGATTCCTTGCACAGCGACACCC...,147
...,...,...,...,...,...,...
1869,LinJ.35,1074531,1074700,plus,CCTGTCACAGACCCCATCGCGTGGTGCGAGGCAGCCGTGGCCGCAC...,170
1937,LinJ.36,11081,11196,plus,TCTTCTCGGGCGAATACCGTCTGCCGTCGAATCGGCTCAACGCGTC...,116
1940,LinJ.36,25331,25446,plus,TCTTCTCGGGCGAATACCGTCTGCCGTCGAATCGGCTCAACGCGTC...,116
1962,LinJ.36,319917,320017,plus,GCCCCCTGATGACGAGGGAGCATGCCCGTGCGTGGTATCACGGGGC...,101


In [105]:
gff_viewer_repeated(main_df=df_main, 
                    slice_num=2009, 
                    path_folder=".tests/", 
                    path_genome=path_genome, 
                    repeat_num=10)  # writes the unfiltered GFF ("None") plus 10 filtering rounds

Analyzing sequence 2009 with length 198
Creating folder .tests/Seq_LinJ.36_plus_1063558-1063755
Round 1 with Q2 = 1.54e-08
Round 2 with Q2 = 5.52e-13
Round 3 with Q2 = 3.3e-15
Round 4 with Q2 = 7.09e-17
Round 5 with Q2 = 1.52e-18
Round 6 with Q2 = 9.100000000000001e-21
Round 7 with Q2 = 9.040000000000001e-26
Round 8 with Q2 = 4.5200448500000005e-26
Round 9 with Q2 = 4.485e-31
Round 10 with Q2 = 2.26e-101


### Test sequence big

In [95]:
long_seqs_df = long_seqs.copy()
long_seqs_df

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
10,LinJ.01,145322,146653,plus,ATCCCCACGGGGTGGTGAGGCGAGAAGTCAGGGGTCGGGCACGCGC...,1332
...,...,...,...,...,...,...
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
2075,LinJ.36,2504116,2505057,plus,TGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCTGCC...,942
2076,LinJ.36,2533606,2534550,plus,CAACGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCT...,945
2081,LinJ.36,2600817,2601884,plus,GCGCATGCCGAGCACCGCTGGCATGTGGTGTGCCGCATCCGACCGA...,1068


In [97]:
long_seqs_df[long_seqs_df["length"] > 1500].sort_values(by="length", ascending=False)

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
1709,LinJ.34,813947,824492,plus,GGGACGGATAGACGGGGAGAGACATGCGGGGGAGGCAGTGGTGACG...,10546
1708,LinJ.34,808220,813854,plus,ACGACGCAGCACAGCGGACGAGGAGAGAGGAGGCGGTAGGCAGGAG...,5635
610,LinJ.19,608398,612636,plus,GCTACTGCTATGTCGGCGCTTAGGCCGTGGGTGGGAGCTGCATTGG...,4239
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
1749,LinJ.34,1242905,1247000,plus,GACGCGCGCCTCCCCTCGACCCCCGCTGTCGCACACGGCATGCGCG...,4096
...,...,...,...,...,...,...
1934,LinJ.35,1995790,1997317,plus,CAGCCTCGTCGGTGGTGCTGGCGTGTGTCGTGGTTTTATTGCCGCT...,1528
1271,LinJ.30,497129,498641,plus,CTACGGAGGCCCTGCACCACGCCCTTCAAACCGCTCACGATGGGCG...,1513
1268,LinJ.30,490955,492467,plus,CTACGGAGGCCCTGCACCACGCCCTTCAAACCGCTCACGATGGGCG...,1513
1807,LinJ.35,114478,115984,plus,TTCTTCGCTTTTCGCTCTTCCTCTCTCGCCGTGATGATGCCGGCCA...,1507


In [98]:
gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1709, 
                    path_folder=".tests/", 
                    path_genome=path_genome, 
                    repeat_num=10)  # writes the unfiltered GFF ("None") plus 10 filtering rounds

Analyzing sequence 1709 with length 10546
Round 1 with Q2 = 6.78e-48
Round 2 with Q2 = 1.4499999999999998e-54
Round 3 with Q2 = 0.0
Round 4 with Q2 = 0.0
Round 5 with Q2 = 0.0
Round 6 with Q2 = 0.0
Round 7 with Q2 = 0.0
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


In [99]:
gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1749, 
                    path_folder=".tests/", 
                    path_genome=path_genome, 
                    repeat_num=10)  # writes the unfiltered GFF ("None") plus 10 filtering rounds

Analyzing sequence 1749 with length 4096
Round 1 with Q2 = 1.27e-16
Round 2 with Q2 = 3.73e-25
Round 3 with Q2 = 4.36e-51
Round 4 with Q2 = 8.859999999999999e-93
Round 5 with Q2 = 0.0
Round 6 with Q2 = 0.0
Round 7 with Q2 = 0.0
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


In [103]:
gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1271, 
                    path_folder=".tests/", 
                    path_genome=path_genome, 
                    repeat_num=10)  # writes the unfiltered GFF ("None") plus 10 filtering rounds

Analyzing sequence 1271 with length 1513
Creating folder .tests/Seq_LinJ.30_plus_497129-498641
Round 1 with Q2 = 7.79e-15
Round 2 with Q2 = 1.2800000000000001e-27
Round 3 with Q2 = 1.6400000000000001e-31
Round 4 with Q2 = 1.61e-46
Round 5 with Q2 = 7.42e-50
Round 6 with Q2 = 1.58e-56
Round 7 with Q2 = 7.9000000000168e-57
Round 8 with Q2 = 3.36e-68
Round 9 with Q2 = 3.36e-68
Round 10 with Q2 = 3.36e-68
