# Code

## Needed modules

In [31]:
# Needed modules
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Load data

In [4]:
# Load all the data
df_main = pd.read_csv(
    "./data/consensus+LmSIDER2A+B/AL_DATA.csv",
    sep=",",
    header=0,
)
# Quick sanity check: dimensions and column dtypes, then a preview of the rows
print(df_main.shape, df_main.dtypes, sep="\n")
df_main.head()

(2092, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [7]:
# Create a column with the length (number of characters) of each sseq.
# The vectorized `.str.len()` accessor replaces the redundant `lambda x: len(x)`
# wrapper and yields the same integer lengths for string values.
df_main["length"] = df_main["sseq"].str.len()

In [8]:
# Check df: preview the first rows to confirm the "length" column was added
df_main.head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...,806
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927


In [15]:
# Check dtypes: "length" should show up as a numeric column next to the original ones
df_main.dtypes

sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
length      int64
dtype: object

In [9]:
# Describe the "length" column: count, mean, std, min, quartiles and max
df_main["length"].describe()

count     2092.000000
mean       827.937859
std        467.971245
min        101.000000
25%        649.000000
50%        770.000000
75%        921.000000
max      10546.000000
Name: length, dtype: float64

In [19]:
# Let's check the elements whose length is above the third quartile (Q3).
# Series.quantile(0.75) is the value describe() reports as "75%", without
# computing the rest of the summary statistics. Lengths are integers, so
# comparing against the untruncated quartile selects the same rows as before.
q3_length = df_main["length"].quantile(0.75)
long_seqs = df_main.loc[df_main["length"] > q3_length].copy()
long_seqs

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
10,LinJ.01,145322,146653,plus,ATCCCCACGGGGTGGTGAGGCGAGAAGTCAGGGGTCGGGCACGCGC...,1332
...,...,...,...,...,...,...
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
2075,LinJ.36,2504116,2505057,plus,TGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCTGCC...,942
2076,LinJ.36,2533606,2534550,plus,CAACGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCT...,945
2081,LinJ.36,2600817,2601884,plus,GCGCATGCCGAGCACCGCTGGCATGTGGTGTGCCGCATCCGACCGA...,1068


In [25]:
# Count how many long elements share each length (counts left unsorted here)
long_seqs_values = long_seqs["length"].value_counts(sort=False).to_frame()

In [30]:
# Order the counts from the longest to the shortest length
long_seqs_values = long_seqs_values.sort_index(ascending=False)
long_seqs_values

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
10546,1
5635,1
4239,1
4160,1
4096,1
...,...
927,5
926,2
924,1
923,3


Now I have two variables:
- `df_main` ==> all 2092 elements
- `long_seqs` ==> subset of `df_main` containing the elements whose length is > Q3

## Main code

### Prepare functions

In [34]:
# Prepare functions
# Needed functions
def fasta_creator(csv_input, output_path):
    """Write every row of ``csv_input`` as a record of a FASTA file.

    Parameters
    ----------
    csv_input : pandas.DataFrame
        Table iterated with ``iterrows()``; it must provide the columns
        ``sseq`` (the sequence) and ``sseqid`` (used in the record id).
    output_path : str
        Destination handed to ``SeqIO.write``.

    Each record id is ``Seq_<row index>_<sseqid>`` and every record carries
    the description "Leishmania infantum". Nothing is returned.
    """
    records = [
        SeqRecord(
            Seq(entry["sseq"]),
            id=f"Seq_{label}_{entry['sseqid']}",
            description="Leishmania infantum",
        )
        for label, entry in csv_input.iterrows()
    ]
    SeqIO.write(records, output_path, "fasta")

def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database by running ``makeblastdb``.

    Parameters
    ----------
    path_input : str
        Path of the FASTA file passed to ``makeblastdb -in``.
    path_output : str
        Database name/prefix passed to ``makeblastdb -out``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status; its captured stderr
        is available on the exception's ``stderr`` attribute.
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    """
    # The command is an argument list executed without a shell, so characters
    # in the paths that are special to the shell are passed through literally.
    cmd = [
        "makeblastdb",
        "-in", path_input,
        "-dbtype", "nucl",
        # "parse_seqids" is used to keep the sequence ID in the output.
        "-parse_seqids",
        "-out", path_output,
    ]
    # check=True: a failed database build is reported here instead of going
    # unnoticed. stdout stays silenced; stderr is captured for the exception.
    subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )

def blastn_blaster(query_path, dict_path):
    cmd = "blastn -word_size 11 -query " \
        + query_path + " -db " \
        + dict_path \
        + " -outfmt '10 qseqid sseqid pident qstart qend sstart send evalue bitscore length qlen qcovs slen'"
    data = subprocess.check_output(cmd, shell=True, universal_newlines=True)  # Important the E value
    data = pd.DataFrame([x.split(",") for x in data.split("\n") if x])
    if not data.empty:  # If the dataframe is not empty
        data.columns = ["qseqid", "sseqid", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen"]
        data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen']] = data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen']].apply(pd.to_numeric)
    else:  # If the dataframe is empty
        data = pd.DataFrame(columns=["qseqid", "sseqid", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen"])  # Create an empty dataframe
    return data  

### Prepare paths

In [32]:
# Prepare paths
# Relative path to the genome FASTA; it is handed to blastn_dic as both the
# makeblastdb input file and the output database prefix.
path_genome = "../0.Data/genome/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

### Prepare blastn dicts

In [None]:
# Build the BLAST nucleotide database from the genome FASTA. The same path is
# used for input and output, so makeblastdb's "-out" prefix is the FASTA path itself.
blastn_dic(path_input=path_genome, path_output=path_genome)

### Test