# Code

## Needed modules

In [2]:
# Needed modules
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Load data

In [3]:
# Load al the data
df_main = pd.read_csv("./data/consensus+LmSIDER2A+B/AL_DATA.csv", sep=",", header=0)
print(df_main.shape)
print(df_main.dtypes)
df_main.head()

(2092, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


In [4]:
# Create a column with the length of the sseq
df_main["length"] = df_main["sseq"].apply(lambda x: len(x))

In [5]:
# Check df
df_main.head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...,806
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927


In [6]:
# check dtypes
df_main.dtypes

sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
length      int64
dtype: object

In [7]:
# describe length column
df_main["length"].describe()

count     2092.000000
mean       827.937859
std        467.971245
min        101.000000
25%        649.000000
50%        770.000000
75%        921.000000
max      10546.000000
Name: length, dtype: float64

In [8]:
# Le'ts check the length of the elements above the third quartile
long_seqs = df_main[df_main["length"] > int(df_main["length"].describe()["75%"])].copy()
long_seqs

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
10,LinJ.01,145322,146653,plus,ATCCCCACGGGGTGGTGAGGCGAGAAGTCAGGGGTCGGGCACGCGC...,1332
...,...,...,...,...,...,...
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
2075,LinJ.36,2504116,2505057,plus,TGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCTGCC...,942
2076,LinJ.36,2533606,2534550,plus,CAACGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCT...,945
2081,LinJ.36,2600817,2601884,plus,GCGCATGCCGAGCACCGCTGGCATGTGGTGTGCCGCATCCGACCGA...,1068


In [9]:
long_seqs_values = pd.DataFrame(long_seqs["length"].value_counts(sort=False))

In [10]:
long_seqs_values.sort_index(inplace=True, ascending=False)
long_seqs_values

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
10546,1
5635,1
4239,1
4160,1
4096,1
...,...
927,5
926,2
924,1
923,3


Now I have two values:
- `df_main` ==> with the 2092 elements
- `long_seqs`==> subset of  `df_main` with the elements with a length >Q3

## Main code

### Prepare functions

In [11]:
# Prepare functions
# Needed functions
def fasta_creator(csv_input, output_path):
    matrix = []
    for index, row in csv_input.iterrows():
        rec = SeqRecord(Seq(row["sseq"]), 
                        id = f"Seq_{index}_{row['sseqid']}",
                        description = "Leishmania infantum"
                        )
        matrix.append(rec)
    SeqIO.write(matrix, output_path, "fasta")

def blastn_dic(path_input, path_output):
    # "parse_seqids" is used to keep the sequence ID in the output.
    cmd = f"makeblastdb -in {path_input} -dbtype nucl -parse_seqids -out {path_output}"
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def blastn_blaster(query_path, dict_path):
    cmd = "blastn -word_size 11" \
        + " -query " + query_path \
        + " -db " + dict_path \
        + " -outfmt '10 qseqid sseqid sstrand pident qstart qend sstart send evalue bitscore length qlen qcovs slen mismatch gapopen gaps'"
    data = subprocess.run(cmd, shell=True, capture_output=True, text=True, universal_newlines=True, executable='/usr/bin/bash')  # Important the E value
    data = data.stdout
    data = pd.DataFrame([x.split(",") for x in data.split("\n") if x])
    if not data.empty:  # If the dataframe is not empty
        data.columns = ["qseqid", "sseqid", "sstrand", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen", "mismatch", "gapopen", "gaps"]
        data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen', 'mismatch', 'gapopen', 'gaps']] = data[['pident',  'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'length', 'qlen', 'qcovs', 'slen', 'mismatch', 'gapopen', 'gaps']].apply(pd.to_numeric)
    else:  # If the dataframe is empty
        data = pd.DataFrame(columns=["qseqid", "sseqid", "sstrand", "pident", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "length", "qlen", "qcovs", "slen", "mismatch", "gapopen", "gaps"])  # Create an empty dataframe
    return data  

### Prepare paths

In [12]:
# Prepare paths
path_genome = "../0.Data/genome/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

### Prepare blastn dicts

In [13]:
blastn_dic(path_input=path_genome, path_output=path_genome)

### Test normal sequence

In [14]:
df_main.iloc[1,:]

sseqid                                               LinJ.01
sstart                                                 24093
send                                                   25080
sstrand                                                 plus
sseq       GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
length                                                   988
Name: 1, dtype: object

In [15]:
test_seq_1 = {}
test_seq_1[f"Seq_{df_main.loc[1,'sseqid']}_{df_main.loc[1,'sstrand']}_{df_main.loc[1,'sstart']}-{df_main.loc[1,'send']}"] = df_main.loc[1, "sseq"]
test_seq_1

{'Seq_LinJ.01_plus_24093-25080': 'GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCCCTCCACCCCCGCTCTCTCTGTGTACGGAAGCCCGGCAGCCCCCTCACCCTCTATCCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGGCTGGGCCCTTACGGCGTTGGGCGAGGTCGGCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCGCTGCGTCCGAGCCACCCGCGACAGTGAGCACAGGCTTGCACGGTCCATGCGATGGGCGGAGCGTGTCAGCGCGGCTCGAGCGTGTCGCAGCCGGCCCCTCACACTGGCCCACGGCGAGGGGTGCGGGGGCCTGAGTGTCACCGCGACGGGGAGACGCACCCAGGGAGGAGGTGGGGGGAGTGGGGACCGGCATGATGGAGGGGCGGCTGTGTGGCGATGTGTGTGTATGTGGGTGTCGTCGCGTTTGAGGCAGGAGCCGTGCTGCGACGACCGAGCCGGCGCACTGCTGCAGCGCGCGTGTGTCTTGCGCTGCTTCGCACCAGGCGATGAGAGTGGGGTGGGGTGCCTGCAGCACTCGGCGGCGGGGGGGTGCAGAGCGGCCTCCACCTCGCAGTGTGCGGCAGCGAAAACGGACACGCGGAAGAGCAAACACCAAGCCCGCACCCCTCTCCTCAGCCTCAGGAACGGGCTCCTAAAACGGCTCGATGCTATCGAGCTCCGCCTGCGCAAAGGGCGCGGCACCATGCGCCACACCGCTCTCAGACAGGAGTCGCAAGATGGAGTTCACAGCAACGCGATACGACGGCGCGGCTACCTCTGTCAGCATATTGGTGCGGCGGTACGCCTCCATCGCGACGCGGCGCATCTCACTGTGCGAAGAGAGGCGCCGCTCCTCTGCGTCCTCAGGAGCACGCCTGCGGATACGCGCCACCCGCACCGCCTCATCGACGC

#### Test GFF and coor

In [30]:
def fasta_creator2(sequence, fasta_output_path):
    rec = SeqRecord(Seq(sequence),
                    id="test"
                    )
    SeqIO.write(rec, fasta_output_path, "fasta")

def gff_creator(data_input, outh_path):
    # take only needed values
    data = data_input[["qseqid", "qstart", "qend"]].copy()
    # Prepare GFF type df
    gff_df = pd.DataFrame({
        'seqname': "test",
        'source': "CBM-302",
        'feature': "SIDER_test",  # Ensure this is the correct feature name
        'start': data.loc[:,"qstart"],
        'end': data.loc[:,"qend"],
        'score': ".",  # Placeholder if no score data
        'strand': ".",
        'frame': ".", # Placeholder if no frame data
        'attribute': "."
          })
    # Save to file
    gff_df.to_csv(outh_path, sep="\t", header=False, index=False)


def get_data_sequence(seq, genome_fasta, strand, start_coor, end_coor, name_chr):
    print(f"\t- Original coordinates: {start_coor}-{end_coor} ==> length: {end_coor - start_coor + 1}")
    start_coor = start_coor - 50 if start_coor - 50 > 0 else 1
    end_coor = end_coor + 50
    print(f"\t- New coordinates: {start_coor}-{end_coor} ==> length: {end_coor - start_coor + 1}")
    
    cmd = f"blastdbcmd -db {genome_fasta} -entry {name_chr} -range {start_coor}-{end_coor} -strand {strand} -outfmt %s"
    data = subprocess.run(cmd, shell=True, capture_output=True, text=True, universal_newlines=True, executable='/usr/bin/bash')
    data = data.stdout
    data = data.strip() # Remove the last \n
    return data  # Return the sequence


def df_blasting_gff(dict, genome_path):  # version with csv
    name_id = list(dict.keys())[0]
    seq = list(dict.values())[0]
    query = f"<(echo -e '>{name_id}\n{seq}')"  # Create a query in a bash tmp file

    data = blastn_blaster(query_path=query, dict_path=genome_path)

    return data

def gff_viewer_repeated(main_df, slice_num, path_folder, path_genome):
    main_dict = {}
    seq_name = f"Seq_{main_df.loc[slice_num,'sseqid']}_{main_df.loc[slice_num,'sstrand']}_{main_df.loc[slice_num,'sstart']}-{main_df.loc[slice_num,'send']}"
    main_dict[seq_name] = main_df.loc[slice_num, "sseq"]
    print(f"Analyzing sequence {slice_num} with length {len(main_df.loc[slice_num, 'sseq'])}")

    start_coord = main_df.loc[slice_num, "sstart"]
    end_coord = main_df.loc[slice_num, "send"]
    strand = main_df.loc[slice_num, "sstrand"]
    name_chr = main_df.loc[slice_num, "sseqid"]
    len_seq = abs(start_coord - end_coord) + 1  ## should be int

    folder_name = os.path.join(path_folder, f"Seq_{main_df.loc[slice_num,'sseqid']}_{main_df.loc[slice_num,'sstrand']}_{main_df.loc[slice_num,'sstart']}-{main_df.loc[slice_num,'send']}_size-{len_seq}")
    print(f"Creating folder {folder_name}")
    os.makedirs(folder_name, exist_ok=True)
    path_out_fasta = os.path.join(folder_name, f"{os.path.basename(folder_name)}.fasta")
    fasta_creator2(sequence=list(main_dict.values())[0], 
                   fasta_output_path=path_out_fasta)
    
    print(f"\t- Start coor: {start_coord}\n",
          f"\t- End coor: {end_coord}\n",
          f"\t- chr name: {name_chr}")
    blastn_df = df_blasting_gff(dict=main_dict, 
                                genome_path=path_genome)
    # remove the row with the same sstart and send values thand start_coor and end_coor in plus way
    blastn_df = blastn_df[~(
        ((blastn_df["sstart"] >= start_coord) & (blastn_df["sstart"] <= end_coord)) |  # sstart is within the start and end coordinates
        ((blastn_df["send"] <= end_coord) & (blastn_df["send"] >= start_coord)) &      # send is within the start and end coordinates
        (blastn_df["sseqid"] == name_chr) &                                            # sseqid matches name_chr
        (blastn_df["sstrand"] == "plus")                                               # sstrand is "plus"
    )].copy()
    
    # do the same but for the minus one, where the coor are inverted
    blastn_df = blastn_df[~(
        ((blastn_df["sstart"] <= end_coord) & (blastn_df["sstart"] >= start_coord)) |
        ((blastn_df["send"] >= start_coord) & (blastn_df["send"] <= end_coord)) &
        (blastn_df["sseqid"] == name_chr) &
        (blastn_df["sstrand"] == "minus"))].copy()
    
    path_gff = os.path.join(folder_name, "test_None.gff")
    gff_creator(blastn_df, path_gff)
    return blastn_df, start_coord, end_coord, seq_name


In [34]:
path_bedops_folder = "./test_coordinates/"

# Correcting coordinates
def bedops_merge(main_df, slice_num, path_folder, path_genome):
    data_df,start_coord, end_coord, seq_name = gff_viewer_repeated(main_df=main_df, slice_num=slice_num, path_folder=path_folder, path_genome=path_genome)
    path_bedops_file = os.path.join(path_bedops_folder, "tmp.bed")
    data_bedops = data_df[['qstart', 'qend']].copy()  # in qstart and qend I don't have the "minus" coordinates problem
    data_bedops.insert(0, 'new_column', 'test')  # Add a new column with every row with the same value 'test'
    data_bedops.to_csv(path_bedops_file, sep="\t", header=False, index=False)
    cmd = f"bedops --merge {path_bedops_file}"
    data = subprocess.run(cmd, shell=True, capture_output=True, text=True, universal_newlines=True, executable='/usr/bin/bash')
    data = data.stdout   # Get the output
    data = pd.DataFrame([x.split("\t") for x in data.split("\n") if x], columns=['sseqid', 'qstart', 'qend'])
    data[['qstart', 'qend']] = data[['qstart', 'qend']].apply(pd.to_numeric)

    return data, start_coord, end_coord, seq_name

def corrected_coordinates(main_df, slice_num, path_folder, path_genome):
    data, start_coord, end_coord, seq_name = bedops_merge(main_df, slice_num, path_folder, path_genome)
    seq_dict = {}
    seq_dict[seq_name] = []
    more_than_one = {}
    for _, row in data.iterrows():
        new_start = start_coord + row["qstart"]
        new_end = end_coord + row["qend"]
        if abs(new_end - new_start) + 1 > 100:
            seq_dict[seq_name].append([new_start, new_end])  # Should add chromosome name and strand
        else:  # If the length is less than 100
            continue  # Skip this iteration
    if len(seq_dict.values()) > 1:  # If there are more than one element in the dictionary
        more_than_one[seq_name] = seq_dict[seq_name].keys()  # Save the keys
    return seq_dict, more_than_one

In [21]:
path_folder_test = "./test_coordinates/"

In [22]:
df_main.head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...,806
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927


In [35]:
corrected_coordinates(main_df=df_main, slice_num=1, path_folder=path_folder_test, path_genome=path_genome)

Analyzing sequence 1 with length 988
Creating folder ./test_coordinates/Seq_LinJ.01_plus_24093-25080_size-988
	- Start coor: 24093
 	- End coor: 25080
 	- chr name: LinJ.01


({'Seq_LinJ.01_plus_24093-25080': [[24116, 25746], [25031, 26052]]}, {})

In [115]:
test_988_df = gff_viewer_repeated(main_df=df_main, 
                    slice_num=1, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 10 iterations
test_988_df["qlength"] = abs(test_988_df["qstart"] - test_988_df["qend"]) + 1
test_988_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 1 with length 988
Creating folder ./test_no_filter/Seq_LinJ.01_plus_24093-25080_size-988
	- Start coor: 24093
 	- End coor: 25080
 	- chr name: LinJ.01
Round 1 with Q2 = 0.000662
Round 2 with Q2 = 8.5e-08
Round 3 with Q2 = 1.09e-11
Round 4 with Q2 = 1.157e-14
Round 5 with Q2 = 2.33e-18
Round 6 with Q2 = 1.39e-20
Round 7 with Q2 = 1.79e-24
Round 8 with Q2 = 1.79e-24
Round 9 with Q2 = 1.79e-24
Round 10 with Q2 = 1.79e-24


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.01_plus_24093-25080,LinJ.01,minus,99.534,23,666,55528,54885,0.000000e+00,1173.0,644,988,100,278267,3,0,0,644
2,Seq_LinJ.01_plus_24093-25080,LinJ.01,minus,99.192,7,624,35933,35316,0.000000e+00,1114.0,619,988,100,278267,3,2,2,618
37,Seq_LinJ.01_plus_24093-25080,LinJ.14,plus,70.105,98,563,558002,558454,6.570000e-09,62.1,475,988,53,656132,111,23,31,466
38,Seq_LinJ.01_plus_24093-25080,LinJ.14,plus,69.895,98,563,584400,584853,8.500000e-08,58.4,475,988,53,656132,113,23,30,466
3,Seq_LinJ.01_plus_24093-25080,LinJ.01,plus,98.441,30,477,75923,76366,0.000000e+00,785.0,449,988,100,278267,1,5,6,448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,Seq_LinJ.01_plus_24093-25080,LinJ.24,minus,100.000,628,644,705341,705325,5.200000e+00,32.5,17,988,55,863815,0,0,0,17
261,Seq_LinJ.01_plus_24093-25080,LinJ.24,minus,100.000,628,644,715899,715883,5.200000e+00,32.5,17,988,55,863815,0,0,0,17
262,Seq_LinJ.01_plus_24093-25080,LinJ.24,minus,100.000,628,644,730052,730036,5.200000e+00,32.5,17,988,55,863815,0,0,0,17
263,Seq_LinJ.01_plus_24093-25080,LinJ.24,minus,100.000,628,644,747350,747334,5.200000e+00,32.5,17,988,55,863815,0,0,0,17


In [116]:
# Check sequences between 1400 and 1500
df_main[(df_main["length"] >= 1400) & (df_main["length"] <= 1500)].head()

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
28,LinJ.02,335887,337318,plus,ACACGCGCAAGTCCATCATTATACCACACATTCCACTGAACCCCCA...,1432
159,LinJ.08,445073,446504,plus,AAGTCGGCCATCCCAACACGGCGCTACAGCCCAGTGCCCGGCGTTT...,1432
168,LinJ.09,41734,43149,plus,CTGTGTTTATTATGAGGGCAGCTGACTCGGCGTTGTACCCGGTGTG...,1416
213,LinJ.10,189581,191014,plus,TCGACCATCTGCTGCGCCGCGTCCTTCTGGGCCCTGCCGCTGGTGA...,1434
218,LinJ.10,264127,265602,plus,ACGCGCAAGTCCATCATTACCACATTCCATTGATCCCCGACTTCGC...,1476


In [117]:
test_1476_df = gff_viewer_repeated(main_df=df_main, 
                    slice_num=218, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 10 iterations
test_1476_df["qlength"] = abs(test_1476_df["qstart"] - test_1476_df["qend"]) + 1
test_1476_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 218 with length 1476
Creating folder ./test_no_filter/Seq_LinJ.10_plus_264127-265602_size-1476
	- Start coor: 264127
 	- End coor: 265602
 	- chr name: LinJ.10
Round 1 with Q2 = 0.17
Round 2 with Q2 = 1.6200000000000002e-21
Round 3 with Q2 = 2.44e-94
Round 4 with Q2 = 3.0699999999999997e-113
Round 5 with Q2 = 5.3500000885e-133
Round 6 with Q2 = 3.1600000000241e-145
Round 7 with Q2 = 3.67e-167
Round 8 with Q2 = 1.835000392e-167
Round 9 with Q2 = 7.84e-174
Round 10 with Q2 = 7.84e-174


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.10_plus_264127-265602,LinJ.34,minus,81.188,36,811,519849,519054,7.840000e-174,610.0,808,1476,84,1852095,108,30,44,776
11,Seq_LinJ.10_plus_264127-265602,LinJ.33,minus,83.113,152,803,93749,93076,3.670000e-167,588.0,681,1476,69,1532318,79,19,36,652
14,Seq_LinJ.10_plus_264127-265602,LinJ.32,minus,81.213,156,803,497365,496696,6.320000e-145,514.0,676,1476,92,1544760,93,18,34,648
26,Seq_LinJ.10_plus_264127-265602,LinJ.29,plus,81.918,98,708,556678,557304,1.060000e-142,507.0,636,1476,47,1272419,81,17,34,611
27,Seq_LinJ.10_plus_264127-265602,LinJ.29,plus,81.117,98,681,585573,586174,3.010000e-128,459.0,609,1476,47,1272419,83,16,32,584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,Seq_LinJ.10_plus_264127-265602,LinJ.31,plus,100.000,1432,1448,893534,893550,7.800000e+00,32.5,17,1476,9,1529246,0,0,0,17
118,Seq_LinJ.10_plus_264127-265602,LinJ.23,plus,100.000,1367,1383,533729,533745,7.800000e+00,32.5,17,1476,5,786683,0,0,0,17
58,Seq_LinJ.10_plus_264127-265602,LinJ.27,plus,100.000,1301,1317,889787,889803,7.800000e+00,32.5,17,1476,25,1175412,0,0,0,17
54,Seq_LinJ.10_plus_264127-265602,LinJ.20,plus,100.000,137,153,94179,94195,7.800000e+00,32.5,17,1476,32,731244,0,0,0,17


In [118]:
test_1476_df = gff_viewer_repeated(main_df=df_main, 
                    slice_num=218, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 10 iterations
test_1476_df["qlength"] = abs(test_1476_df["qstart"] - test_1476_df["qend"]) + 1
test_1476_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 218 with length 1476
Creating folder ./test_no_filter/Seq_LinJ.10_plus_264127-265602_size-1476
	- Start coor: 264127
 	- End coor: 265602
 	- chr name: LinJ.10


Round 1 with Q2 = 0.17
Round 2 with Q2 = 1.6200000000000002e-21
Round 3 with Q2 = 2.44e-94
Round 4 with Q2 = 3.0699999999999997e-113
Round 5 with Q2 = 5.3500000885e-133
Round 6 with Q2 = 3.1600000000241e-145
Round 7 with Q2 = 3.67e-167
Round 8 with Q2 = 1.835000392e-167
Round 9 with Q2 = 7.84e-174
Round 10 with Q2 = 7.84e-174


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.10_plus_264127-265602,LinJ.34,minus,81.188,36,811,519849,519054,7.840000e-174,610.0,808,1476,84,1852095,108,30,44,776
11,Seq_LinJ.10_plus_264127-265602,LinJ.33,minus,83.113,152,803,93749,93076,3.670000e-167,588.0,681,1476,69,1532318,79,19,36,652
14,Seq_LinJ.10_plus_264127-265602,LinJ.32,minus,81.213,156,803,497365,496696,6.320000e-145,514.0,676,1476,92,1544760,93,18,34,648
26,Seq_LinJ.10_plus_264127-265602,LinJ.29,plus,81.918,98,708,556678,557304,1.060000e-142,507.0,636,1476,47,1272419,81,17,34,611
27,Seq_LinJ.10_plus_264127-265602,LinJ.29,plus,81.117,98,681,585573,586174,3.010000e-128,459.0,609,1476,47,1272419,83,16,32,584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,Seq_LinJ.10_plus_264127-265602,LinJ.31,plus,100.000,1432,1448,893534,893550,7.800000e+00,32.5,17,1476,9,1529246,0,0,0,17
118,Seq_LinJ.10_plus_264127-265602,LinJ.23,plus,100.000,1367,1383,533729,533745,7.800000e+00,32.5,17,1476,5,786683,0,0,0,17
58,Seq_LinJ.10_plus_264127-265602,LinJ.27,plus,100.000,1301,1317,889787,889803,7.800000e+00,32.5,17,1476,25,1175412,0,0,0,17
54,Seq_LinJ.10_plus_264127-265602,LinJ.20,plus,100.000,137,153,94179,94195,7.800000e+00,32.5,17,1476,32,731244,0,0,0,17


let's check it with a small sequence

In [119]:
df_main[df_main["length"] < 200]

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
31,LinJ.03,237552,237702,plus,GTGCGGGGGAGCCAGGCAGCCCACCCACCCACCCATCCCCTATCCC...,151
77,LinJ.06,102999,103131,plus,GCTCCCGGTGTCCGCAGGACAGGTCTGGACTGGCGGTATGCCGAAG...,133
95,LinJ.06,513017,513205,plus,CCCCACCCGGCCTGTAGCACGCCCCATCGGCTGCTGCAAAGCAGCC...,189
120,LinJ.07,464303,464453,plus,GGGGGAGGCCAAGCGGCCTGCAGCCGGCCCTTGGGCGCGACTGCGG...,151
130,LinJ.07,592720,592866,plus,TGGGGAAGGGTTTTTCTCTCCACTGATTCCTTGCACAGCGACACCC...,147
...,...,...,...,...,...,...
1869,LinJ.35,1074531,1074700,plus,CCTGTCACAGACCCCATCGCGTGGTGCGAGGCAGCCGTGGCCGCAC...,170
1937,LinJ.36,11081,11196,plus,TCTTCTCGGGCGAATACCGTCTGCCGTCGAATCGGCTCAACGCGTC...,116
1940,LinJ.36,25331,25446,plus,TCTTCTCGGGCGAATACCGTCTGCCGTCGAATCGGCTCAACGCGTC...,116
1962,LinJ.36,319917,320017,plus,GCCCCCTGATGACGAGGGAGCATGCCCGTGCGTGGTATCACGGGGC...,101


In [120]:
test_198_df = gff_viewer_repeated(main_df=df_main, 
                    slice_num=2009, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 5 iterations
test_198_df["qlength"] = abs(test_198_df["qstart"] - test_198_df["qend"]) + 1
test_198_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 2009 with length 198
Creating folder ./test_no_filter/Seq_LinJ.36_plus_1063558-1063755_size-198
	- Start coor: 1063558
 	- End coor: 1063755
 	- chr name: LinJ.36
Round 1 with Q2 = 0.000433
Round 2 with Q2 = 5.56e-08
Round 3 with Q2 = 9.23e-11
Round 4 with Q2 = 1.53e-13
Round 5 with Q2 = 9.17e-16
Round 6 with Q2 = 5.48e-18
Round 7 with Q2 = 4.24e-19
Round 8 with Q2 = 4.902e-21
Round 9 with Q2 = 9.040000000000001e-26
Round 10 with Q2 = 9.040000000000001e-26


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
79,Seq_LinJ.36_plus_1063558-1063755,LinJ.31,minus,72.906,1,198,182404,182207,1.190000e-09,62.1,203,198,100,1529246,45,10,10,198
78,Seq_LinJ.36_plus_1063558-1063755,LinJ.31,plus,73.684,2,198,1084907,1085112,2.570000e-11,67.6,209,198,100,1529246,40,13,15,197
10,Seq_LinJ.36_plus_1063558-1063755,LinJ.36,plus,74.000,6,198,83494,83687,7.140000e-12,69.4,200,198,100,2743073,39,12,13,193
247,Seq_LinJ.36_plus_1063558-1063755,LinJ.02,minus,74.384,1,192,314777,314576,4.270000e-14,76.8,203,198,97,356299,40,11,12,192
248,Seq_LinJ.36_plus_1063558-1063755,LinJ.02,minus,74.384,1,192,344218,344017,4.270000e-14,76.8,203,198,97,356299,40,11,12,192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,Seq_LinJ.36_plus_1063558-1063755,LinJ.20,plus,100.000,90,105,385223,385238,3.400000e+00,30.7,16,198,91,731244,0,0,0,16
387,Seq_LinJ.36_plus_1063558-1063755,LinJ.28,plus,100.000,93,108,732704,732719,3.400000e+00,30.7,16,198,89,1205031,0,0,0,16
193,Seq_LinJ.36_plus_1063558-1063755,LinJ.15,plus,100.000,84,99,12562,12577,3.400000e+00,30.7,16,198,61,650324,0,0,0,16
149,Seq_LinJ.36_plus_1063558-1063755,LinJ.34,plus,100.000,154,169,1108723,1108738,3.400000e+00,30.7,16,198,96,1852095,0,0,0,16


### Test sequence big

In [121]:
long_seqs_df = long_seqs.copy()
long_seqs_df

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
0,LinJ.01,1,1000,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...,1000
1,LinJ.01,24093,25080,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...,988
2,LinJ.01,35371,36297,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
4,LinJ.01,54983,55909,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...,927
10,LinJ.01,145322,146653,plus,ATCCCCACGGGGTGGTGAGGCGAGAAGTCAGGGGTCGGGCACGCGC...,1332
...,...,...,...,...,...,...
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
2075,LinJ.36,2504116,2505057,plus,TGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCTGCC...,942
2076,LinJ.36,2533606,2534550,plus,CAACGTGTCCATTCTCTGCCACACAACATGAGCTAAGCTCTACTCT...,945
2081,LinJ.36,2600817,2601884,plus,GCGCATGCCGAGCACCGCTGGCATGTGGTGTGCCGCATCCGACCGA...,1068


In [122]:
long_seqs_df[long_seqs_df["length"] > 1500].sort_values(by="length", ascending=False)

Unnamed: 0,sseqid,sstart,send,sstrand,sseq,length
1709,LinJ.34,813947,824492,plus,GGGACGGATAGACGGGGAGAGACATGCGGGGGAGGCAGTGGTGACG...,10546
1708,LinJ.34,808220,813854,plus,ACGACGCAGCACAGCGGACGAGGAGAGAGGAGGCGGTAGGCAGGAG...,5635
610,LinJ.19,608398,612636,plus,GCTACTGCTATGTCGGCGCTTAGGCCGTGGGTGGGAGCTGCATTGG...,4239
2074,LinJ.36,2498408,2502567,plus,ACAACAAAACTGACGCTATTGAAAGCGGCTCTCGAGAAGCTTTCCT...,4160
1749,LinJ.34,1242905,1247000,plus,GACGCGCGCCTCCCCTCGACCCCCGCTGTCGCACACGGCATGCGCG...,4096
...,...,...,...,...,...,...
1934,LinJ.35,1995790,1997317,plus,CAGCCTCGTCGGTGGTGCTGGCGTGTGTCGTGGTTTTATTGCCGCT...,1528
1271,LinJ.30,497129,498641,plus,CTACGGAGGCCCTGCACCACGCCCTTCAAACCGCTCACGATGGGCG...,1513
1268,LinJ.30,490955,492467,plus,CTACGGAGGCCCTGCACCACGCCCTTCAAACCGCTCACGATGGGCG...,1513
1807,LinJ.35,114478,115984,plus,TTCTTCGCTTTTCGCTCTTCCTCTCTCGCCGTGATGATGCCGGCCA...,1507


In [123]:
test_10546_df = gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1709, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 5 iterations
test_10546_df["qlength"] = abs(test_10546_df["qstart"] - test_10546_df["qend"]) + 1
test_10546_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 1709 with length 10546
Creating folder ./test_no_filter/Seq_LinJ.34_plus_813947-824492_size-10546
	- Start coor: 813947
 	- End coor: 824492
 	- chr name: LinJ.34
Round 1 with Q2 = 0.026
Round 2 with Q2 = 4.29e-10
Round 3 with Q2 = 2.4400000000000003e-47
Round 4 with Q2 = 1.4499999999999998e-54
Round 5 with Q2 = 0.0
Round 6 with Q2 = 0.0
Round 7 with Q2 = 0.0
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.34_plus_813947-824492,LinJ.34,plus,97.977,1,10546,811193,821726,0.0,18279.0,10576,10546,100,1852095,142,55,72,10546
2,Seq_LinJ.34_plus_813947-824492,LinJ.34,plus,96.133,1,10546,808439,818958,0.0,17180.0,10603,10546,100,1852095,270,106,140,10546
3,Seq_LinJ.34_plus_813947-824492,LinJ.34,plus,99.620,1,8418,816714,825134,0.0,15370.0,8423,10546,100,1852095,25,7,7,8418
4,Seq_LinJ.34_plus_813947-824492,LinJ.34,plus,94.922,2549,10546,808220,816191,0.0,12480.0,8054,10546,100,1852095,271,104,138,7998
5,Seq_LinJ.34_plus_813947-824492,LinJ.34,plus,99.523,1,5651,819481,825134,0.0,10288.0,5655,10546,100,1852095,22,5,5,5651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,Seq_LinJ.34_plus_813947-824492,LinJ.11,minus,100.000,9868,9886,401292,401274,4.4,36.2,19,10546,7,568611,0,0,0,19
1436,Seq_LinJ.34_plus_813947-824492,LinJ.35,plus,100.000,1142,1160,186153,186171,4.4,36.2,19,10546,11,2019672,0,0,0,19
1437,Seq_LinJ.34_plus_813947-824492,LinJ.35,plus,100.000,3909,3927,186153,186171,4.4,36.2,19,10546,11,2019672,0,0,0,19
1438,Seq_LinJ.34_plus_813947-824492,LinJ.35,plus,100.000,6676,6694,186153,186171,4.4,36.2,19,10546,11,2019672,0,0,0,19


In [124]:
test_4096_df = gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1749, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 5 iterations
test_4096_df["qlength"] = abs(test_4096_df["qstart"] - test_4096_df["qend"]) + 1
test_4096_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 1749 with length 4096
Creating folder ./test_no_filter/Seq_LinJ.34_plus_1242905-1247000_size-4096
	- Start coor: 1242905
 	- End coor: 1247000
 	- chr name: LinJ.34
Round 1 with Q2 = 0.13
Round 2 with Q2 = 0.01
Round 3 with Q2 = 1.67e-05
Round 4 with Q2 = 7.67e-14
Round 5 with Q2 = 9.78e-23
Round 6 with Q2 = 2.381e-41
Round 7 with Q2 = 2.075000443e-86
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.34_plus_1242905-1247000,LinJ.34,plus,99.690,2161,4096,1262712,1264649,0.0,3544.0,1938,4096,100,1852095,4,1,2,1936
3,Seq_LinJ.34_plus_1242905-1247000,LinJ.34,plus,98.923,2161,4096,1256825,1258774,0.0,3472.0,1950,4096,100,1852095,7,1,14,1936
2,Seq_LinJ.34_plus_1242905-1247000,LinJ.34,plus,98.974,2161,4096,1250938,1252887,0.0,3478.0,1950,4096,100,1852095,6,1,14,1936
4,Seq_LinJ.34_plus_1242905-1247000,LinJ.34,plus,93.712,2161,4090,1268612,1270546,0.0,2887.0,1956,4096,100,1852095,76,25,47,1930
5,Seq_LinJ.34_plus_1242905-1247000,LinJ.34,plus,93.514,2161,4090,1274552,1276488,0.0,2867.0,1958,4096,100,1852095,78,27,49,1930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,Seq_LinJ.34_plus_1242905-1247000,LinJ.36,plus,100.000,382,399,497704,497721,6.1,34.4,18,4096,11,2743073,0,0,0,18
588,Seq_LinJ.34_plus_1242905-1247000,LinJ.35,minus,100.000,380,397,108137,108120,6.1,34.4,18,4096,9,2019672,0,0,0,18
591,Seq_LinJ.34_plus_1242905-1247000,LinJ.35,minus,100.000,380,397,388247,388230,6.1,34.4,18,4096,9,2019672,0,0,0,18
592,Seq_LinJ.34_plus_1242905-1247000,LinJ.35,plus,100.000,381,398,400578,400595,6.1,34.4,18,4096,9,2019672,0,0,0,18


In [125]:
test_1513_df = gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1271, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 5 iterations
test_1513_df["qlength"] = abs(test_1513_df["qstart"] - test_1513_df["qend"]) + 1
test_1513_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 1271 with length 1513
Creating folder ./test_no_filter/Seq_LinJ.30_plus_497129-498641_size-1513
	- Start coor: 497129
 	- End coor: 498641
 	- chr name: LinJ.30
Round 1 with Q2 = 0.62
Round 2 with Q2 = 2.82e-09
Round 3 with Q2 = 4.9450000000000005e-20
Round 4 with Q2 = 9.87e-29
Round 5 with Q2 = 7.580000000000001e-35
Round 6 with Q2 = 8.053710000000001e-47
Round 7 with Q2 = 7.9000000000168e-57
Round 8 with Q2 = 1.68e-68
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.30_plus_497129-498641,LinJ.30,plus,99.934,1,1513,490955,492467,0.000000e+00,2789.0,1513,1513,100,1353302,1,0,0,1513
15,Seq_LinJ.30_plus_497129-498641,LinJ.36,plus,71.011,409,1505,2499141,2500243,1.580000e-56,220.0,1128,1513,91,2743073,271,48,56,1097
76,Seq_LinJ.30_plus_497129-498641,LinJ.20,minus,72.201,104,603,711316,710814,9.870000e-29,128.0,518,1513,43,731244,111,23,33,500
75,Seq_LinJ.30_plus_497129-498641,LinJ.20,minus,72.201,104,603,680734,680232,9.870000e-29,128.0,518,1513,43,731244,111,23,33,500
25,Seq_LinJ.30_plus_497129-498641,LinJ.29,plus,74.468,984,1441,664773,665237,1.610000e-46,187.0,470,1513,42,1272419,103,15,17,458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,Seq_LinJ.30_plus_497129-498641,LinJ.10,plus,100.000,1001,1017,282557,282573,8.000000e+00,32.5,17,1513,3,588579,0,0,0,17
171,Seq_LinJ.30_plus_497129-498641,LinJ.04,plus,100.000,1426,1442,83114,83130,8.000000e+00,32.5,17,1513,5,466507,0,0,0,17
22,Seq_LinJ.30_plus_497129-498641,LinJ.36,minus,100.000,1486,1502,1318630,1318614,8.000000e+00,32.5,17,1513,91,2743073,0,0,0,17
92,Seq_LinJ.30_plus_497129-498641,LinJ.16,minus,100.000,1070,1086,584886,584870,8.000000e+00,32.5,17,1513,39,688184,0,0,0,17


In [126]:
test_5635_df = gff_viewer_repeated(main_df=long_seqs_df, 
                    slice_num=1708, 
                    path_folder=path_folder_test, 
                    path_genome=path_genome, 
                    repeat_num=10)  # wil do the none + 5 iterations
test_5635_df["qlength"] = abs(test_5635_df["qstart"] - test_5635_df["qend"]) + 1
test_5635_df.sort_values(by="qlength", ascending=False)

Analyzing sequence 1708 with length 5635
Creating folder ./test_no_filter/Seq_LinJ.34_plus_808220-813854_size-5635
	- Start coor: 808220
 	- End coor: 813854
 	- chr name: LinJ.34
Round 1 with Q2 = 0.64
Round 2 with Q2 = 3.83e-08
Round 3 with Q2 = 1.7400000000000003e-21
Round 4 with Q2 = 7.72e-55
Round 5 with Q2 = 1.4599999999999998e-146
Round 6 with Q2 = 5.1400000000000005e-161
Round 7 with Q2 = 0.0
Round 8 with Q2 = 0.0
Round 9 with Q2 = 0.0
Round 10 with Q2 = 0.0


Unnamed: 0,qseqid,sseqid,sstrand,pident,qstart,qend,sstart,send,evalue,bitscore,length,qlen,qcovs,slen,mismatch,gapopen,gaps,qlength
1,Seq_LinJ.34_plus_808220-813854,LinJ.34,plus,96.423,1,5635,810974,816621,0.0,9293.0,5675,5635,100,1852095,136,52,67,5635
3,Seq_LinJ.34_plus_808220-813854,LinJ.34,plus,92.948,1,5635,816495,822155,0.0,8198.0,5715,5635,100,1852095,269,108,134,5635
2,Seq_LinJ.34_plus_808220-813854,LinJ.34,plus,93.001,1,5635,813728,819388,0.0,8215.0,5715,5635,100,1852095,266,108,134,5635
4,Seq_LinJ.34_plus_808220-813854,LinJ.34,plus,92.913,1,5635,819262,824922,0.0,8187.0,5715,5635,100,1852095,271,108,134,5635
6,Seq_LinJ.34_plus_808220-813854,LinJ.34,plus,93.042,1,3090,822029,825134,0.0,4514.0,3133,5635,100,1852095,148,55,70,3090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,Seq_LinJ.34_plus_808220-813854,LinJ.25,plus,100.000,1575,1592,107516,107533,8.3,34.4,18,5635,6,895083,0,0,0,18
1039,Seq_LinJ.34_plus_808220-813854,LinJ.06,plus,100.000,3144,3161,326144,326161,8.3,34.4,18,5635,7,525234,0,0,0,18
1349,Seq_LinJ.34_plus_808220-813854,LinJ.27,minus,100.000,1806,1823,381405,381388,8.3,34.4,18,5635,8,1175412,0,0,0,18
1348,Seq_LinJ.34_plus_808220-813854,LinJ.27,minus,100.000,4560,4577,381405,381388,8.3,34.4,18,5635,8,1175412,0,0,0,18
