# Save/Retrieve workspace

In [1]:
# Pickle file used by the Save / Retrieve cells below to store this notebook's workspace variables
path_workspace_db = "./save_workspace/4.Recatch_in_negative_filtered.pkl"

## Save

In [None]:
#  # To save the workspace
# import pickle

# # Example dictionary of variables to save
# variables_to_save = {
#     'var1': both_seqs,  # The list of SeqRecord objects
#     'var2': matches,  # The pandas DataFrame
#     'var3': neg_data,  # The pandas DataFrame
# }

# # Save the dictionary to a file
# with open(path_workspace_db, 'wb') as f:
#     pickle.dump(variables_to_save, f)

## Retrieve

In [2]:
# Restore the workspace variables from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only open workspace files you created yourself.
import pickle

with open(path_workspace_db, 'rb') as f:
    loaded_variables = pickle.load(f)

# Unpack the stored entries back into their notebook names
both_seqs, matches, neg_data = (
    loaded_variables[key] for key in ('var1', 'var2', 'var3')
)

# Code

In [20]:
# needed modules
import subprocess
import os
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [21]:
# Read the negative data from LmSIDER2A|B and Consensus
neg_data = pd.read_csv(
    "./compare/consensus+LmSIDERs_vs_TP/negatives_testing_elements.csv",
    sep=",",
    header=0,
)
# Quick sanity check: dimensions first, then column dtypes
print(neg_data.shape, neg_data.dtypes, sep="\n")
neg_data.head()

(714, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,113760,114388,plus,CAGCGCCATGCACGACATGGCCGCTGACGTCCGTAGCCCTAACTCG...
1,LinJ.01,261866,262439,plus,CGGACTTGGCAAGTGGCCGCCATCGATGAAAACGCACCATGCCTTT...
2,LinJ.01,272496,275901,plus,GTCAGAACAACGAAGCCGCCCCCCATCACTGCCTCCCTCTACGTAC...
3,LinJ.02,95421,95743,plus,TCCGCGATCCGTGCAGTTGGCGCCGGCCCCTCCTTCACTGCCGATG...
4,LinJ.02,97368,97489,plus,TCGAGCTCAGCTCTGGGGCACGGTTTGGCGGGCGTGGAAGTGGGCC...


In [33]:
# Input FASTA files (LmSIDER2a, LmSIDER2b and the consensus sequences) and the combined output
path_LmSIDER2A = "../0.Data/others/LmSIDER2a.fasta"
path_LmSIDER2B = "../0.Data/others/LmSIDER2b.fasta"
path_consensus = "./meme_motifs.fasta"
path_LmSIDER2A_and_B = "./data/LmSIDER2A+B+Consensus.fasta"  # output file: the three inputs written together

In [34]:
# Open the three input FASTA files and merge all their records into a single file
seq1, seq2, seq3 = (
    SeqIO.parse(fasta_path, "fasta")
    for fasta_path in (path_LmSIDER2A, path_LmSIDER2B, path_consensus)
)
both_seqs = [*seq1, *seq2, *seq3]
# SeqIO.write returns the number of records written (shown as the cell output)
SeqIO.write(both_seqs, path_LmSIDER2A_and_B, "fasta")

5

In [35]:
# Get a fasta file from the negative data
def fasta_creator(data, description_text, fasta_output_path):
    """Write every row of a DataFrame as one record of a FASTA file.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "sseq" (the sequence string) and "sseqid".
    description_text : str
        Description attached to every record.
    fasta_output_path : str
        Path of the FASTA file to write.

    Each record id has the form "Seq_<row index>_<sseqid>"; the row index is
    the DataFrame index label as yielded by ``iterrows``.
    """
    records = []
    for index, row in data.iterrows():
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is only accepted from Python 3.12 on (SyntaxError before).
        record_id = f"Seq_{index}_{row['sseqid']}"
        records.append(
            SeqRecord(Seq(row["sseq"]), id=record_id, description=description_text)
        )
    SeqIO.write(records, fasta_output_path, "fasta")

In [36]:
# Export the negative elements to a FASTA file
fasta_creator(
    neg_data,
    "Linf NegData from LmSIDER2A|B and Consensus",
    "./data/consensus+LmSIDER2A+B/negatives_testing_elements.fasta",
)

In [37]:
# Prepare blastn functions (no e-value cutoff is passed to blastn)
def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database with ``makeblastdb``.

    Parameters
    ----------
    path_input : str
        FASTA file to index.
    path_output : str
        Value passed to ``-out`` (the database name/prefix).

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status; its stderr text is
        available on the exception.
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    """
    # The command is passed as an argument list (no shell), so paths containing
    # spaces or shell metacharacters are handed over verbatim.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",  # keep the sequence IDs in the output
        "-out", str(path_output),
    ]
    # stdout stays silenced; stderr is captured so a failure is reported
    # instead of being silently ignored.
    subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run ``blastn`` and return its hits as a DataFrame.

    Parameters
    ----------
    query_path : str
        FASTA file with the query sequences.
    dict_path : str
        BLAST database passed to ``-db``.
    perc_identity : int or float
        Value passed to ``-perc_identity``.

    Returns
    -------
    pandas.DataFrame
        One row per hit with the columns qseqid, sseqid, pident, length,
        qstart, qend, sstart, send, evalue, bitscore, qlen, slen. All columns
        except the two ids are converted to numeric. When blastn reports no
        hits an empty frame with these same columns is returned.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qstart", "qend",
               "sstart", "send", "evalue", "bitscore", "qlen", "slen"]
    numeric_columns = columns[2:]

    # Argument list instead of a shell string: paths are passed verbatim and
    # the -outfmt specification travels as one single argument.
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10 " + " ".join(columns),  # 10 = comma-separated values
    ]
    output = subprocess.check_output(cmd, universal_newlines=True)

    rows = [line.split(",") for line in output.split("\n") if line]
    # Passing the names to the constructor keeps the schema even with no rows;
    # assigning `.columns` afterwards fails on an empty result.
    data = pd.DataFrame(rows, columns=columns)
    if rows:
        data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
    return data

In [38]:
# Create the BLAST database for the negative data; the same path is used as
# input FASTA and as database prefix.
# NOTE(review): this FASTA sits under "neg_data_dict/", while the fasta_creator
# call in this notebook writes to the parent directory - confirm the file is
# copied there before running this cell.
path_neg_data = "./data/consensus+LmSIDER2A+B/neg_data_dict/negatives_testing_elements.fasta"
blastn_dic(path_neg_data, path_neg_data)

In [40]:
# BLAST the combined LmSIDER2A/B + consensus sequences against the negative-data database
matches = blastn_blaster(
    "./data/LmSIDER2A+B+Consensus.fasta",
    path_neg_data,
    perc_identity=60,
)
# Quick sanity check: dimensions first, then column dtypes
print(matches.shape, matches.dtypes, sep="\n")
matches.head()

(101, 12)
qseqid       object
sseqid       object
pident      float64
length        int64
qstart        int64
qend          int64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen          int64
slen          int64
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
0,LmSIDER2a,Seq_646_LinJ.35,90.476,42,3,42,266,225,1.08e-09,54.7,79,677
1,LmSIDER2a,Seq_681_LinJ.36,80.0,65,14,78,200,139,6.47e-07,45.4,79,610
2,LmSIDER2a,Seq_294_LinJ.25,78.571,70,1,68,1,70,6.47e-07,45.4,79,139
3,LmSIDER2a,Seq_400_LinJ.31,79.31,58,6,63,345,292,0.000108,38.1,79,669
4,LmSIDER2a,Seq_340_LinJ.28,78.689,61,14,74,15,72,0.000108,38.1,79,686


In [47]:
# Save the BLAST hits as CSV (header row, no index column)
matches.to_csv(
    "./data/consensus+LmSIDER2A+B/negatives_testing_elements_matches_with_LmSIDER2A+B+consensus.csv",
    sep=",",
    index=False,
    header=True,
)

In [46]:
# Number of distinct subject sequences (sseqid) that received at least one hit
matches["sseqid"].nunique()

69