In [1]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqUtils import six_frame_translations
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
import os

# Define the path to the input FASTA file

dir_r = "./data/raw/"
dir_r_cds ="./data/raw/CDS"
dir_r_aa = "./data/raw/AA"
dir_p = "./data/processed/"
dir_f = "./data/final/"




In [12]:
import csv
import pandas as pd 
import re

secretome_acc = os.path.join(dir_r,"secretome_acc.csv")

# Read the sequence names from the secretome CSV file
sequence_names_locus = []
with open(secretome_acc, "r") as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        # Extract the locus tag using a regular expression
        match = re.match(r'^([^\-]+)', row[0])
        locus_tag = match.group(1) if match else None
        if locus_tag:
            sequence_names_locus.append(locus_tag)
        

In [14]:
#First, extract locus tag from all CDS in Oryzae, then match those with the ones in the secretome and add to new fasta file


def extract_locus_tag_from_header(header):
    match = re.search(r'\[locus_tag=([^\]]+)]', header)
    return match.group(1) if match else None

secret_hits_cds = os.path.join(dir_p,"secret_hits_dna.fasta")
secret_hits_aa = os.path.join(dir_p,"secret_hits_aa.fasta")
                            
# Extract sequences from the FASTA file for DNA
cds_fasta = os.path.join(dir_r_cds, "aoryz_cds.fna")
sequences = []
with open(cds_fasta, "r") as file:
    fasta_data = file.read().split(">")[1:]
    for entry in fasta_data:
        entry_lines = entry.strip().split("\n")
        header = entry_lines[0]
        sequence = "".join(entry_lines[1:])
        sequence_name_locus = extract_locus_tag_from_header(header)
        if sequence_name_locus in sequence_names_locus:
            sequences.append(f">{header}\n{sequence}")

# Save the extracted AA sequences to a new FASTA file
with open(secret_hits_cds, "w") as file:
    file.write("\n".join(sequences))

#Take new CDS fasta file of all secretome hits and extract protein sequence from NCBI database
            
def extract_protein_id_from_header(header):
    match = re.search(r'\[protein_id=([^\]]+)]', header)
    return match.group(1) if match else None

# Extract sequences from the FASTA file for Translated Proteins
output_fa_file = os.path.join(dir_r_aa, "aoryz_aa.faa")
sequences_AA = []
with open(output_fa_file, "r") as file:
    fasta_data = file.read().split(">")[1:]
    for entry in fasta_data:
        entry_lines = entry.strip().split("\n")
        header = entry_lines[0]
        sequence = "".join(entry_lines[1:])
        sequence_name_protein = extract_protein_id_from_header(header)
        if sequence_name_protein  in sequence_names_protein :
            sequences_AA.append(f">{header}\n{sequence}")

# Save the extracted AA sequences to a new FASTA file
with open(secret_hits_aa, "w") as file:
    file.write("\n".join(sequences_AA))

In [15]:
sequences_AA

[]

In [None]:
# Extract sequences from the FASTA file for Translated Proteins
output_fa_file = os.path.join(dir_r_aa, "aoryz_aa.faa")
sequences_AA = []
with open(output_fa_file, "r") as file:
    fasta_data = file.read().split(">")[1:]
    for entry in fasta_data:
        entry_lines = entry.strip().split("\n")
        header = entry_lines[0]
        sequence = "".join(entry_lines[1:])
        sequence_name = header.split()[0]
        if sequence_name in sequence_names:
            sequences_AA.append(f">{header}\n{sequence}")

# Save the extracted AA sequences to a new FASTA file
with open(cds_hits_aa, "w") as file:
    file.write("\n".join(sequences_AA))

In [18]:
#Abels Idiotic Niger Knockouts

from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqUtils import six_frame_translations
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
import os

# Define the path to the input FASTA file

dir_r = "./data/raw/"
dir_r_cds ="./data/raw/CDS"
dir_r_aa = "./data/raw/AA"
dir_p = "./data/processed/"
dir_f = "./data/final/"

import subprocess

# Defining HMM and target to analyse

HMM = os.path.join(dir_p, "Lipase.hmm")
#Target = os.path.join(dir_r_aa, "anige_aa.faa")
Target = os.path.join(dir_r, "combined_sequences_aa.fasta")
output_hits_file = os.path.join(dir_f, "niger_lipases.csv")


command = ['hmmsearch', '-o', output_hits_file, HMM, Target]


# run the command and capture the output
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()