In [3]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqUtils import six_frame_translations
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
import os

# Define the path to the input FASTA file

dir_r = "./data/raw/"
dir_r_cds ="./data/raw/CDS"
dir_r_aa = "./data/raw/AA"
dir_p = "./data/processed/"
dir_f = "./data/final/"

# Create an empty list to hold SeqRecord objects
combined_records = []

# Iterate through all files in the directory
for filename in os.listdir(dir_r_cds):
    if filename.endswith(".fna"):
        filepath = os.path.join(dir_r_cds, filename)
        # Read each file and append the sequences to the combined_records list
        with open(filepath, "r") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
            combined_records.extend(records)

# Create a combined fasta file with all sequences
fasta_file = os.path.join(dir_r, "combined_sequences_cds.fasta")
SeqIO.write(combined_records, fasta_file, "fasta")

93097

In [2]:
#AA Combined Sequence list

# Create an empty list to hold SeqRecord objects
combined_records = []

# Iterate through all files in the directory
for filename in os.listdir(dir_r_aa):
    if filename.endswith(".faa"):
        filepath = os.path.join(dir_r_aa, filename)
        # Read each file and append the sequences to the combined_records list
        with open(filepath, "r") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
            combined_records.extend(records)

# Create a combined fasta file with all sequences
fasta_file = os.path.join(dir_r, "combined_sequences_aa.fasta")
SeqIO.write(combined_records, fasta_file, "fasta")

93085

In [4]:
import subprocess

# Defining HMM and target to analyse

HMM = os.path.join(dir_p, "enzyme.hmm")
Target = os.path.join(dir_r, "combined_sequences_aa.fasta")
output_hits_file = os.path.join(dir_f, "enzymehits.csv")

command = ['hmmsearch', '-o', output_hits_file, HMM, Target]


# run the command and capture the output
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()

In [5]:
#Format data as slightly more readable csv file

import csv
import pandas as pd 
import re

output_hits_file = os.path.join(dir_f, "enzymehits.csv")
output_extr_file = os.path.join(dir_p, "extracted_data.csv")

# Open the output file and read its contents
with open(output_hits_file, "r") as file:
    csv_reader = csv.reader(file, delimiter="\t")
    rows = list(csv_reader)

# Extract headers from row 13
headers = ''.join(rows[13]).split()

# Extract data from rows 15 onwards until an empty row is encountered
data = []
row_index = 15
while row_index < len(rows) and rows[row_index]:
    data.append(''.join(rows[row_index]).split())
    row_index += 1

# Write the extracted data to a new CSV file
with open(output_extr_file, "w", newline="") as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(headers)  # Write headers
    csv_writer.writerows(data)  # Write data rows





In [6]:
#Extraction of hits from CDS data based on names of top rankers in HMM search

def extract_protein_id_from_header(header):
    match = re.search(r'\[protein_id=([^\]]+)]', header)
    return match.group(1) if match else None

cds_hits_aa = os.path.join(dir_p, "CDS_hits_aa.fasta")

# Read the sequence names from the CSV file with ranking
sequence_names = []
with open(output_extr_file, "r") as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if len(row) > 8:  # Ensure that the row has at least 9 elements
            sequence_names.append(row[8])
        else:
            print(f"Issue with row: {row}")

# Extract sequences from the FASTA file for Translated Proteins
output_fa_file = os.path.join(dir_r, "combined_sequences_aa.fasta")
sequences_AA = []

# Iterate over the sequence names in the order of ranking
for sequence_name in sequence_names:
    with open(output_fa_file, "r") as file:
        fasta_data = file.read().split(">")[1:]
        for entry in fasta_data:
            entry_lines = entry.strip().split("\n")
            header = entry_lines[0]
            sequence = "".join(entry_lines[1:])
            current_sequence_name = header.split()[0]
            if current_sequence_name == sequence_name:
                sequences_AA.append(f">{header}\n{sequence}")
                break  # Break once the sequence is found

# Save the extracted AA sequences to a new FASTA file
with open(cds_hits_aa, "w") as file:
    file.write("\n".join(sequences_AA))


Issue with row: ['------', 'inclusion', 'threshold', '------']


In [5]:
from Bio import SeqIO
from Bio import pairwise2
import os

def calculate_identity(seq1, seq2):
    alignments = pairwise2.align.localxx(seq1, seq2, one_alignment_only=True)
    if not alignments:
        return 0.0  # No alignment found, consider them different
    best_alignment = alignments[0]
    identity = (best_alignment[2] / max(len(seq1), len(seq2))) * 100
    return identity

def filter_sequences(fasta_file, identity_threshold):
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    filtered_sequences = []

    for i, seq1 in enumerate(sequences):
        is_unique = True
        for j, seq2 in enumerate(sequences):
            if i != j:
                identity = calculate_identity(seq1.seq, seq2.seq)
                if identity >= identity_threshold:
                    # Compare sequence identifiers
                    if seq1.id > seq2.id: #If Seq1 is a worse hit than Seq2, e.g. rather take Seq2 to filtered_sequences in later iteration
                        is_unique = False
                        break

        if is_unique:
            filtered_sequences.append(seq1)

    return filtered_sequences

if __name__ == "__main__":
    input_fasta_file = os.path.join(dir_p,"CDS_hits_aa.fasta")
    output_fasta_file = os.path.join(dir_f,"filtered_hits_aa.fasta")
    identity_threshold = 80

    filtered_sequences = filter_sequences(input_fasta_file, identity_threshold)

    SeqIO.write(filtered_sequences, output_fasta_file, "fasta")

    num_input_sequences = len(list(SeqIO.parse(input_fasta_file, "fasta")))
    num_removed_sequences = num_input_sequences - len(filtered_sequences)
    print(f"{num_removed_sequences} protein sequences were removed.")



KeyboardInterrupt: 

In [10]:
from Bio import SeqIO
from Bio import pairwise2
import os

def calculate_identity(seq1, seq2):
    alignments = pairwise2.align.localxx(seq1, seq2, one_alignment_only=True)
    if not alignments:
        return 0.0  # No alignment found, consider them different
    best_alignment = alignments[0]
    identity = (best_alignment[2] / max(len(seq1), len(seq2))) * 100
    return identity

def filter_sequences(fasta_file, identity_threshold):
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    filtered_sequences = []

    fnon_unique_indices = set()

    for i, seq1 in enumerate(sequences):
    # Skip sequences already deemed non-unique
        if i in non_unique_indices:
            continue

        is_unique = True

        for j, seq2 in enumerate(sequences):
            if i != j and j not in non_unique_indices:  # Skip comparisons with self and already non-unique sequences
                identity = calculate_identity(seq1.seq, seq2.seq)

                if identity >= identity_threshold:
                    # Compare sequence identifiers
                    if seq1.id > seq2.id: #If Seq1 is a worse hit than Seq2
                        is_unique = False
                        non_unique_indices.add(i)  # Mark seq2 as non-unique
                    else: #If Seq2 is a worse hit than Seq1
                        non_unique_indices.add(j)  # Mark seq1 as non-unique
                        break  # No need to continue comparing seq1 with other sequences

        if is_unique:
            filtered_sequences.append(seq1)


    return filtered_sequences

if __name__ == "__main__":
    input_fasta_file = os.path.join(dir_p,"CDS_hits_aa.fasta")
    output_fasta_file = os.path.join(dir_f,"filtered_hits_aa.fasta")
    identity_threshold = 80
    non_unique_indices = set()

    filtered_sequences = filter_sequences(input_fasta_file, identity_threshold)

    SeqIO.write(filtered_sequences, output_fasta_file, "fasta")

    num_input_sequences = len(list(SeqIO.parse(input_fasta_file, "fasta")))
    num_removed_sequences = num_input_sequences - len(filtered_sequences)
    print(f"{num_removed_sequences} protein sequences were removed.")

71 protein sequences were removed.


In [10]:
# Extract sequences from the filtered FASTA file, but get the DNA sequences

import os
import re
from Bio import SeqIO

# Function to extract protein ID from header with variable structure
def extract_protein_id_from_aa_header(header):
    match = re.search(r'(\S+)', header.split()[0])
    return match.group(1) if match else None

def extract_protein_id_from_cds_header(header):
    match = re.search(r'\[protein_id=([^\]]+)]', header)
    return match.group(1) if match else None

# Define file paths

#input_fasta_file = os.path.join(dir_f,"filtered_hits_aa.fasta")#test
input_fasta_file = os.path.join(dir_f,"240221_4amidases.fasta") #real file

fasta_file = os.path.join(dir_r, "combined_sequences_cds.fasta") 

#output_fasta_file = os.path.join(dir_f,"filtered_hits_dna.fasta") #real file
output_fasta_file = os.path.join(dir_f,"240221_4amidases_cds.fasta") # test

# Create a set of protein IDs from the AA filtered hits fasta file
protein_ids_to_extract = set()
for record in SeqIO.parse(input_fasta_file, "fasta"):
    protein_id = extract_protein_id_from_aa_header(record.description)
    if protein_id:
        protein_ids_to_extract.add(protein_id)

# Extract nucleotide sequences from the giant CDS.fasta file
with open(output_fasta_file, "w") as output_fasta:
    for record in SeqIO.parse(fasta_file, "fasta"):
        protein_id = extract_protein_id_from_cds_header(record.description)
        if protein_id and protein_id in protein_ids_to_extract:
            SeqIO.write(record, output_fasta, "fasta")

print(f"Extracted {len(protein_ids_to_extract)} nucleotide sequences to {output_fasta_file}.")


Extracted 4 nucleotide sequences to ./data/final/240221_4amidases_cds.fasta.


In [12]:
from Bio import SeqIO

def count_entries(fasta_file):
    return sum(1 for _ in SeqIO.parse(fasta_file, "fasta"))

# Example usage
fasta_file_path = os.path.join(dir_f, "filtered_hits_dna.fasta")
num_entries = count_entries(fasta_file_path)
print(f"Number of entries in {fasta_file_path}: {num_entries}")



Number of entries in ./data/final/filtered_hits_dna.fasta: 106


In [1]:
#Following code is to help evaluate the GFF3 results from SP06

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
from typing import List, Dict, Union
from teemi.design.combinatorial_design import simple_amplicon_maker
from pydna.design import primer_design
from pydna.dseqrecord import Dseqrecord


def open_gff3_files(path: str = "") -> List[List[str]]:
    """
    Opens and reads a GFF3 file and returns its contents as a list of lists.

    Parameters:
    -----------
    path: str
        The path to the GFF3 file.

    Returns:
    --------
    List[List[str]]
        A list of lists containing the contents of the GFF3 file.
    """
    with open(path, "r") as infile:
        LINES = []
        for line in infile:
            LINES.append(line[:].split("\t"))
        LINES = LINES[1:]

    return LINES

import re

def tidy_up_gff(lst_of_gff: list) -> list:
    """
    This function takes a list of GFF lines and returns a list of dictionaries,
    with each dictionary containing information on the signal peptides in the GFF file.

    Parameters:
    lst_of_gff (list): A list of GFF lines.

    Returns:
    list_of_peptides (list): A list of dictionaries, with each dictionary containing information on the signal peptides in the GFF file.
    """
    list_of_peptides = []

    for peptide in lst_of_gff:
        # Splitting the gene attribute to extract the first protein name
        gene_attribute_parts = peptide[0].split()
        first_protein_name = gene_attribute_parts[0]

        signal_peptides = {
            "gene": first_protein_name,
            "start_pos": int(peptide[3]) - 1,
            "end_pos": int(peptide[4]) + 1,
            "signal_peptide_likelyhood": peptide[5],
        }

        list_of_peptides.append(signal_peptides)

    return list_of_peptides


def dict_of_signal_peptides(path: str = "") -> List[Dict[str, Union[str, int]]]:
    """
    Given a path to a GFF3 file, returns a list of dictionaries with information on signal peptides.

    Args:
        path (str): Path to the GFF3 file. Default is an empty string.

    Returns:
        list: A list of dictionaries where each dictionary contains the following keys:
            - 'gene' (str): Gene name of the signal peptide.
            - 'start_pos' (int): Start position of the signal peptide in the protein sequence.
            - 'end_pos' (int): End position of the signal peptide in the protein sequence.
            - 'signal_peptide_likelyhood' (str): The likelihood of the sequence being a signal peptide.
    """
    gff = open_gff3_files(path)
    dict_of_signal_peptides = tidy_up_gff(gff)
    return dict_of_signal_peptides


def read_gff_to_pd(path: str = "") -> pd.DataFrame:
    """
    Reads a GFF3 file and returns a pandas DataFrame with columns 'gene', 'start_pos', 'end_pos',
    and 'signal_peptide_likelyhood'.

    Parameters:
    -----------
    path : str
        The path to the GFF3 file.

    Returns:
    --------
    df : pandas.DataFrame
        A DataFrame with columns 'gene', 'start_pos', 'end_pos', and 'signal_peptide_likelyhood'.
    """

    gff = open_gff3_files(path)
    dict_of_signal_peptides = tidy_up_gff(gff)
    df = pd.DataFrame.from_records(dict_of_signal_peptides)

    return df

import requests
import json

def primer_ta_neb(primer1, primer2, conc=0.4, prodcode="phusion-0"):
    """Calculates primer pair melting temp TA,  from NEB.

    Parameters
    ----------
    primer1 : str
        first primer to be used for finding the optimal ta
    primer2 : str
        second primer to be used for finding the optimal ta
    conc : float
    prodcode : str
        find product codes on nebswebsite: https://tmapi.neb.com/docs/productcodes

    Returns
    -------
    ta : int
        primer pair annealing temp

    """

    url = "https://tmapi.neb.com/tm/batch"
    seqpairs = [[primer1, primer2]]

    input = {"seqpairs": seqpairs, "conc": conc, "prodcode": prodcode}
    headers = {"content-type": "application/json"}
    res = requests.post(url, data=json.dumps(input), headers=headers)

    r = json.loads(res.content)

    if r["success"]:
        for row in r["data"]:
            return row["ta"]

    else:
        print("request failed")
        print(r["error"][0])


def primer_tm_neb1(primer, conc=0.4, prodcode="phusion-0"):
    """Calculates a single primers melting temp from NEB.

    Parameters
    ----------
    primer1 : str
    conc : float
    prodcode : str
        find product codes on nebswebsite: https://tmapi.neb.com/docs/productcodes

    Returns
    -------
    tm : int
        primer melting temperature

    """

    url = "https://tmapi.neb.com/tm/batch"
    seqpairs = [[primer]]

    input = {"seqpairs": seqpairs, "conc": conc, "prodcode": prodcode}
    headers = {"content-type": "application/json"}
    res = requests.post(url, data=json.dumps(input), headers=headers)

    r = json.loads(res.content)

    if r["success"]:
        for row in r["data"]:
            return row["tm1"]
    else:
        print("request failed")
        print(r["error"][0])
        


def make_amplicons(
    list_of_amplicons: list, target_tm=58, limit=10, tm_function=primer_tm_neb1,
    forward_overhang="TTTTTTTTTT", reverse_overhang="CCCCCCCC" ):
    """Generates pydna.amplicons which contains primers with a target temperature.

    Parameters
    ----------
    list_of_amplicons : list
        list of pydna.Dseqrecords
    target_tm : int
        representing the target melting temperature for the primers (default=55)
    limit: int
        representing the minimum primer size (default=5)
    tm_function : function
        for calculating primer melting temperature (default=primer_tm_neb)

    forward_overhang : str
        forward overhang sequence (default="TTTTTTTTTT")
    reverse_overhang : str
        reverse overhang sequence (default="CCCCCCCC")

    Returns:
    amplicons: list
        list of amplicon objects with designed primer sequences
    """
    amplicons = []
    for i in range(len(list_of_amplicons)):
        amplicon = primer_design(
            list_of_amplicons[i],
            target_tm=target_tm,
            limit=limit,
            tm_function=tm_function,

            f_overhang=forward_overhang,
            r_overhang=reverse_overhang,
        )

        amplicons.append(amplicon)

    return amplicons




In [7]:
#Defining the Nucleotide sequences that correspond to the .gff3 summary file
sequences = list()
fasta_file_path = os.path.join(dir_f, "240221_4amidases_cds.fasta")
for sequence in SeqIO.parse(fasta_file_path,format='fasta'):
    print(sequence)
    sequences.append(sequence)
    
#Reads .gff3 generated by signalp06 and the corresponding amino acid sequence fasta file (Code by Lucas Levassor)
from Bio import SeqIO

signal_pep = read_gff_to_pd(os.path.join(dir_f,'240221_4amidasesgff3.gff3'))
signal_pep

N_pos = signal_pep['end_pos'].to_list()
genes_with_signal_peptides = signal_pep['gene'].to_list()

my_dict = {'name':genes_with_signal_peptides, 'n_pos':N_pos }
my_dict



ID: lcl|KV907505.1_cds_OOF92982.1_7994
Name: lcl|KV907505.1_cds_OOF92982.1_7994
Description: lcl|KV907505.1_cds_OOF92982.1_7994 [locus_tag=ASPCADRAFT_508787] [db_xref=InterPro:IPR000120,JGIDB:Aspca3_508787] [protein=hypothetical protein] [protein_id=OOF92982.1] [location=185392..187512] [gbkey=CDS]
Number of features: 0
Seq('ATGGCATGGCTGTTTCCTCTCGTTTTCATTATGAGCCTCGTTGGGACACCTGCT...TGA')
ID: lcl|KV907515.1_cds_OOF90661.1_10513
Name: lcl|KV907515.1_cds_OOF90661.1_10513
Description: lcl|KV907515.1_cds_OOF90661.1_10513 [locus_tag=ASPCADRAFT_58797] [db_xref=InterPro:IPR000120,JGIDB:Aspca3_58797] [protein=hypothetical protein] [protein_id=OOF90661.1] [location=complement(join(119270..119769,119829..121236))] [gbkey=CDS]
Number of features: 0
Seq('ATGGTGCGTCTGGCCCAGCTCGCCGTGTCGGCTCTCGGGCTGGTCGGCACAGCG...TGA')
ID: lcl|NC_036441.1_cds_XP_001826005.1_10339
Name: lcl|NC_036441.1_cds_XP_001826005.1_10339
Description: lcl|NC_036441.1_cds_XP_001826005.1_10339 [locus_tag=AO090011000357] [db_xref=Gene

{'name': ['OOF90661.1', 'XP_001826005.1', 'OOF92982.1'], 'n_pos': [21, 28, 23]}

In [8]:
#Truncation of coding sequences for correct primer generation

clean_seq = list()

for i in range(len(genes_with_signal_peptides)):
    gene_id = genes_with_signal_peptides[i]
    for seq in sequences:
        if gene_id in seq.id:
            clean_seq.append(seq[(N_pos[i]+1) * 3:-3]) #Removes one amino acid extra to accomodate for peptidase cleavage site leftovers
            break  # Break the loop if a match is found

# Add sequences that didn't match
for seq in sequences:
   if all(gene_id not in seq.description for gene_id in genes_with_signal_peptides):
        clean_seq.append(seq[3:-3]) #still needs to remove ATG to accomodate signal peptide ATG instead. Also needs to remove stop codon.

clean_seq

[SeqRecord(seq=Seq('GTGTCTACGGGCATGTCCGTCACGCTGGATAACATCAACTACTTCATCTCGCCC...TTT'), id='lcl|KV907515.1_cds_OOF90661.1_10513', name='lcl|KV907515.1_cds_OOF90661.1_10513', description='lcl|KV907515.1_cds_OOF90661.1_10513 [locus_tag=ASPCADRAFT_58797] [db_xref=InterPro:IPR000120,JGIDB:Aspca3_58797] [protein=hypothetical protein] [protein_id=OOF90661.1] [location=complement(join(119270..119769,119829..121236))] [gbkey=CDS]', dbxrefs=[]),
 SeqRecord(seq=Seq('GATAGTTTAGTTGCATCTCAGGTTGTGACAAATCCCTATGAATATGATTTCCCT...ATG'), id='lcl|NC_036441.1_cds_XP_001826005.1_10339', name='lcl|NC_036441.1_cds_XP_001826005.1_10339', description='lcl|NC_036441.1_cds_XP_001826005.1_10339 [locus_tag=AO090011000357] [db_xref=GeneID:5998108] [protein_id=XP_001826005.1] [location=join(914179..914492,914550..916008)] [gbkey=CDS]', dbxrefs=[]),
 SeqRecord(seq=Seq('CTGGAGATGCAGAACATGGACGTCACCCTGGAAAGAGGCATCACATTCGACCTG...TTT'), id='lcl|KV907505.1_cds_OOF92982.1_7994', name='lcl|KV907505.1_cds_OOF92982.1_7994', descrip

In [13]:
# Get the sequence lengths
seq_lengths = [len(seq) for seq in clean_seq]

# Print or use seq_lengths as needed
print(seq_lengths)

[1839, 1683, 2046, 1349]


In [9]:
clean_seq = [Dseqrecord(seq) for seq in clean_seq]

amplicons = make_amplicons(clean_seq,
                           target_tm=62, # target temp
                           limit=18,  # min length of primers
                           tm_function = primer_tm_neb1)

In [10]:
forward_primer = [str(f.forward_primer.seq) for f in amplicons]
r_primer = [str(r.reverse_primer.seq) for r in amplicons]
name = [str(r.name) for r in amplicons]
gene = [str(r.id) for r in clean_seq]
aneal_f = [primer_tm_neb1(str(r)) for r in forward_primer]
aneal_r = [primer_tm_neb1(str(r)) for r in r_primer]

# Add forward and reverse overhangs
forward_overhang = "CTGAGCGGCCTCGTCTGCACAGGGTTGGCAAAT"
reverse_overhang = "CTTGCTCACCATGGACTGGAAGTAGAGGTTCTC"
forward_primer_with_overhang = [forward_overhang + f for f in forward_primer]
reverse_primer_with_overhang = [reverse_overhang + r for r in r_primer]

#Calculates Ta

ta= [primer_ta_neb(str(f.forward_primer.seq),str(f.reverse_primer.seq))  for f in amplicons]


In [11]:

df = pd.DataFrame({
    'template': gene,
    'f_primer': forward_primer_with_overhang,
    'r_primer': reverse_primer_with_overhang,
    'f_tm': aneal_f,
    'r_tm': aneal_r,
    'ta': ta
})

# Calculate and add columns for primer lengths
df['len_fw'] = df['f_primer'].apply(lambda x: len(x))
df['len_rv'] = df['r_primer'].apply(lambda x: len(x))

print(df)


                                   template  \
0       lcl|KV907515.1_cds_OOF90661.1_10513   
1  lcl|NC_036441.1_cds_XP_001826005.1_10339   
2        lcl|KV907505.1_cds_OOF92982.1_7994   
3              NC_036440.1:c3236563-3235134   

                                            f_primer  \
0  CTGAGCGGCCTCGTCTGCACAGGGTTGGCAAATGTGTCTACGGGCA...   
1  CTGAGCGGCCTCGTCTGCACAGGGTTGGCAAATGATAGTTTAGTTG...   
2  CTGAGCGGCCTCGTCTGCACAGGGTTGGCAAATCTGGAGATGCAGA...   
3  CTGAGCGGCCTCGTCTGCACAGGGTTGGCAAATCCATCTGCCAGCT...   

                                            r_primer  f_tm  r_tm  ta  len_fw  \
0  CTTGCTCACCATGGACTGGAAGTAGAGGTTCTCAAACAAGACTGTT...    59    62  62      51   
1  CTTGCTCACCATGGACTGGAAGTAGAGGTTCTCCATAATCACAGGG...    59    58  61      58   
2  CTTGCTCACCATGGACTGGAAGTAGAGGTTCTCAAAAGCCGTTCTC...    59    61  62      53   
3  CTTGCTCACCATGGACTGGAAGTAGAGGTTCTCCGGCAAATGCATG...    61    62  65      51   

   len_rv  
0      56  
1      53  
2      51  
3      54  


In [12]:
import re

truncated_gene = []

for template in gene:
    # Try to extract protein name using regular expression
    match = re.search(r'cds_([^_]+)[._]([^_]+)[._]\d+', template)
    
    if match:
        truncated_gene.append(f"{match.group(1)}_{match.group(2)}")
    else:
        # If the regular expression doesn't match, take the first 15 characters
        truncated_gene.append(template[:20])

print(truncated_gene)


['OOF90661_1', 'XP_001826005.1', 'OOF92982_1', 'NC_036440.1:c3236563']


In [154]:
gene_names_fw = [name+'_fw' for name in truncated_gene]
gene_names_rv = [name+'_rv' for name in truncated_gene]

In [2]:
forward_df = pd.DataFrame({'Name': gene_names_fw, 'Sequence':forward_primer_with_overhang, 'Concentration':'25nm', 'Purification': 'STD'})
reverse_df = pd.DataFrame({'Name': gene_names_rv, 'Sequence':reverse_primer_with_overhang, 'Concentration':'25nm', 'Purification': 'STD'})

idt_primers_result = pd.concat([forward_df, reverse_df],ignore_index=True)
idt_primers_result

output_excel_file = os.path.join(dir_f, "IDT_primer.xlsx")
idt_primers_result.to_excel(output_excel_file, index=False) 


NameError: name 'pd' is not defined

NameError: name 'ææ' is not defined