In [None]:
# Imports and Chroma API registration
import csv
import os

import pandas as pd
from Bio import PDB
from chroma import api

# The Chroma API key is read from the environment so that no credential is
# stored in the notebook. Export CHROMA_API_KEY before starting the kernel.
# NOTE(review): any key that was ever committed in this cell should be revoked.
chroma_api_key = os.environ.get("CHROMA_API_KEY")
if not chroma_api_key:
    raise RuntimeError(
        "Set the CHROMA_API_KEY environment variable before running this notebook."
    )
api.register_key(chroma_api_key)

# Imported after the key has been registered, keeping the original cell order.
from chroma import Protein, Chroma

In [None]:
# File paths
design_out = './design_out/'          # directory the design step writes PDB files to
prediction_out = './prediction_out/'  # directory the design step reads predicted PDB files from
output_file_path = "./output.tsv"     # TSV table read and rewritten by the sequence cells
tsv_file_path = output_file_path      # same table, under the name the replacement cell uses

In [None]:
%%bash
#esmfold prediction
# Fold every FASTA record of $file_path with the ESM Atlas API and store each
# structure as <record number>.pdb. Records longer than 400 characters are
# skipped but still consume a number.

# Relative to the notebook's working directory, matching `prediction_out`
# ('./prediction_out/') used by the Python cells. Must end with a slash because
# the file name is appended directly.
output_path="./prediction_out/"
file_path="protein.fna"
max_attempts=10   # give up on a sequence after this many failed requests
retry_delay=5     # seconds to wait between attempts

mkdir -p "$output_path"

# One line per FASTA record: header lines start a new record, all other lines
# are appended to the current record with any whitespace (including a stray
# carriage return) removed.
sequences=$(awk '/^[>]/ {if (seq) {print seq}; seq=""; next} /^[^>]/ {gsub(/[[:space:]]/, ""); seq = seq $0} END {if (seq) {print seq}}' "$file_path")

counter=1
while read -r sequence; do
  # Nothing to fold (e.g. the input file contained no sequence data).
  if [ -z "$sequence" ]; then
    continue
  fi

  sequence_length=${#sequence}
  if [ "$sequence_length" -gt 400 ]; then
    echo "Skipping sequence $counter as its length is greater than 400"
    ((counter++))
    continue
  fi

  pdb_file="$output_path$counter.pdb"
  attempt=1
  while true; do
    curl -sS -X POST --data "$sequence" "https://api.esmatlas.com/foldSequence/v1/pdb/" > "$pdb_file"
    if [ -z "$(cat "$pdb_file")" ] || grep -q "INTERNAL SERVER ERROR" "$pdb_file"; then
      if [ "$attempt" -ge "$max_attempts" ]; then
        # Fail the cell loudly instead of looping forever or leaving a bogus
        # PDB file behind for the later cells to pick up.
        echo "Giving up on sequence $counter after $max_attempts attempts" >&2
        rm -f "$pdb_file"
        exit 1
      fi
      echo "PDB file has INTERNAL SERVER ERROR or is empty, retrying..."
      sleep "$retry_delay"
      ((attempt++))
    else
      echo "PDB file generated successfully: $pdb_file"
      break
    fi
  done
  ((counter++))

done <<< "$sequences"


In [None]:
#design proteins
# Run Chroma's design step on every predicted structure and save the result
# under the same file name in `design_out`.

chroma = Chroma()

# Create the output directory up front; none of the visible cells create it.
os.makedirs(design_out, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so the run (and its
# printed log) is reproducible.
protein_names = sorted(file for file in os.listdir(prediction_out) if file.endswith(".pdb"))

for p in protein_names:
    print(p)
    protein = Protein(prediction_out + p)
    protein = chroma.design(protein)
    protein.to(design_out + p)

In [None]:
#PDB to pro_sequence

# One-letter codes of the 20 standard amino acids, keyed by PDB residue name.
# Kept locally because Bio.PDB.Polypeptide.three_to_one is not available in
# recent Biopython releases.
_THREE_TO_ONE = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
    'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
}


def pdb_to_sequence(pdb_file_path):
    """Return the one-letter amino-acid sequence stored in a PDB file.

    Residues are read in file order across every model and chain and are
    concatenated into a single string, so a multi-model or multi-chain file
    yields one joined sequence.

    Args:
        pdb_file_path: Path of the PDB file to parse.

    Returns:
        str: One character per residue; any residue whose name is not one of
        the 20 standard amino acids is reported as 'X'.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file_path)
    return "".join(
        _THREE_TO_ONE.get(residue.resname.strip().upper(), 'X')
        for model in structure
        for chain in model
        for residue in chain
    )

# Load the results table and list the designed structures in numeric
# file-name order (so "2.pdb" comes before "10.pdb").
df = pd.read_csv(output_file_path, delimiter='\t')
pdb_names = sorted(
    [entry for entry in os.listdir(design_out) if entry.endswith(".pdb")],
    key=lambda name: int(name.split('.')[0]),
)

df['pro_sequence'] = ""

# Extract every sequence first, then fill the column: the n-th structure in
# sorted order goes to the n-th row of the table.
protein_sequences = [pdb_to_sequence(design_out + pdb) for pdb in pdb_names]

for index, protein_sequence in enumerate(protein_sequences):
    df.at[index, 'pro_sequence'] = protein_sequence
    print(protein_sequence)

df.to_csv(output_file_path, sep='\t', index=False)


In [None]:
#protein_sequence to DNA sequence

# Codons of the standard genetic code for each amino acid ('*' = stop).
# protein_to_dna always emits the first codon of each list.
# NOTE(review): if the target organism is a Mycoplasma (translation table 4,
# where TGA encodes Trp), the emitted stop codon TAA is still a stop - confirm
# before changing the codon choice.
_GENETIC_CODE = {
    'A': ['GCT', 'GCC', 'GCA', 'GCG'],
    'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'N': ['AAT', 'AAC'],
    'D': ['GAT', 'GAC'],
    'C': ['TGT', 'TGC'],
    'Q': ['CAA', 'CAG'],
    'E': ['GAA', 'GAG'],
    'G': ['GGT', 'GGC', 'GGA', 'GGG'],
    'H': ['CAT', 'CAC'],
    'I': ['ATT', 'ATC', 'ATA'],
    'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'K': ['AAA', 'AAG'],
    'M': ['ATG'],
    'F': ['TTT', 'TTC'],
    'P': ['CCT', 'CCC', 'CCA', 'CCG'],
    'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'T': ['ACT', 'ACC', 'ACA', 'ACG'],
    'W': ['TGG'],
    'Y': ['TAT', 'TAC'],
    'V': ['GTT', 'GTC', 'GTA', 'GTG'],
    '*': ['TAA', 'TAG', 'TGA']
}


def protein_to_dna(protein_sequence):
    """Back-translate a one-letter protein sequence into DNA.

    Each residue is replaced by the first codon listed for it in
    ``_GENETIC_CODE``. Characters that are not in the table (for example 'X')
    contribute nothing to the result.

    Args:
        protein_sequence: Protein sequence in upper-case one-letter codes.
            ``None`` and NaN (what pandas yields for an empty TSV cell) are
            treated as an empty sequence.

    Returns:
        str: The concatenated codons, or "" for an empty or missing sequence.
    """
    import math

    # An empty cell in the TSV comes back from pandas as float NaN, which is
    # not iterable; treat it (and None) as "no sequence".
    if protein_sequence is None or (
        isinstance(protein_sequence, float) and math.isnan(protein_sequence)
    ):
        return ""

    return "".join(
        _GENETIC_CODE.get(amino_acid, [''])[0] for amino_acid in protein_sequence
    )

# Back-translate every designed protein in the table and store the DNA in a
# new 'sequence' column.
df = pd.read_csv(output_file_path, delimiter='\t')

# Rows without a designed protein are stored as empty cells, which pandas reads
# back as NaN; map those to "" so they back-translate to an empty DNA string
# instead of failing.
protein_seqs = ["" if pd.isna(seq) else seq for seq in df['pro_sequence']]

df['sequence'] = ""

for index, protein_seq in enumerate(protein_seqs):
    dna_seq = protein_to_dna(protein_seq)
    df.at[index, 'sequence'] = dna_seq
    print(f"Protein Sequence: {protein_seq}")
    print(f"DNA Sequence: {dna_seq}")

df.to_csv(output_file_path, sep='\t', index=False)
# Reported once, after the table has actually been written.
print(f"DNA Sequence written to {output_file_path}")



In [None]:
# Remove every newline from genome.txt, rewriting the file in place so the
# genome becomes one continuous string.
with open('./genome.txt', 'r') as file:
    content = file.read()

# Splitting on the newline character and re-joining drops exactly those characters.
content_without_newlines = "".join(content.split('\n'))

with open('./genome.txt', 'w') as file:
    file.write(content_without_newlines)


In [None]:
#replace the original sequence

def replace_substring(original_string, target_substring, replacement_string):
    """Return ``original_string`` with every ``target_substring`` replaced.

    Args:
        original_string: Text to search.
        target_substring: Text to look for; all occurrences are replaced.
        replacement_string: Text inserted in place of each occurrence.

    Returns:
        str: The modified text. The input is returned unchanged when
        ``target_substring`` is empty or does not occur.
    """
    # str.replace with an empty target inserts the replacement before every
    # character; an empty target (e.g. a blank input line) must be a no-op.
    if not target_substring:
        return original_string
    return original_string.replace(target_substring, replacement_string)

#original_string
with open("./genome.txt", "r", encoding="utf-8") as file:
    genome = file.read()
original_string = genome

#target_substrings
# One original DNA sequence per line. Lines of 1200 characters or more
# (the count includes the trailing newline) are left out.
file_path = "original_dna_sequence_list.txt"
target_substrings = []
with open(file_path, 'r') as file:
    for line in file:
        if len(line) < 1200:
            target_substrings.append(line.rstrip())


#replacement_strings
# Values of the 'sequence' column of the TSV table, in row order.
replacement_strings = []
with open(tsv_file_path, newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    header = next(reader)
    sequence_index = header.index('sequence')
    for row in reader:
        replacement_strings.append(row[sequence_index])

# zip() below stops at the shorter list, so a length mismatch would otherwise
# go unnoticed.
if len(target_substrings) != len(replacement_strings):
    print(
        f"Warning: {len(target_substrings)} target sequences but "
        f"{len(replacement_strings)} replacement sequences; unpaired entries are ignored."
    )

#replacement
# Targets and replacements are paired by position.
new_genome = original_string
for target_substring, replacement_string in zip(target_substrings, replacement_strings):
    # An empty target (blank input line) would make str.replace insert the
    # replacement before every character, and an empty replacement would delete
    # the original sequence from the genome; leave the genome untouched in both cases.
    if not target_substring or not replacement_string:
        continue
    new_genome = replace_substring(new_genome, target_substring, replacement_string)

with open("./new_genome.txt", "w", encoding="utf-8") as new_file:
    new_file.write(new_genome)


print("A New Life Is Born!")

In [None]:
# Re-wrap new_genome.txt to 80 characters per line, rewriting the file in place.
with open('./new_genome.txt', 'r') as file:
    content2 = file.read()

# Drop any existing line breaks first so that running this cell again on an
# already wrapped file gives the same result instead of wrapping the newlines too.
content2 = content2.replace('\n', '')
content2_with_newlines = '\n'.join(content2[i:i+80] for i in range(0, len(content2), 80))

with open('./new_genome.txt', 'w') as file:
    file.write(content2_with_newlines)


In [None]:
# Sanity check: prints True when the edited genome is identical to the
# original one, i.e. when no replacement changed anything.
str1, str2 = new_genome, genome
print(str1 == str2)