# Code préservation

In [None]:
pip install Bio

In [None]:
import re
import csv
from Bio import SeqIO
from Bio.Seq import Seq

# Genetic codes: codon (DNA alphabet) -> one-letter amino acid, "*" = stop.
# The nuclear table is written out once; the two other tables are expressed
# as the few codons where they differ from it.
nuclear_genetic_code = {
    # First base T
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
    "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
    # First base C
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    # First base A
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    # First base G
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# "mtalt" code: identical to the nuclear table except that TGA is read as
# tryptophan instead of stop.
mitalt_genetic_code = {**nuclear_genetic_code, "TGA": "W"}

# Mitochondrial code as used in this file: TGA is tryptophan and AGA/AGG are
# stop codons.
# NOTE(review): NCBI translation table 2 (vertebrate mitochondrial) also maps
# ATA to "M"; here ATA stays "I" — confirm this is intended.
mitochondrial_genetic_code = {**mitalt_genetic_code, "AGA": "*", "AGG": "*"}

#Store variants
class Var:
    """One single-position variant together with its observed frequencies.

    All arguments may be given as strings (e.g. raw CSV cells): ``index`` and
    ``position`` are converted with ``int``, the four frequencies with
    ``float``; ``locus``, ``init`` and ``final`` are stored unchanged.
    """

    def __init__(self, index, locus, position, init, final, overall_frequency, frequency_L, frequency_M, frequency_N):
        # Identification of the variant.
        self.index = int(index)
        self.locus = locus
        self.position = int(position)

        # Reference nucleotide (init) and replacement nucleotide (final).
        self.init, self.final = init, final

        # Frequencies: overall, then per group L, M and N.
        raw_frequencies = (
            ("overall_frequency", overall_frequency),
            ("frequency_L", frequency_L),
            ("frequency_M", frequency_M),
            ("frequency_N", frequency_N),
        )
        for attribute, raw_value in raw_frequencies:
            setattr(self, attribute, float(raw_value))

#Store ORF
class ORF:
    """An open reading frame, the variants applied to it and its preservation scores.

    A variant is counted as *destructive* when, applied alone to the
    reference DNA, it changes the length of the translated protein compared
    with ``len(expected_protein)``. Each destructive variant lowers the four
    preservation scores (which start at 100) by its matching frequency.
    """

    # Codons translated as "M" when they open the reading frame.
    START_CODONS = ("ATG", "ATA", "ATT")

    def __init__(self, sequence, start, end, expected_protein, genetic_code, name):
        """Store the reference data of the ORF.

        Args:
            sequence: Reference DNA of the ORF.
            start: Position of ``sequence[0]`` in the coordinate system used
                by the variants' ``position``.
            end: Last position considered part of the ORF.
            expected_protein: Protein the reference DNA is expected to encode.
            genetic_code: Mapping codon -> one-letter amino acid ("*" = stop).
            name: Identifier of the ORF.
        """
        self.sequence_ADN_initiale = sequence  # reference DNA, never modified
        self.sequence_ADN = sequence           # DNA after apply_variants()
        self.start = start
        self.end = end
        self.variants = []                     # variants applied by apply_variants()
        self.genetic_code = genetic_code
        self.preservation_overall = 100
        self.preservation_N = 100
        self.preservation_M = 100
        self.preservation_L = 100
        self.expected_protein = expected_protein
        self.length = len(expected_protein)    # initial protein length
        self.name = name
        self.destructive_variants = []         # subset of variants changing the protein length

    def translate_sequence(self, sequence, strict=False):
        """Translate ``sequence`` codon by codon with ``self.genetic_code``.

        Args:
            sequence: DNA to translate (a ``str`` or any object whose ``str()``
                is the nucleotide string).
            strict: When true, translation starts at index 0. Otherwise it
                starts at the first in-frame codon found in ``START_CODONS``
                (index 0 if there is none).

        Returns:
            The protein as a string. The first translated codon yields "M" if
            it is a start codon; unknown codons yield "?"; translation stops
            after the first "*" (which is included). Trailing nucleotides that
            do not form a full codon are ignored.
        """
        # Plain str so that codon slices are valid dictionary keys.
        sequence = str(sequence)
        last_codon_limit = len(sequence) - 2

        start = 0
        if not strict:
            start = next(
                (i for i in range(0, last_codon_limit, 3) if sequence[i:i + 3] in self.START_CODONS),
                0,
            )

        residues = []
        for i in range(start, last_codon_limit, 3):
            codon = sequence[i:i + 3]
            if i == start and codon in self.START_CODONS:
                aa = "M"  # alternative start codons also initiate with methionine
            else:
                aa = self.genetic_code.get(codon, "?")
            residues.append(aa)
            if aa == "*":
                break

        return "".join(residues)

    def apply_variants(self, variants_list):
        """Apply every applicable variant to the reference DNA and score the ORF.

        The applied/destructive variant lists and the preservation scores are
        recomputed from scratch on every call, so calling this method again
        does not subtract the same frequencies twice.

        Args:
            variants_list: Iterable of variants exposing ``position``,
                ``init``, ``final``, ``overall_frequency``, ``frequency_L``,
                ``frequency_M`` and ``frequency_N``.

        Returns:
            The protein translated (non-strict) from the DNA carrying all
            applied variants; that DNA is stored in ``self.sequence_ADN``.
        """
        reference = str(self.sequence_ADN_initiale)
        mutated = list(reference)  # accumulates all applicable variants

        # Reset per-call state so results only reflect this variants_list.
        self.variants = []
        self.destructive_variants = []
        self.preservation_overall = 100
        self.preservation_L = 100
        self.preservation_M = 100
        self.preservation_N = 100

        for var in variants_list:
            if not is_variant_applicable_to_orf(self, var):
                continue

            offset = var.position - self.start
            mutated[offset] = var.final
            self.variants.append(var)

            # Destructiveness is judged for this variant alone on the reference DNA.
            single_mutant = reference[:offset] + var.final + reference[offset + 1:]
            single_protein = self.translate_sequence(single_mutant, strict=False)

            if len(single_protein) != self.length:
                self.destructive_variants.append(var)
                self.preservation_overall -= var.overall_frequency
                self.preservation_L -= var.frequency_L
                self.preservation_M -= var.frequency_M
                self.preservation_N -= var.frequency_N

        self.sequence_ADN = "".join(mutated)
        return self.translate_sequence(self.sequence_ADN, strict=False)

def is_variant_applicable_to_orf(orf, var):
    """Tell whether ``var`` can be applied to the reference DNA of ``orf``.

    A variant is applicable when its position lies within
    ``[orf.start, orf.end]``, maps to a valid index of
    ``orf.sequence_ADN_initiale`` and the nucleotide found there equals
    ``var.init``.
    """
    reference = orf.sequence_ADN_initiale
    offset = var.position - orf.start  # index of the variant inside the ORF DNA
    return (
        orf.start <= var.position <= orf.end
        and 0 <= offset < len(reference)
        and reference[offset] == var.init
    )

# Mitochondrial genome
def get_mitochondrial_dna(filename="/content/human_mitochondrial_genome.fasta"):
    """Return the sequence of the single record stored in a FASTA file.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The ``seq`` attribute of the record read by ``SeqIO.read``.
    """
    with open(filename) as fasta_handle:
        genome_record = SeqIO.read(fasta_handle, "fasta")
    return genome_record.seq

# Read fasta with ORFs
def read_fasta_proteins(fasta_file, genome_seq, genetic_code):
    """Build one ORF per FASTA record whose header carries its coordinates.

    Each record's description must contain ``start: <int> end: <int>``; the
    record sequence is taken as the expected protein and the ORF DNA is the
    slice ``genome_seq[start:end]``.

    Args:
        fasta_file: Path of the protein FASTA file.
        genome_seq: Genome sequence the coordinates refer to.
        genetic_code: Codon table handed to every created ORF.

    Returns:
        List of ``ORF`` objects, in file order. Records without coordinates
        in their header are skipped.
    """
    coordinates = re.compile(r"start: (\d+) end: (\d+)")
    orfs = []

    with open(fasta_file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            match = coordinates.search(record.description)
            if match is None:
                # Without coordinates the record cannot be mapped onto the genome.
                continue

            start, end = map(int, match.groups())
            orfs.append(
                ORF(
                    # Stored as str so the ORF handles it like a hard-coded sequence.
                    sequence=str(genome_seq[start:end]),
                    start=start,
                    end=end,
                    expected_protein=str(record.seq),
                    genetic_code=genetic_code,
                    name=record.id,
                )
            )

    return orfs

# Variants
def read_variants_file(variant_file):
    """Read the variants from a ';'-separated CSV file.

    The first row is treated as a header and skipped. Every following row
    must hold nine columns: index, locus, position, init, final,
    overall_frequency, frequency_L, frequency_M, frequency_N.

    Args:
        variant_file: Path of the CSV file.

    Returns:
        List of ``Var`` objects in file order; an empty list when the file is
        empty or only contains the header.

    Raises:
        ValueError: If a non-blank row does not have exactly nine columns or
            holds a value that cannot be converted to a number.
    """
    variants_list = []
    with open(variant_file, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=';', quotechar='"')
        next(rows, None)  # skip the header; an empty file simply yields no variants
        for row in rows:
            # Blank lines and rows made only of empty cells carry no variant.
            if not any(cell.strip() for cell in row):
                continue
            index, locus, position, init, final, overall_frequency, frequency_L, frequency_M, frequency_N = row
            variants_list.append(Var(index, locus, position, init, final, overall_frequency, frequency_L, frequency_M, frequency_N))
    return variants_list

# Save in a csv
def _format_variant(v):
    """Render one variant as 'position:init>final (overall, L, M, N)'."""
    return f"{v.position}:{v.init}>{v.final} ({v.overall_frequency}, {v.frequency_L}, {v.frequency_M}, {v.frequency_N})"


def save_results_to_csv(orfs, filename="orf_analysis_results_mtaltco1_overall.csv"):
    """Write one summary row per ORF to a ';'-separated CSV file.

    Each row holds the ORF name, its coordinates, reference DNA and expected
    protein, the applied and the destructive variants (each list joined with
    ';' inside its cell) and the four preservation scores with six decimals.

    Args:
        orfs: ORFs to report, after their variants have been applied.
        filename: Path of the CSV file to create or overwrite.
    """
    header = [
        "Nom ORF", "Start", "End", "ADN initial",
        "Protéine initiale", "Variants appliqués",
        "Variants destructeurs", "Preservation Score Overall",
        "Preservation Score Haplogroup L", "Preservation Score Haplogroup M",
        "Preservation Score Haplogroup N",
    ]

    with open(filename, mode="w", newline="") as file:
        writer = csv.writer(file, delimiter=";")
        writer.writerow(header)

        for orf in orfs:
            writer.writerow([
                orf.name,
                orf.start,
                orf.end,
                orf.sequence_ADN_initiale,
                orf.expected_protein,
                ";".join(_format_variant(v) for v in orf.variants),
                ";".join(_format_variant(v) for v in orf.destructive_variants),
                f"{orf.preservation_overall:.6f}",
                f"{orf.preservation_L:.6f}",
                f"{orf.preservation_M:.6f}",
                f"{orf.preservation_N:.6f}",
            ])

# Save the log
def _variant_log_outcome(orf, reference, initial_length, var):
    """Return the log text describing what happens when ``var`` meets ``orf``."""
    # Step 1: is the variant inside the ORF range?
    if not (orf.start <= var.position <= orf.end):
        return "En dehors du range de cette ORF.\n"

    relative_pos = var.position - orf.start
    if relative_pos < 0 or relative_pos >= len(reference):
        return f"Position relative hors des limites de l’ORF (relative_pos = {relative_pos}).\n"

    # Step 2: does the reference nucleotide match the one the variant expects?
    if reference[relative_pos] != var.init:
        return f"Nucléotide attendu ({var.init}) ≠ trouvé ({reference[relative_pos]}). Mutation non appliquée.\n"

    # Step 3: apply this variant alone to a copy of the reference DNA.
    mutated = list(reference)
    mutated[relative_pos] = var.final

    # Step 4: translate and compare the protein lengths.
    mutated_protein = orf.translate_sequence("".join(mutated))
    outcome = "Dans le range et nucléotide initial correct. Application de la mutation.\n"
    if len(mutated_protein) != initial_length:
        outcome += "Protéine mutée différente en longueur !\n"
        outcome += f"Protéine mutée :\n{mutated_protein}\n"
    else:
        outcome += "Protéine mutée de même longueur que l'initiale.\n"
    return outcome


def save_log(orfs, variants_list, log_filename="orf_analysis_log_overall.txt"):
    """Write a detailed, per-ORF and per-variant trace to a text file.

    For every ORF, each variant is tested independently against the
    reference DNA: the log states whether it falls in the ORF range, whether
    the reference nucleotide matches, and whether the mutated protein has the
    same length as the protein translated from the reference DNA.

    Args:
        orfs: ORFs to trace.
        variants_list: Variants tested against every ORF.
        log_filename: Path of the log file to create or overwrite.
    """
    # UTF-8 is set explicitly: the messages contain characters ("≠", "’")
    # that legacy default encodings such as cp1252 cannot encode.
    with open(log_filename, "w", encoding="utf-8") as log_file:
        for orf in orfs:
            log_file.write(f"===== ORF: {orf.name} (start: {orf.start}, end: {orf.end}) =====\n")
            reference = list(orf.sequence_ADN_initiale)
            initial_length = len(orf.translate_sequence(orf.sequence_ADN_initiale))

            for var in variants_list:
                log_file.write(f"\nVariant {var.index}, position : {var.position}: {var.init} > {var.final} - L : {var.frequency_L} - M : {var.frequency_M} - N : {var.frequency_N} \n")
                log_file.write(_variant_log_outcome(orf, reference, initial_length, var))

            log_file.write("\n\n")


# ORFs
# Hard-coded MTALTCO1 ORF analysed below, translated with mitalt_genetic_code.
my_orf = ORF(
    sequence="ATTTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAATAATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAGCAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAGCTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCAGGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTTAG",  # ORF DNA; original note "e.g. ORF from 0 to 90" is a leftover and does not match start/end below
    start=6089,
    end=6868, #6866 and the 3 nt from the stop codon
    expected_protein="MCNNLLHSNTHHNRRLWQLTSSPNNRCPRYGVSPHKQHKLLTLTSLSPTPARICYSGGRSRNRLNSLPSLSRELLPPWSLRRPNHLLLTPSRCLLYLRGHQFHHNNYQYKTPCHNPMPNAPLRLIRPNHSSPTSPISPSPSCWHHYTTNRPQPQHHLLRPRRRRRPHSMPTPILIFRSPWSLYSYPTRLRNNLPYCNLLLRKKRTIWMHRYGLSYDINWLPRVYRVSTPYIYSRNRRRHTSMFHLRYHNHRYPHRRQSI*",   # Or the expected protein, if you have it
    genetic_code=mitalt_genetic_code,
    name="MTALTCO1"
)

orfs = [my_orf]

# Alternative: build the ORFs from a FASTA file instead of the hard-coded one.
# NOTE(review): genome_seq is not defined in this script; presumably it is the
# value returned by get_mitochondrial_dna() — confirm before enabling.
#orfs = read_fasta_proteins("MTALTCO1.fasta", genome_seq, mitalt_genetic_code)# or mitochondrial_genetic_code

# Output files, named once so the messages below always match what is written.
results_filename = "orf_analysis_results_mtaltco1_overall.csv"
log_filename = "orf_analysis_log_overall.txt"

# Variants
variants_list = read_variants_file("variants.csv")

# Apply variants
for orf in orfs:
    orf.apply_variants(variants_list)

# Save results
save_results_to_csv(orfs, results_filename)
save_log(orfs, variants_list, log_filename)


print(f"Résultats sauvegardés dans {results_filename}")

print(f"Log détaillé sauvegardé dans {log_filename}")

Résultats sauvegardés dans orf_analysis_results_overall.csv
Log détaillé sauvegardé dans orf_analysis_log_overall.txt
