In [1]:
from Bio import SeqIO
import random
import csv
import os
import re

In [3]:
def chunk_sequence(seq, chunk_size=512, stride=256):
    """Yield fixed-size, upper-cased windows taken from ``seq``.

    Windows start at offsets 0, ``stride``, 2 * ``stride``, ... and only
    full-length windows are produced: a trailing remainder shorter than
    ``chunk_size`` is dropped, and a sequence shorter than ``chunk_size``
    yields nothing.

    Args:
        seq: Sequence string to slice.
        chunk_size: Length of every yielded window.
        stride: Distance between the starts of consecutive windows.

    Yields:
        Upper-cased substrings of length ``chunk_size``.
    """
    upper = seq.upper()
    last_start = len(upper) - chunk_size
    yield from (
        upper[start:start + chunk_size]
        for start in range(0, last_start + 1, stride)
    )


In [4]:
def clean_sequence(seq):
    """Return ``seq`` upper-cased, keeping only the characters A, C, G, T and N.

    Every other character (gaps, whitespace, other ambiguity codes, ...) is
    removed rather than replaced, so the result may be shorter than the input.
    """
    return re.sub(r"[^ACGTN]", "", seq.upper())

In [5]:
def fasta_to_csv(fasta_path, csv_writer, label, name, source="GenBank", min_len=200):
    """Write one CSV row per sufficiently long record of a FASTA file.

    Each record is parsed with ``SeqIO.parse(..., "fasta")`` and passed through
    ``clean_sequence``. Records whose cleaned sequence is shorter than
    ``min_len`` are skipped. The remaining ones are written as
    ``[record id, name, cleaned sequence, label, source, cleaned length]``.

    Args:
        fasta_path: Path of the FASTA file to read.
        csv_writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
        label: Value stored in the label column of every row.
        name: Value stored in the name column of every row.
        source: Value stored in the source column of every row.
        min_len: Minimum cleaned length a record needs to be written.

    Returns:
        None. The number of written rows is only printed.
    """
    written = 0
    for rec in SeqIO.parse(fasta_path, "fasta"):
        cleaned = clean_sequence(str(rec.seq))
        # Length filter is applied after cleaning, i.e. on what is actually stored.
        if len(cleaned) >= min_len:
            row = [rec.id, name, cleaned, label, source, len(cleaned)]
            csv_writer.writerow(row)
            written += 1

    print(f"[✓] {written} séquences enregistrees ")

In [6]:
def build_dataset(fasta_path, output_csv, isGMO=False):
    """Append the records of one FASTA file to a labelled CSV dataset.

    The CSV is opened in append mode so several FASTA files can be accumulated
    into the same file. The header row is written only when the file does not
    exist yet or is empty; writing it on every call would leave header lines
    in the middle of the data once a second FASTA file is appended.

    Args:
        fasta_path: Path of the FASTA file to read. The name column is derived
            from its base name: the extension is dropped and the first two
            ``_``-separated tokens are kept.
        output_csv: Path of the CSV file to create or extend. Missing parent
            directories are created; a bare file name (no directory part) is
            accepted.
        isGMO: When true the rows are labelled 1, otherwise 0.

    Returns:
        None.
    """
    # os.makedirs("") raises, so only create the directory when there is one.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    filename = os.path.basename(fasta_path)
    stem = os.path.splitext(filename)[0]
    seq_name = "_".join(stem.split("_")[:2])

    # Decide before opening: opening in "a" mode creates the file.
    needs_header = (not os.path.exists(output_csv)) or os.path.getsize(output_csv) == 0

    with open(output_csv, "a", newline="") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["id", "name", "sequence", "label", "source", "length"])

        fasta_to_csv(
            fasta_path=fasta_path,
            csv_writer=writer,
            label=1 if isGMO else 0,
            name=seq_name
        )

    print(f"[✓] Dataset Enregistree vers {output_csv}")

In [24]:
# Append the records of the Gossypium_hirsutum FASTA file to the shared CSV.
# isGMO=False makes build_dataset store label 0 for every row.
hirsutum_fasta="raw/Gossypium_hirsutum_v2.1_genomic.fna"
output_csv="processed/datas.csv"
build_dataset(fasta_path=hirsutum_fasta, output_csv=output_csv, isGMO=False)

[✓] 1027 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [7]:
# Append the records of the Vigna_unguiculata FASTA file to the shared CSV.
# isGMO=False makes build_dataset store label 0 for every row.
unguiculata_fasta="raw/Vigna_unguiculata_v2_genomic.fna"
output_csv="processed/datas.csv"
build_dataset(fasta_path=unguiculata_fasta, output_csv=output_csv, isGMO=False)

[✓] 676 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [22]:
# Append the records of the bt_cry1Ac FASTA file to the shared CSV.
# isGMO=True makes build_dataset store label 1 for every row.
cry1ac_fasta="raw/bt_cry1Ac_sequences.fasta"
output_csv="processed/datas.csv"
build_dataset(fasta_path=cry1ac_fasta, output_csv=output_csv, isGMO=True)

[✓] 20 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [23]:
# Append the records of the bt_cp4epsps FASTA file to the shared CSV.
# isGMO=True makes build_dataset store label 1 for every row.
cp4epsps_fasta="raw/bt_cp4epsps_sequences.fasta"
output_csv="processed/datas.csv"
build_dataset(fasta_path=cp4epsps_fasta, output_csv=output_csv, isGMO=True)

[✓] 11 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [8]:
# Append the records of the cotton_gmo FASTA file to the shared CSV.
# isGMO=True makes build_dataset store label 1 for every row.
cotton_gmo_fasta="raw/cotton_gmo.fna"
output_csv="processed/datas.csv"
build_dataset(fasta_path=cotton_gmo_fasta, output_csv=output_csv, isGMO=True)

[✓] 3 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


Synthétisation de séquences GMO

In [2]:
def clean(seq):
    seq = seq.upper()
    return re.sub(r"[^ACGT]", "", seq)

def random_subseq(seq, min_len, max_len):
    """Return a random contiguous slice of ``seq``.

    The slice length is drawn uniformly from ``[min_len, max_len]`` (both
    inclusive) using the global ``random`` generator. When ``seq`` is not
    longer than the drawn length the whole sequence is returned unchanged;
    otherwise a start offset is drawn uniformly among the valid positions.

    Args:
        seq: Sequence to slice.
        min_len: Smallest slice length that can be drawn.
        max_len: Largest slice length that can be drawn.

    Returns:
        A slice of ``seq`` of the drawn length, or ``seq`` itself when it is
        too short.
    """
    total = len(seq)
    # The length is drawn even when seq turns out to be too short.
    window = random.randint(min_len, max_len)
    if window >= total:
        return seq
    offset = random.randint(0, total - window)
    return seq[offset:offset + window]

def mutate(seq, rate=0.001):
    """Return a copy of ``seq`` with random per-position substitutions.

    For every position a uniform number in [0, 1) is drawn. When it is below
    ``rate`` the character is replaced by a base chosen uniformly from
    A, C, G, T. The replacement may equal the original base, so the fraction
    of positions that actually change can be lower than ``rate``.

    Args:
        seq: Sequence to mutate.
        rate: Per-position probability of drawing a replacement base.

    Returns:
        The mutated sequence as a string of the same length.
    """
    alphabet = ["A","C","G","T"]
    # The condition is evaluated before the replacement is drawn, so the RNG
    # is consumed as: one random() per position, then one choice() on a hit.
    return "".join(
        random.choice(alphabet) if random.random() < rate else base
        for base in seq
    )

In [3]:
def build_synthetic_gmo(plant_genomes, promoter_seqs, gene_seqs, terminator_seqs,
        n_samples=1000, flank_min=300, flank_max=2000, mutation_rate=0.0005):
    """Assemble synthetic positive samples of the form flank + cassette + flank.

    For every sample one plant sequence, one promoter, one gene and one
    terminator are picked at random. Two flanks are cut independently from
    the chosen plant sequence with ``random_subseq`` and the concatenation
    ``left flank + promoter + gene + terminator + right flank`` is passed
    through ``mutate``.

    Args:
        plant_genomes: Candidate host sequences for the flanks.
        promoter_seqs: Candidate promoter sequences.
        gene_seqs: Candidate gene sequences.
        terminator_seqs: Candidate terminator sequences.
        n_samples: Number of samples to build.
        flank_min: Minimum flank length passed to ``random_subseq``.
        flank_max: Maximum flank length passed to ``random_subseq``.
        mutation_rate: Rate passed to ``mutate``.

    Returns:
        A list of ``n_samples`` dicts with the keys ``id``, ``name``,
        ``sequence``, ``label`` (always 1), ``source`` and ``length``.
    """
    def _make_sample(index):
        # Draw order (host, promoter, gene, terminator, flanks) is kept fixed
        # so a seeded RNG produces the same samples.
        host = random.choice(plant_genomes)
        promoter_part = random.choice(promoter_seqs)
        gene_part = random.choice(gene_seqs)
        terminator_part = random.choice(terminator_seqs)

        upstream = random_subseq(host, flank_min, flank_max)
        downstream = random_subseq(host, flank_min, flank_max)

        construct = mutate(
            upstream + promoter_part + gene_part + terminator_part + downstream,
            mutation_rate,
        )
        return {
            "id": f"GMO_GOS_SYN_{index:06d}",
            "name" : "synthetic_GMO",
            "sequence": construct,
            "label": 1,
            "source": "synthetic_GMO",
            "length": len(construct)
        }

    return [_make_sample(index) for index in range(n_samples)]

In [4]:

def load_fasta(path):
    """Read a FASTA file and return its sequences as cleaned strings.

    Every record parsed by ``SeqIO.parse(path, "fasta")`` is converted to
    ``str`` and passed through ``clean``. All sequences are held in memory
    at once, in file order.

    Args:
        path: Path of the FASTA file.

    Returns:
        A list with one cleaned sequence string per record.
    """
    return [clean(str(record.seq)) for record in SeqIO.parse(path, "fasta")]

In [5]:
def append_to_csv(samples, csv_path):
    """Append sample dicts to a CSV file, writing the header when needed.

    The header row is written when the file does not exist yet or exists but
    is empty, so the resulting file always starts with exactly one header.

    Args:
        samples: Iterable of dicts providing the keys ``id``, ``name``,
            ``sequence``, ``label``, ``source`` and ``length``.
        csv_path: Path of the CSV file to create or extend. Missing parent
            directories are created; a bare file name (no directory part) is
            accepted.

    Raises:
        KeyError: If a sample lacks one of the expected keys.
    """
    columns = ["id", "name", "sequence", "label", "source", "length"]

    # Must be decided before opening: opening in "a" mode creates the file.
    needs_header = (not os.path.exists(csv_path)) or os.path.getsize(csv_path) == 0

    # os.makedirs("") raises, so only create the directory when there is one.
    parent = os.path.dirname(csv_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(csv_path, "a", newline="") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(columns)

        for s in samples:
            writer.writerow([s[key] for key in columns])

In [8]:
# Load the four FASTA inputs used to assemble synthetic samples; load_fasta
# returns one cleaned sequence string per record.
# NOTE(review): every sequence of each file is kept in memory at once —
# confirm this is acceptable for the genome file.
plant_genomes = load_fasta("raw/Vigna_unguiculata_v2_genomic.fna")
promoters     = load_fasta("raw/promoters.fasta")
genes         = load_fasta("raw/gmo_genes.fasta")
terminators   = load_fasta("raw/terminators.fasta")

In [9]:
# Seed the global RNG so the generated samples can be reproduced on a fresh
# kernel; build_synthetic_gmo and its helpers draw from the `random` module.
SYNTHETIC_SEED = 42
random.seed(SYNTHETIC_SEED)

synthetic_samples = build_synthetic_gmo(plant_genomes=plant_genomes, promoter_seqs=promoters, gene_seqs=genes,
    terminator_seqs=terminators, n_samples=5000, flank_min=500, flank_max=3000, mutation_rate=0.0003)

# Rows are appended: re-running this cell adds the samples to the CSV again.
append_to_csv(synthetic_samples, csv_path="processed/datas.csv")

print(f"[✓] Generated {len(synthetic_samples)} synthetic GMO sequences")

[✓] Generated 5000 synthetic GMO sequences
