In [2]:
from Bio import SeqIO
from Bio import SeqIO
import csv
import os
import re

In [3]:
def chunk_sequence(seq, chunk_size=512, stride=256):
    """Lazily yield fixed-size, upper-cased windows of ``seq``.

    Windows start at offsets 0, ``stride``, 2 * ``stride``, ... and only
    complete windows are produced: a sequence shorter than ``chunk_size``
    yields nothing, and a tail that cannot fill a whole window is dropped.

    Args:
        seq: Sequence string to split.
        chunk_size: Length of every yielded window.
        stride: Distance between the starts of consecutive windows.

    Yields:
        Substrings of the upper-cased sequence, each ``chunk_size`` long.
    """
    upper = seq.upper()
    # Last valid start is len - chunk_size, hence the "+ 1" on the stop value.
    window_starts = range(0, len(upper) - chunk_size + 1, stride)
    yield from (upper[start:start + chunk_size] for start in window_starts)


In [4]:
def clean_sequence(seq):
    """Return ``seq`` upper-cased with every character other than A, C, G, T, N removed.

    Removed characters are deleted, not replaced, so the result can be shorter
    than the input.
    """
    return re.sub(r"[^ACGTN]", "", seq.upper())

In [5]:
def fasta_to_csv(fasta_path, csv_writer, label, name, source="GenBank", min_len=200 ):
    """Write one CSV row per sufficiently long record of a FASTA file.

    Each record's sequence is passed through ``clean_sequence``; records whose
    cleaned sequence is shorter than ``min_len`` are skipped. The row layout is
    ``[record id, name, cleaned sequence, label, source, cleaned length]``.
    The number of rows written is printed at the end.

    Args:
        fasta_path: FASTA file handed to ``SeqIO.parse``.
        csv_writer: Object exposing ``writerow`` (e.g. a ``csv.writer``).
        label: Value stored in the label column of every row.
        name: Value stored in the name column of every row.
        source: Value stored in the source column of every row.
        min_len: Minimum cleaned-sequence length for a record to be kept.
    """
    count = 0
    for rec in SeqIO.parse(fasta_path, "fasta"):
        cleaned = clean_sequence(str(rec.seq))
        n_bases = len(cleaned)
        # Length is checked after cleaning, so stripped characters do not count.
        if n_bases >= min_len:
            csv_writer.writerow([rec.id, name, cleaned, label, source, n_bases])
            count += 1

    print(f"[✓] {count} séquences enregistrees ")

In [6]:
def build_dataset(fasta_path, output_csv, isGMO=False):
    """Append the sequences of one FASTA file to a labelled CSV dataset.

    The value stored in the ``name`` column is derived from the FASTA file
    name: the extension is dropped and only the first two underscore-separated
    tokens are kept.

    Args:
        fasta_path: Path of the FASTA file to read.
        output_csv: Path of the CSV file to append to; it is created, together
            with its parent directory, when missing.
        isGMO: When true the rows are labelled 1, otherwise 0.

    Note:
        Rows are always appended, so calling this twice with the same FASTA
        file and the same ``output_csv`` duplicates that file's rows.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the output path actually has one.
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    filename = os.path.basename(fasta_path)
    name = os.path.splitext(filename)[0]
    seqName = "_".join(name.split("_")[:2])

    # The file is opened in append mode so that several FASTA files can feed
    # the same dataset. The header must therefore be written only once, when
    # the file is new or still empty; otherwise it is repeated among the data
    # rows on every call.
    needs_header = (
        not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    )

    with open(output_csv, "a", newline="") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["id", "name", "sequence", "label", "source", "length"])

        fasta_to_csv(
            fasta_path=fasta_path,
            csv_writer=writer,
            label=1 if isGMO else 0,
            name=seqName
        )

    print(f"[✓] Dataset Enregistree vers {output_csv}")

In [24]:
# Append the Gossypium hirsutum genome records as non-GMO rows (label 0).
hirsutum_fasta = "raw/Gossypium_hirsutum_v2.1_genomic.fna"
output_csv = "processed/datas.csv"
build_dataset(
    fasta_path=hirsutum_fasta,
    output_csv=output_csv,
    isGMO=False,
)

[✓] 1027 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [7]:
# Append the Vigna unguiculata genome records as non-GMO rows (label 0).
unguiculata_fasta = "raw/Vigna_unguiculata_v2_genomic.fna"
output_csv = "processed/datas.csv"
build_dataset(
    fasta_path=unguiculata_fasta,
    output_csv=output_csv,
    isGMO=False,
)

[✓] 676 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [22]:
# Append the cry1Ac sequence records as GMO rows (label 1).
cry1ac_fasta = "raw/bt_cry1Ac_sequences.fasta"
output_csv = "processed/datas.csv"
build_dataset(
    fasta_path=cry1ac_fasta,
    output_csv=output_csv,
    isGMO=True,
)

[✓] 20 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [23]:
# Append the CP4 EPSPS sequence records as GMO rows (label 1).
cp4epsps_fasta = "raw/bt_cp4epsps_sequences.fasta"
output_csv = "processed/datas.csv"
build_dataset(
    fasta_path=cp4epsps_fasta,
    output_csv=output_csv,
    isGMO=True,
)

[✓] 11 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv


In [8]:
# Append the records of raw/cotton_gmo.fna as GMO rows (label 1).
cotton_gmo_fasta = "raw/cotton_gmo.fna"
output_csv = "processed/datas.csv"
build_dataset(
    fasta_path=cotton_gmo_fasta,
    output_csv=output_csv,
    isGMO=True,
)

[✓] 3 séquences enregistrees 
[✓] Dataset Enregistree vers processed/datas.csv
