In [252]:
import re
def parse_fasta(text):
    """Parse FASTA-formatted text into ``(header, sequence, gene)`` tuples.

    Records are split on lines starting with ``>``. For each record the
    first line (without the ``>``) is the header and the remaining lines
    are concatenated into the sequence with all whitespace removed.

    The gene is the 4th ``|``-separated field of the header (index 3);
    it is ``None`` when the header has fewer than four fields.

    Args:
        text: Full content of a FASTA file as a single string.

    Returns:
        A list of ``(header, sequence, gene)`` tuples, in file order.
        Empty input yields an empty list.
    """
    entries = re.split(r'(?m)^>', text.strip())
    result = []
    for entry in entries:
        # The split leaves an empty string before the first '>'.
        if not entry:
            continue
        lines = entry.splitlines()
        header = lines[0].strip()
        fields = header.split('|')
        # Index 3 only exists when there are at least four fields.
        gene = fields[3] if len(fields) > 3 else None
        # str.split() with no argument drops spaces, tabs and stray newlines.
        seq = ''.join(''.join(lines[1:]).split())
        result.append((header, seq, gene))
    return result

if __name__ == "__main__":
    # Read the whole FASTA file into memory, then turn it into the
    # (header, sequence, gene) tuples used by the cells below.
    with open("ang_utr_positive.fa", "r") as fasta_file:
        mir_data_text = fasta_file.read()

    records = parse_fasta(mir_data_text)


In [253]:
# Names of the per-gene miRNA CSV files to load, one entry per distinct gene.
# Records whose header carried no gene field (gene is None) are skipped: no
# file name can be built for them and the string concatenation would raise
# TypeError. dict.fromkeys de-duplicates while keeping first-seen order, so a
# gene that appears in several FASTA records is only listed (and read) once.
mir_data = list(dict.fromkeys(
    "miARN_" + gene + ".csv"
    for _header, _seq, gene in records
    if gene is not None
))

In [254]:
import pandas as pd

# Per-file lookup: {csv_file_name: {Accession_Id: "seq1,seq2,..."}}
mir_base = {}

for file in mir_data:
    try:
        df = pd.read_csv(file, encoding='utf-8')
        # Column names may carry stray whitespace; normalise before lookup.
        df.columns = df.columns.str.strip()

        if 'Accession_Id' in df.columns and 'Sequence' in df.columns:
            # Drop rows with a missing sequence *before* cleaning. Assigning
            # Series.dropna() back to the column re-aligns on the index and
            # puts the NaN values back, which then breaks ','.join below and
            # makes the whole file fall into the except branch.
            sequences = df.loc[
                df['Sequence'].notna(), ['Accession_Id', 'Sequence']
            ].copy()
            sequences['Sequence'] = sequences['Sequence'].str.strip()

            # Group by Accession_Id and join each accession's unique sequences
            # into one comma-separated string.
            grouped = (
                sequences.groupby('Accession_Id')['Sequence']
                         .unique()
                         .apply(','.join)
            )

            # Store as a dictionary: {Accession_Id: joined_sequences}
            mir_base[file] = grouped.to_dict()
        else:
            print(f"Colonnes attendues non trouvées dans {file}")
    except Exception as e:
        # Best effort: report the failing file and carry on with the others.
        print(f"Erreur avec {file}: {e}")

print(mir_base)

{'miARN_APOD.csv': {'hsa-miR-1301-3p': 'UUGCAGCUGCCUGGGAGUGACUUC'}, 'miARN_EGR3.csv': {'hsa-miR-181a-5p': 'AACAUUCAACGCUGUCGGUGAGU', 'hsa-miR-27a-3p': 'UUCACAGUGGCUAAGUUCCGC', 'hsa-miR-375-3p': 'UUUGUUCGUUCGGCUCGCGUGA'}, 'miARN_FGF1.csv': {'hsa-miR-126-5p': 'CAUUAUUACUUUUGGUACGCG', 'hsa-miR-128-3p': 'UCACAGUGAACCGGUCUCUUU', 'hsa-miR-143-3p': 'UGAGAUGAAGCACUGUAGCUC', 'hsa-miR-27a-3p': 'UUCACAGUGGCUAAGUUCCGC', 'hsa-miR-27b-3p': 'UUCACAGUGGCUAAGUUCUGC', 'hsa-miR-28-5p': 'AAGGAGCUCACAGUCUAUUGAG', 'hsa-miR-33a-5p': 'GUGCAUUGUAGUUGCAUUGCA', 'hsa-miR-361-5p': 'UUAUCAGAAUCUCCAGGGGUAC', 'hsa-miR-485-3p': 'GUCAUACACGGCUCUCCUCUCU', 'hsa-miR-495-3p': 'AAACAAACAUGGUGCACUUCUU', 'hsa-miR-708-5p': 'AAGGAGCUUACAAUCUAGCUGGG'}, 'miARN_HOXA3.csv': {'hsa-let-7a-5p': 'UGAGGUAGUAGGUUGUAUAGUU', 'hsa-let-7b-5p': 'UGAGGUAGUAGGUUGUGUGGUU', 'hsa-let-7c-5p': 'UGAGGUAGUAGGUUGUAUGGUU', 'hsa-miR-10a-5p': 'UACCCUGUAGAUCCGAAUUUGUG', 'hsa-miR-10b-5p': 'UACCCUGUAGAACCGAAUUUGUG', 'hsa-miR-1305': 'UUUUCAACUCUAAUGGGAGAGA', 

In [255]:
# Base-pairing table used to complement a sequence (A<->U, C<->G)
complement = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}


def reverse_complement(seq):
    """Return ``seq`` reversed, with every base swapped for its complement.

    Characters that are not keys of ``complement`` (for example lowercase
    letters or ``N``) are kept unchanged.
    """
    pairing = str.maketrans(complement)
    return seq[::-1].translate(pairing)

# Sequence processing: build the reverse-complemented seed motifs.

# Seed window taken from each miRNA sequence, as a 0-based slice [1:9]
# (nucleotides 2 through 9, eight bases).
SEED_START, SEED_END = 1, 9

# {csv_file_name: {Accession_Id: [reverse-complemented seed, ...]}}
reverses = {}

for file, acc_dict in mir_base.items():
    reverses[file] = {}
    for acc_id, sequences in acc_dict.items():
        motifs = [motif.strip() for motif in sequences.split(',')]
        # Require the whole window to be present: a sequence shorter than
        # SEED_END would silently yield a truncated (shorter) motif.
        rev_list = [
            reverse_complement(motif[SEED_START:SEED_END])
            for motif in motifs
            if len(motif) >= SEED_END
        ]
        reverses[file][acc_id] = rev_list


In [256]:
import re

# Assumes `records`, `mir_base` and `reverses` were defined by earlier cells.


def _clean_motifs(value):
    """Normalise one `reverses` entry to a list of non-empty motif strings."""
    items = value if isinstance(value, (list, tuple)) else [value]
    return [m for m in items if isinstance(m, str) and m]


data = {}

# 1. Build `data` from records + mir_base + reverses
for header, seq, gene in records:
    file_name = f"miARN_{gene}.csv"

    # Per-miRNA motif lists for this gene; empty when no CSV was loaded.
    rev = reverses.get(file_name, {})

    # Every motif to search for in this record, across all of its miRNAs.
    motifs = [m for lst in rev.values() for m in _clean_motifs(lst)]

    # Map each miRNA to the motifs derived from *its own* sequence. Testing
    # whether a motif is a substring of the miRNA sequence cannot work: the
    # motif is the reverse complement of the seed, so it is essentially never
    # contained in the miRNA it came from.
    mirnas = {
        mirna: _clean_motifs(rev.get(mirna, []))
        for mirna in mir_base.get(file_name, {})
    }

    data[header] = {
        "header": header,
        "sequence": seq,
        "comp_mir_data": motifs,
        "mirnas": mirnas
    }

# 2. Search for each motif, replace it with "N", record the positions
#
# NOTE(review): the motifs are built from an A/U/C/G complement table. If the
# FASTA sequences use T instead of U, any motif containing U can never match
# - confirm the alphabet of the input file.


def _mask_motifs(sequence, motifs):
    """Locate every occurrence of each motif and mask it with ``N``.

    Matching is case-insensitive (both sides are upper-cased). Each motif is
    searched independently with a zero-width lookahead, so occurrences that
    overlap each other, or overlap another motif, are all reported. A single
    alternation passed to ``sub`` would only see non-overlapping hits.

    Returns:
        ``(masked_sequence, positions)`` where ``positions`` maps each
        upper-cased motif to the list of its 0-based start offsets.
    """
    upper_seq = sequence.upper()
    masked = list(upper_seq)
    positions = {}
    # dict.fromkeys: search each distinct motif once, in first-seen order.
    for motif in dict.fromkeys(m.upper() for m in motifs if m):
        width = len(motif)
        for hit in re.finditer(f"(?={re.escape(motif)})", upper_seq):
            start = hit.start()
            positions.setdefault(motif, []).append(start)
            masked[start:start + width] = "N" * width
    return "".join(masked), positions


for hdr, info in data.items():
    motifs = info["comp_mir_data"]

    seq_mod, position_dict = _mask_motifs(info["sequence"], motifs)

    # Motifs that were searched for but never seen in the sequence.
    no_find = [m for m in motifs if m.upper() not in position_dict]

    info.update({
        "sequence_modifier": seq_mod,
        "position": position_dict,
        "no_find": no_find
    })


In [None]:
import os
import re

# Two output trees: per-gene text reports and per-gene masked FASTA files.
os.makedirs("resultats_genes", exist_ok=True)
# The FASTA directory must exist as well, otherwise the first open() on a
# FASTA path raises FileNotFoundError.
os.makedirs("resultats_fasta", exist_ok=True)

# Paths already written during this run. The first write to a path truncates
# it, so re-running the cell does not pile duplicates onto an earlier run;
# later records of the same gene are appended.
written_paths = set()

for header, info in data.items():
    # Gene name is the 4th '|'-separated header field; adapt if the header
    # layout differs.
    parts = header.split('|')
    gene_name = parts[3] if len(parts) > 3 else "unknown"

    # Make the gene name safe to use in a file name.
    gene_name_clean = re.sub(r'[^a-zA-Z0-9_-]', '_', gene_name)

    output_path = os.path.join("resultats_genes", f"miARN_{gene_name_clean}_output.txt")

    fasta_path = os.path.join("resultats_fasta", f"{gene_name_clean}_output.fasta")

    fasta_mode = "a" if fasta_path in written_paths else "w"
    with open(fasta_path, fasta_mode, encoding="utf-8") as f:
        f.write(f">{info['header']}\n")
        f.write(f"{info['sequence_modifier']}\n")
    written_paths.add(fasta_path)

    output_mode = "a" if output_path in written_paths else "w"
    with open(output_path, output_mode, encoding="utf-8") as f:
        f.write(f"Header: {info['header']}\n")
        f.write(f"Motifs recherchés : {info['comp_mir_data']}\n")
        f.write(f"Séquence originale : {info['sequence']}\n")
        f.write(f"Séquence modifiée : {info['sequence_modifier']}\n")
        f.write(f"Positions : {info['position']}\n")
        f.write(f"Motifs non trouvés : {info['no_find']}\n")

        # Only list miRNAs whose motifs were actually found in the sequence.
        if info["position"] and info["mirnas"]:
            f.write("miARN associés (pour motifs trouvés) :\n")
            for mirna, motifs in info["mirnas"].items():
                for motif in motifs:
                    if motif in info["position"]:  # motif was found
                        positions = info["position"][motif]
                        f.write(f"   - {mirna} (motif {motif}) → positions {positions}\n")
        else:
            f.write("miARN associés : Aucun trouvé\n")

        f.write("-" * 50 + "\n")
    written_paths.add(output_path)

print("✅ Fichiers générés dans le dossier 'resultats_genes'")


✅ Fichiers générés dans le dossier 'resultats_genes'


In [None]:
# One summary row per record: the gene name (4th '|'-separated header field,
# or "unknown" when absent) and the positions dict rendered as a string.
rows = [
    {
        "Gene": fields[3] if len(fields) > 3 else "unknown",
        "Position": str(info["position"]),  # dict converted to its string form
    }
    for hdr, info in data.items()
    for fields in [hdr.split('|')]
]


In [259]:
import csv
# Column order of the exported CSV.
fieldnames = ["Gene", "Position"]

# newline="" lets the csv module control line endings itself.
with open("resultats_miRNA.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for row in rows:
        writer.writerow(row)
