# Pipeline de nettoyage & analyse — version professionnelle

Ce notebook contient :
- une **architecture modulaire** basée sur des fonctions réutilisables
- un **point d'entrée `run_pipeline()`** pour exécuter l'ensemble des étapes
- un **contrôle global `VERBOSE`** (vprint) pour réactiver tous les `print()` historiques
- un **schéma de workflow** (Mermaid) à visualiser


**Remplace `xxxxxxxxx` par ton chemin local si tu veux enregistrer ou charger depuis un dossier spécifique.**


> Utilise les cellules de code ci-dessous pour adapter le pipeline ou l'exécuter étape par étape.


```mermaid
flowchart TD
  A[Chargement CSVs] --> B[Construction du graphe NetworkX]
  B --> C[Ajout attributs nodes]
  C --> D[Extraction : nœuds Drug]
  D --> E["Parsing : 'AND', ';', parenthèses"]
  E --> F[Merging & dédoublonnage]
  F --> G[Comparaison avec drugs_used.csv]
  G --> H[Export / Visualisations]
```

*Si ton environnement Jupyter n'affiche pas Mermaid, ce diagramme fournit la logique séquentielle à suivre.*


In [None]:
# Globals & vprint
VERBOSE = True  # set to False to silence every vprint() call


def vprint(msg):
    """Print ``msg`` only while the module-level ``VERBOSE`` flag is truthy."""
    if not VERBOSE:
        return
    print(msg)

In [None]:
# Loaders
import csv

def load_nodes(path):
    """Load node records from a CSV file, skipping the header row.

    Args:
        path: Path of a UTF-8 encoded CSV file whose first row is a header.

    Returns:
        A ``(nodes, node_names)`` tuple. ``nodes`` is the list of data rows
        (each a list of strings) and ``node_names`` holds the first column
        of every row, in file order.
    """
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires (quoted fields may contain embedded newlines).
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row (no-op on an empty file)
        # csv.reader yields [] for blank lines; indexing row[0] on those
        # would raise IndexError, so they are filtered out here.
        nodes = [row for row in reader if row]
    node_names = [row[0] for row in nodes]
    vprint(f"Loaded {len(node_names)} nodes from {path}")
    return nodes, node_names

def load_edges(path):
    """Load edge records from a CSV file, skipping the header row.

    Args:
        path: Path of a UTF-8 encoded CSV file whose first row is a header.

    Returns:
        A list with one tuple of strings per non-blank data row, in file
        order.
    """
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires (quoted fields may contain embedded newlines).
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row (no-op on an empty file)
        # Blank lines come back as [] and would become empty tuples, which
        # are not usable as edges; skip them.
        edges = [tuple(row) for row in reader if row]
    vprint(f"Loaded {len(edges)} edges from {path}")
    return edges

In [None]:
# Graph builder
import networkx as nx

def build_graph(node_names, nodes, edges):
    """Assemble an undirected NetworkX graph from node and edge records.

    Args:
        node_names: Iterable of node identifiers added as graph nodes.
        nodes: Node rows; column 0 is the identifier, column 1 is stored as
            the ``type`` node attribute and column 2 as ``description``.
        edges: Edge tuples handed to ``Graph.add_edges_from``.

    Returns:
        A ``(graph, descriptions)`` tuple where ``descriptions`` maps each
        node identifier to its column-2 value.
    """
    graph = nx.Graph()
    graph.add_nodes_from(node_names)
    graph.add_edges_from(edges)

    # Build both per-node lookup tables in a single pass over the rows.
    types_by_node = {}
    descriptions = {}
    for row in nodes:
        identifier = row[0]
        types_by_node[identifier] = row[1]
        descriptions[identifier] = row[2]

    for attribute, mapping in (('type', types_by_node),
                               ('description', descriptions)):
        nx.set_node_attributes(graph, mapping, attribute)

    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    vprint('Graph built: nodes={}, edges={}'.format(node_count, edge_count))
    return graph, descriptions

In [None]:
# Drug parsing utilities
import re

def extract_drugs(descr_dict):
    """Return the keys of ``descr_dict`` whose upper-cased value is 'DRUG'.

    Keys are returned in the mapping's iteration order.

    NOTE(review): run_pipeline passes the mapping built from column 2 of the
    node rows (stored as the 'description' attribute), not the column-1
    'type' mapping. Confirm that column 2 is where the 'DRUG' marker lives.
    """
    drug_list = []
    for name, label in descr_dict.items():
        if label.upper() == 'DRUG':
            drug_list.append(name)
    vprint(f"Extracted {len(drug_list)} drug nodes")
    return drug_list

def parse_and(drug_list):
    """Expand entries joined by ' AND ' into their individual components.

    For every entry containing the literal separator ' AND ', each stripped
    component is emitted first, followed by the stripped entry itself.
    Entries without the separator are emitted stripped. Duplicates are
    removed while keeping first-seen order.
    """
    seen = {}  # used as an insertion-ordered set
    for entry in drug_list:
        pieces = entry.split(' AND ')
        # More than one piece means the separator occurred at least once.
        if len(pieces) > 1:
            for piece in pieces:
                seen.setdefault(piece.strip())
        seen.setdefault(entry.strip())
    return list(seen)

def parse_semicolon(drug_list):
    """Expand entries joined by '; ' into their individual components.

    For every entry containing the literal separator '; ' (semicolon plus
    one space), each stripped component is emitted first, followed by the
    stripped entry itself. Entries without the separator are emitted
    stripped. Duplicates are removed while keeping first-seen order.
    """
    def _candidates():
        for entry in drug_list:
            if '; ' in entry:
                yield from (piece.strip() for piece in entry.split('; '))
            yield entry.strip()

    return list(dict.fromkeys(_candidates()))

def remove_parentheses(drug_list):
    """Strip whitespace-prefixed, non-nested parenthesised groups from names.

    Only entries that contain at least one such group contribute to the
    result; entries without one are left out. Duplicates are removed while
    keeping first-seen order.
    """
    group = re.compile(r'\s\([^()]*\)')
    stripped_names = {}  # used as an insertion-ordered set
    for entry in drug_list:
        without_groups, removed = group.subn('', entry)
        if removed:
            stripped_names.setdefault(without_groups)
    return list(stripped_names)

def merge_unique(*lists):
    """Concatenate the given iterables and drop duplicates.

    Items keep the order of their first appearance across the arguments,
    taken left to right.
    """
    return list(dict.fromkeys(item for seq in lists for item in seq))

In [None]:
# Parse drugs used in trials
def parse_drugs_in_trials(path):
    """Read a text file and return its upper-cased, space-separated tokens.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A list of upper-cased tokens in file order. Blank lines contribute
        nothing and duplicates are kept.
    """
    parsed = []
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip().upper()
            # Splitting on a single space yields '' for every run of
            # consecutive spaces (and for a blank line); drop those empty
            # tokens so they never end up in the drug list.
            parsed.extend(token for token in line.split(' ') if token)
    return parsed

In [None]:
# Main pipeline example
import time
def run_pipeline(node_file='COVID19_GDDS_nodes.csv',
                 edge_file='COVID19_GDDS_edges.csv',
                 trials_file='drugs_used.csv',
                 verbose=True):
    """Run the full load / parse / merge pipeline and return every stage.

    Args:
        node_file: CSV file passed to ``load_nodes``.
        edge_file: CSV file passed to ``load_edges``.
        trials_file: Text file passed to ``parse_drugs_in_trials``.
        verbose: Value assigned to the module-level ``VERBOSE`` flag; the
            flag keeps this value after the function returns.

    Returns:
        A dict with the keys ``G``, ``descr``, ``drugs``, ``parsed_and``,
        ``parsed_semicolon``, ``merged``, ``no_parentheses``, ``final`` and
        ``trials_parsed``.
    """
    global VERBOSE
    VERBOSE = verbose
    started = time.time()

    # Load the inputs and build the graph.
    nodes, node_names = load_nodes(node_file)
    edges = load_edges(edge_file)
    graph, descriptions = build_graph(node_names, nodes, edges)

    # Derive the candidate drug names through the successive parsers.
    drugs = extract_drugs(descriptions)
    split_on_and = parse_and(drugs)
    split_on_semicolon = parse_semicolon(drugs)
    merged = merge_unique(drugs, split_on_and, split_on_semicolon)
    without_parentheses = remove_parentheses(merged)
    final = merge_unique(merged, without_parentheses)

    trials = parse_drugs_in_trials(trials_file)
    vprint(f'Summary: initial drugs={len(drugs)}, parsed_and={len(split_on_and)}, parsed_semicolon={len(split_on_semicolon)}, final={len(final)}, trials_parsed={len(trials)}')

    # Small sanity check: report every final name that mentions camostat.
    camostat_hits = []
    for name in final:
        if 'CAMOSTAT' in name.upper():
            camostat_hits.append(name)
    if camostat_hits:
        vprint('Camostat hits: ' + ', '.join(camostat_hits))

    elapsed = time.time() - started
    vprint(f'Elapsed: {elapsed:.2f} s')

    return dict(
        G=graph,
        descr=descriptions,
        drugs=drugs,
        parsed_and=split_on_and,
        parsed_semicolon=split_on_semicolon,
        merged=merged,
        no_parentheses=without_parentheses,
        final=final,
        trials_parsed=trials,
    )

In [None]:
# Usage example
# Replace the paths with your own if needed (or prefix them with xxxxxxxxx)
res = run_pipeline(node_file='COVID19_GDDS_nodes.csv',
                   edge_file='COVID19_GDDS_edges.csv',
                   trials_file='drugs_used.csv',
                   verbose=True)

# Export the final list to a CSV file, one drug name per row
import csv
# newline='' is required for files handed to csv.writer: without it the
# writer's own '\r\n' row terminators are translated a second time on
# Windows and every row is followed by a blank line.
with open('/mnt/data/final_drug_list.csv', 'w', encoding='utf-8', newline='') as out:
    w = csv.writer(out)
    w.writerows([d] for d in res['final'])
vprint('final_drug_list.csv written to /mnt/data/')

## Conseils d'utilisation

- Pour exécuter pas à pas : exécute cellule par cellule.
- Pour exécuter le pipeline complet : exécute la cellule `Usage example`.
- Si tu veux stocker ailleurs, remplace `/mnt/data/` par `xxxxxxxxx/ton_dossier`.
- Si ton Jupyter n'affiche pas Mermaid, tu peux installer une extension (ex: jupyterlab-mermaid) ou copier le diagramme dans un rendu Mermaid en ligne.
