In [1]:
import pandas as pd
import numpy as np

In [2]:
# Postal codes of the metropolitan areas of interest.
# The list is turned into a set for membership tests and is also written to the
# output CSV further down, so every code must appear exactly once here.
code_postal_ville = [
    # Île-de-France (75xxx, 92xxx, 93xxx and 94xxx codes)
    "75000",
    "75001", "75002", "75003", "75004", "75005", "75006", "75007", "75008", "75009", "75010",
    "75011", "75012", "75013", "75014", "75015", "75016", "75017", "75018", "75019", "75020",
    "92100", "92110", "92120", "92130", "92140", "92150", "92160", "92170", "92190",
    "92200", "92210", "92220", "92230", "92240", "92250", "92260", "92270", "92290",
    "92300", "92310", "92320", "92330", "92340", "92350", "92360", "92370", "92380", "92390",
    "92400", "92410", "92420", "92430", "92440", "92450", "92460", "92470", "92480", "92490",
    "92500", "92510", "92520", "92530", "92540", "92550", "92560", "92570", "92580", "92590",
    "92600", "92610", "92620", "92630", "92640", "92650", "92660", "92670", "92680", "92690",
    "92700", "92710", "92720", "92730", "92740", "92750", "92760", "92770", "92780", "92790",
    "92800", "92810", "92820", "92830", "92840", "92850", "92860", "92870", "92880", "92890",
    "92900", "92910", "92920", "92930", "92940", "92950", "92960", "92970", "92980", "92990",
    "93000", "93100", "93200", "93300", "93400", "93500", "93600", "93700", "93800", "93900",
    "94000", "94100", "94200", "94300", "94400", "94500", "94600", "94700", "94800", "94900",

    # Marseille
    "13000", "13001", "13002", "13003", "13004", "13005", "13006", "13007", "13008", "13009",
    "13010", "13011", "13012", "13013",

    # Aix-en-Provence
    "13080", "13100", "13190", "13290", "13540", "13590", "13700", "13800", "13990",

    # Lyon
    "69000", "69001", "69002", "69003", "69004", "69005", "69006", "69007", "69008", "69009",
    "69100", "69200", "69300", "69400", "69500", "69600", "69700", "69800", "69900",

    # Lille
    "59000", "59100", "59200", "59300", "59400", "59500", "59600", "59700", "59800", "59900",

    # Bordeaux
    "33000", "33100", "33200", "33300", "33400", "33500", "33600", "33700", "33800", "33900",

    # Toulouse
    "31000", "31100", "31200", "31300", "31400", "31500", "31600", "31700", "31800", "31900",

    # Nice
    "06000", "06100", "06200", "06300", "06400", "06500", "06600", "06700", "06800", "06900",

    # Nantes
    "44000", "44100", "44200", "44300", "44400", "44500", "44600", "44700", "44800", "44900",

    # Strasbourg
    "67000", "67100", "67200", "67300", "67400", "67500", "67600", "67700", "67800", "67900",

    # Montpellier
    "34000", "34100", "34200", "34300", "34400", "34500", "34600", "34700", "34800", "34900",

    # Rennes (this row used to be listed twice; the second copy was removed)
    "35000", "35100", "35200", "35300", "35400", "35500", "35600", "35700", "35800", "35900",

    # Grenoble
    "38000", "38100", "38200", "38300", "38400", "38500", "38600", "38700", "38800", "38900",

    # Dijon
    "21000", "21100", "21200", "21300", "21400", "21500", "21600", "21700", "21800", "21900",

    # Angers
    "49000", "49100", "49200", "49300", "49400", "49500", "49600", "49700", "49800", "49900",

    # Le Havre
    "76000", "76100", "76200", "76300", "76400", "76500", "76600", "76700", "76800", "76900",

    # Saint-Étienne
    "42000", "42100", "42200", "42300", "42400", "42500", "42600", "42700", "42800", "42900",
]

# === 1) Input file locations ===
communes_path = "../Data/communes-france-2025.csv"
adj_path = "../Data/communes_adjacentes_2022_toutes.csv"

# === 2) Load both files with every column read as text ===
# dtype=str keeps codes such as "06000" intact instead of parsing them as numbers.
communes = pd.read_csv(communes_path, dtype=str)
adj = pd.read_csv(adj_path, dtype=str)

# === 3) Fail fast if a required column is missing ===
# communes: INSEE code, commune name, postal codes
if not {"code_insee", "nom_standard", "codes_postaux"}.issubset(communes.columns):
    raise ValueError("Le fichier communes doit contenir: code_insee, nom_standard, codes_postaux")

# adjacency table: source INSEE code, neighbouring INSEE codes
if not {"insee", "insee_voisins"}.issubset(adj.columns):
    raise ValueError("Le fichier adjacences doit contenir: insee, insee_voisins")

# === 4) Small helper to split '|'-separated lists ===
def split_pipe(value):
    """Split a '|'-separated cell into a list of trimmed, non-empty items.

    Missing values (NaN/None) and blank strings yield an empty list.
    Non-string scalars are converted with ``str`` before splitting.
    """
    if pd.isna(value):
        return []
    trimmed_parts = (part.strip() for part in str(value).strip().split("|"))
    # Empty fragments (blank input, doubled or trailing separators) are dropped.
    return [part for part in trimmed_parts if part]

# === 5) Prepare the 'communes' table ===
# Keep only the useful columns, then produce one row per postal code.
communes_simple = communes.loc[:, ["code_insee", "nom_standard", "codes_postaux"]].copy()
communes_simple["liste_cp"] = communes_simple["codes_postaux"].apply(split_pipe)
communes_cp = (
    communes_simple
    .explode("liste_cp", ignore_index=True)  # one row per postal code
    .rename(columns={"nom_standard": "commune", "liste_cp": "code_postal"})
)

# === 6) Prepare the 'adjacency' table ===
# Produce one row per neighbouring INSEE code. The helper column is added to
# `adj` itself, so `adj` keeps a 'voisin_insee' column after this step.
adj["voisin_insee"] = adj["insee_voisins"].apply(split_pipe)
adj_long = (
    adj
    .explode("voisin_insee", ignore_index=True)
    .dropna(subset=["voisin_insee"])  # exploding an empty list gives NaN: drop those rows
)

# === 7) Join to fetch the details of the neighbouring commune (the "banlieue") ===
# For each voisin_insee we look up its name and its postal codes.
# communes_simple still carries the helper column 'liste_cp'; it rides along
# through the merge and is discarded by the column selection in step 9.
banlieue_infos = communes_simple.rename(columns={
    "code_insee": "code_insee_banlieue",
    "nom_standard": "banlieue",
    "codes_postaux": "codes_postaux_banlieue",
})

adj_avec_banlieue = pd.merge(
    adj_long,
    banlieue_infos,
    how="left",
    left_on="voisin_insee",
    right_on="code_insee_banlieue",
)

# === 8) Join to fetch the source commune and its (exploded) postal code ===
# The source INSEE code (adj["insee"]) is matched against communes_cp["code_insee"].
final = pd.merge(
    adj_avec_banlieue,
    communes_cp[["code_insee", "commune", "code_postal"]],
    how="left",
    left_on="insee",
    right_on="code_insee",
)

# === 9) Keep the final columns, then drop incomplete rows and duplicates ===
colonnes_finales = [
    "code_postal",             # postal code of the source commune
    "commune",                 # name of the source commune
    "insee",                   # INSEE code of the source commune
    "banlieue",                # name of the neighbouring commune
    "code_insee_banlieue",     # INSEE code of the neighbouring commune
    "codes_postaux_banlieue",  # postal code(s) of the neighbour, '|'-separated
]
banlieues_df = (
    final[colonnes_finales]
    .rename(columns={"insee": "code_insee_source"})
    .dropna(subset=["code_postal", "commune", "banlieue"])  # rows missing the essentials
    .drop_duplicates()
)

# === Restrict to source postal codes starting with one of the listed department prefixes ===
# NOTE(review): the prefixes below (75, 13, 59, 49, 80, 44, 38) do not line up with
# code_postal_ville: 92/93/94, 69, 33, 31, 06, 67, 34, 35, 21, 76 and 42 are filtered
# out here, and no 80xxx code is in the list. Confirm this is intended.
garde_departement = banlieues_df["code_postal"].str.match(r"^(75|13|59|49|80|44|38)")
banlieues_df = banlieues_df[garde_departement]

# --- Normalise the postal codes ---
# Take the first run of 2 to 5 digits, then left-pad with zeros to 5 characters.
cp_chiffres = banlieues_df["code_postal"].astype(str).str.extract(r"(\d{2,5})", expand=False)
banlieues_df["code_postal"] = cp_chiffres.fillna("").str.zfill(5)

# --- Keep only the rows whose SOURCE postal code is in the reference list ---
cp_set = set(code_postal_ville)
source_dans_liste = banlieues_df["code_postal"].isin(cp_set)
extrait_source = banlieues_df[source_dans_liste].copy()

# --- Also include rows whose NEIGHBOUR owns a postal code from the list ---
def banlieue_a_cp_dans_liste(cell, cp_set):
    """Return True if at least one '|'-separated postal code in `cell` is in `cp_set`.

    Each code is trimmed and left-padded with zeros to 5 characters before the
    lookup. Missing or blank cells return False.
    """
    if pd.isna(cell):
        return False
    texte = str(cell)
    if not texte.strip():
        return False
    for morceau in texte.split("|"):
        if morceau.strip().zfill(5) in cp_set:
            return True
    return False

# Rows whose neighbouring commune owns at least one postal code from cp_set.
mask_banlieue = banlieues_df["codes_postaux_banlieue"].apply(
    lambda x: banlieue_a_cp_dans_liste(x, cp_set)
)
extrait_banlieue = banlieues_df[mask_banlieue].copy()

# extrait_banlieue holds one row per (source, neighbour) pair, so the same source
# postal code shows up many times. Extending the list with the raw column used to
# fill the output file with repeats. Merge in place (later code may still read
# code_postal_ville) and keep only the first occurrence of each code, in order.
code_postal_ville[:] = list(
    dict.fromkeys(code_postal_ville + extrait_banlieue["code_postal"].tolist())
)
dfliste = pd.Series(code_postal_ville)

# One postal code per line; the Series is unnamed, so pandas writes its default header.
outfile = "../DataCleaned/metropoleetbanlieues_liste.csv"
dfliste.to_csv(outfile, index=False, encoding="utf-8")


