In [78]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import joblib
import os

# Réglages

DATA_PATH = "new_data.csv"
CHUNKSIZE = 500_000
RESULTS_CSV = "douane_test_results.csv"

def safe_div(a, b):
    return a / (b + 1e-9)

# Supprimer ancien fichier résultats
if os.path.exists(RESULTS_CSV):
    os.remove(RESULTS_CSV)

first_chunk = True
all_results = []

for chunk in pd.read_csv(DATA_PATH, delimiter=";", encoding="latin1", chunksize=CHUNKSIZE):
    df = chunk.copy()

    # Colonnes pertinentes
    cols_valeur = [
        "assessed_value_usd", "value_per_kg", "net_weight_kg", "gross_weight_kg",
        "AVG_FOB_x", "AVG_FOB_y", "AVG_TAX_x", "AVG_TAX_y",
        "AVG_FOB_3MTH", "AVG_TAX_3MTH", "DECL_FREQ_x", "DECL_FREQ_y"
    ]
    for c in cols_valeur:
        if c not in df.columns:
            df[c] = 0.0

    df_valeur = df[cols_valeur].copy()

In [79]:
"""# Feature engineering
df_valeur["val_per_net_weight"] = df["assessed_value_usd"] / (df["net_weight_kg"] + 1e-6) #prix unitaire par kg
df_valeur["val_vs_avg_fob_x"]   = df["assessed_value_usd"] / (df["AVG_FOB_x"] + 1e-6) #comparaison avec prix moyens de référence.
df_valeur["val_vs_avg_fob_y"]   = df["assessed_value_usd"] / (df["AVG_FOB_y"] + 1e-6)
df_valeur["gross_net_ratio"]    = (df["gross_weight_kg"] + 1e-6) / (df["net_weight_kg"] + 1e-6) #cohérence poids brut / net."""

# Feature engineering
 
df_valeur["val_per_net_weight"] = safe_div(df["assessed_value_usd"], df["net_weight_kg"])
df_valeur["val_vs_avg_fob_x"] = safe_div(df["assessed_value_usd"], df["AVG_FOB_x"])
df_valeur["val_vs_avg_fob_y"] = safe_div(df["assessed_value_usd"], df["AVG_FOB_y"])
df_valeur["gross_net_ratio"] = safe_div(df["gross_weight_kg"], df["net_weight_kg"])

# Nettoyage

df_valeur.replace([np.inf, -np.inf], np.nan, inplace=True)
df_valeur.fillna(0, inplace=True)

# Standardisation

if first_chunk:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_valeur)
    joblib.dump(scaler, "scaler.pkl")
else:
    scaler = joblib.load("scaler.pkl")
    X_scaled = scaler.transform(df_valeur)

In [80]:
# Isolation Forest

if first_chunk:
    iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)
    iso.fit(X_scaled)
    joblib.dump(iso, "isolation_forest_model.pkl")
else:
    iso = joblib.load("isolation_forest_model.pkl")
df["iso_pred"] = iso.predict(X_scaled)
df["iso_pred"] = df["iso_pred"].map({-1: 1, 1: 0})
df["iso_score"] = iso.decision_function(X_scaled)

In [81]:
# One-Class SVM

if first_chunk:
    ocsvm = OneClassSVM(kernel="rbf", gamma="auto", nu=0.05)
    ocsvm.fit(X_scaled)
    joblib.dump(ocsvm, "oneclass_svm_model.pkl")
else:
    ocsvm = joblib.load("oneclass_svm_model.pkl")
df["svm_pred"] = ocsvm.predict(X_scaled)
df["svm_pred"] = df["svm_pred"].map({-1: 1, 1: 0})
df["svm_score"] = ocsvm.decision_function(X_scaled)

In [82]:
# Autoencoder
 
input_dim = X_scaled.shape[1]
encoding_dim = min(6, input_dim // 2)

if first_chunk:
    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu", activity_regularizer=regularizers.l1(1e-5))(input_layer)
    decoder = Dense(input_dim, activation="linear")(encoder)
    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer="adam", loss="mse")
    autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=64, shuffle=True, verbose=0)
    autoencoder.save("autoencoder_model.h5")
else:
    autoencoder = load_model("autoencoder_model.h5")

reconstructions = autoencoder.predict(X_scaled, batch_size=64)
mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)
threshold = np.percentile(mse, 95)
df["ae_pred"] = (mse > threshold).astype(int)
df["ae_score"] = mse



[1m4956/4956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step


In [83]:
# Comparaison des modèles

results_chunk = {}
for model in ["iso_pred", "svm_pred", "ae_pred"]:
    anomalies = df[model].sum()
    prop = anomalies / len(df)
    try:
        sil = silhouette_score(X_scaled, df[model])
        db = davies_bouldin_score(X_scaled, df[model])
    except:
        sil, db = None, None
    results_chunk[model] = {
        "Anomalies détectées": anomalies,
        "Proportion": round(prop, 3),
        "Silhouette": sil,
        "Davies-Bouldin": db
    }
all_results.append(results_chunk)

In [84]:
# Sauvegarde résultats chunk par chunk

df.to_csv("RESULTS.csv", mode='a', index=False, header=first_chunk)
first_chunk = False

In [None]:
"""total_rows = len(df)  # nombre total de lignes dans ton dataset

final_results = {}

for model in ["iso_pred", "svm_pred", "ae_pred"]:
    anomalies_detectees = sum(chunk[model]["Anomalies détectées"] for chunk in all_results)
    
    final_results[model] = {
        "Anomalies détectées": anomalies_detectees,
        "Proportion": round(anomalies_detectees / total_rows, 3),
        "Proportion (%)": round(100 * anomalies_detectees / total_rows, 2)
    }

# Transformer en DataFrame
results_df = pd.DataFrame(final_results).T

# Trier par anomalies détectées
results_df = results_df.sort_values(by="Anomalies détectées", ascending=False)

# Afficher top 5
print("\n Comparaison des modèles (Top 5) :")
print(results_df.head(5))

# Sauvegarder le tableau en CSV
results_df.to_csv("comparaison_models.csv", sep=";", encoding="utf-8")
"""

# Analyse des anomalies détectées

# Seuils automatiques pour chaque modèle
iso_threshold = df["iso_score"].quantile(0.05)   # 5% les plus faibles
svm_threshold = df["svm_score"].quantile(0.05)   # 5% les plus faibles
ae_threshold  = df["ae_score"].quantile(0.95)    # 5% les plus élevées

df["iso_severity"] = np.where(df["iso_score"] < iso_threshold, "Forte", "Faible")
df["svm_severity"] = np.where(df["svm_score"] < svm_threshold, "Forte", "Faible")
df["ae_severity"]  = np.where(df["ae_score"]  > ae_threshold, "Forte", "Faible")


# Résumé final avec gravité

summary = {}
for model, pred_col, sev_col in [
    ("IsolationForest", "iso_pred", "iso_severity"),
    ("OneClassSVM", "svm_pred", "svm_severity"),
    ("AutoEncoder", "ae_pred", "ae_severity")
]:
    anomalies = df[df[pred_col] == 1]
    total = len(df)
    summary[model] = {
        "Anomalies détectées": len(anomalies),
        "Proportion (%)": round(len(anomalies) / total * 100, 2),
        "Forte suspicion": (anomalies[sev_col] == "Forte").sum(),
        "Faible suspicion": (anomalies[sev_col] == "Faible").sum()
    }

results_df = pd.DataFrame(summary).T
print("\n Résultats comparatifs avec gravité ")
print(results_df)

# Top 5 anomalies les plus graves (selon AutoEncoder par exemple)

top5_ae = df.sort_values("ae_score", ascending=False).head(5)
print("\n Top 5 anomalies AutoEncoder (fortement suspectes) ")
print(top5_ae[["instanceid", "nif_imp", "hs_cod", "assessed_value_usd", "ae_score", "ae_severity"]])



=== Résultats comparatifs avec gravité ===
                 Anomalies détectées  Proportion (%)  Forte suspicion  \
IsolationForest              15860.0             5.0          15860.0   
OneClassSVM                  15861.0             5.0          15860.0   
AutoEncoder                  15860.0             5.0          15860.0   

                 Faible suspicion  
IsolationForest               0.0  
OneClassSVM                   1.0  
AutoEncoder                   0.0  

=== Top 5 anomalies AutoEncoder (fortement suspectes) ===
         instanceid     nif_imp      hs_cod  assessed_value_usd      ae_score  \
4124705     2864316  0000012107  84522900.0            72867.25  28151.987167   
4303396     2940566  1000002414  38229000.0             1316.52  10940.106134   
4125281     2864516  1000018108  44209000.0              169.65   3825.234230   
4268181     2925017  3000009906  29349900.0              936.82   1897.212071   
4281813     2930368  2000000565  88023000.0         115