In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import joblib


# Load the input CSV (semicolon-separated, Latin-1 encoded) into the working frame.
df = pd.read_csv(
    "new_data.csv",
    sep=";",
    encoding="latin1",
)
# Show the first rows as the cell output.
df.head()

In [None]:
# Feature engineering
# Each derived column is numerator / (denominator + eps); the small eps guards
# against zero denominators.
eps = 1e-6
ratio_specs = {
    "gross_net_ratio": ("gross_weight_kg", "net_weight_kg"),
    "weight_per_item": ("net_weight_kg", "item_count"),
    "val_per_net_weight": ("assessed_value_usd", "net_weight_kg"),
}
for ratio_name, (numerator_col, denominator_col) in ratio_specs.items():
    df[ratio_name] = df[numerator_col] / (df[denominator_col] + eps)

# Relevant columns: the three ratios built above plus two columns read
# directly from the input frame.
features = [
    "gross_net_ratio", "weight_per_item", "val_per_net_weight",
    "hs_code_diversity", "value_per_kg",
]

# Model input: infinities are turned into NaN, then every NaN becomes 0.
feature_frame = df[features]
X = (
    feature_frame
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [2]:
# Standardisation: learn the scaling parameters from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [3]:
# Isolation Forest
# contamination=0.05 sets the expected share of outliers; the fixed
# random_state keeps the fitted forest reproducible.
iso = IsolationForest(random_state=42, contamination=0.05)
iso_labels = iso.fit_predict(X_scaled)
# fit_predict returns -1 / 1; recode to 1 (was -1) and 0 (was 1) so that
# 1 marks a flagged row.
df["iso_pred"] = pd.Series(iso_labels).map({-1: 1, 1: 0}).to_numpy()

In [None]:
# One-Class SVM
# RBF kernel with gamma="auto" and nu=0.05.
ocsvm = OneClassSVM(nu=0.05, kernel="rbf", gamma="auto")
svm_labels = ocsvm.fit_predict(X_scaled)
# fit_predict returns -1 / 1; recode to 1 (was -1) and 0 (was 1) so that
# 1 marks a flagged row.
df["svm_pred"] = pd.Series(svm_labels).map({-1: 1, 1: 0}).to_numpy()

In [None]:
# Autoencoder
# A single-hidden-layer autoencoder is trained to reproduce the standardised
# features; rows it reconstructs poorly are flagged as anomalies.
input_dim = X_scaled.shape[1]
encoding_dim = 3

# Autoencoder network
input_layer = Input(shape=(input_dim,))
# Bottleneck layer with an L1 penalty on its activations.
encoder = Dense(encoding_dim, activation="relu", activity_regularizer=regularizers.l1(1e-5))(input_layer)
# The reconstruction targets are StandardScaler outputs: zero-centred and not
# bounded to [0, 1]. A sigmoid output can only emit values in (0, 1), so it
# could never reproduce negative or large standardised values and would bias
# the reconstruction error. A linear output covers the full target range.
decoder = Dense(input_dim, activation="linear")(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

autoencoder.compile(optimizer="adam", loss="mse")
# NOTE(review): no TensorFlow seed is set in this cell, so weight initialisation
# and batch shuffling (and therefore the flagged rows) may vary between runs
# unless a seed is set elsewhere — confirm.
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, shuffle=True, verbose=0)

# Reconstruction error: per-row mean squared difference between input and output.
# verbose=0 keeps prediction as quiet as training.
reconstructions = autoencoder.predict(X_scaled, verbose=0)
mse = np.mean(np.square(X_scaled - reconstructions), axis=1)
# Rows whose error exceeds the 95th percentile are flagged.
threshold = np.percentile(mse, 95)
df["ae_pred"] = (mse > threshold).astype(int)  # 1 = anomaly

In [None]:
# Metrics comparison
# For each detector, record how many rows it flagged, the flagged proportion,
# and two clustering scores computed by using the 0/1 predictions as labels.
results = {}

for model in ["iso_pred", "svm_pred", "ae_pred"]:
    anomalies = df[model].sum()
    prop = anomalies / len(df)
    try:
        sil = silhouette_score(X_scaled, df[model])
        db = davies_bouldin_score(X_scaled, df[model])
    except Exception as exc:
        # Best effort: if the scores cannot be computed, the comparison still
        # completes with empty scores. Catching Exception (not a bare except)
        # lets KeyboardInterrupt/SystemExit propagate, and the reason is
        # reported instead of being silently discarded.
        print(f"{model}: clustering scores unavailable ({exc})")
        sil, db = None, None
    results[model] = {
        "Anomalies détectées": anomalies,
        "Proportion": round(prop, 3),
        "Silhouette": sil,
        "Davies-Bouldin": db
    }

In [None]:
# Display the comparison table: one row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Save the frame, including the prediction columns, without the index.
output_path = "fraude_quantite_models.csv"
df.to_csv(output_path, index=False)