In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import joblib


# 1) Load the raw data (semicolon-separated CSV, Latin-1 encoded)
df = pd.read_csv(
    "new_data.csv",
    sep=";",
    encoding="latin1",
)

In [None]:
# 2) Build the feature matrix for species-fraud detection
#
# NOTE: a text pipeline on `item_desc` (lower-casing, TF-IDF with
# max_features=300, PCA down to at most 10 components) used to sit here inside
# a bare string literal, so it was never executed. It has been removed; the
# matrix X below is built from the numeric features only.

# Weight-based features tied to the species / product type.
# A missing net weight is approximated as 98 % of the gross weight.
df['net_weight_kg'] = df['net_weight_kg'].fillna(df['gross_weight_kg'] * 0.98)
# The 1e-6 offset avoids a division by zero when the net weight is 0.
df['gross_net_ratio'] = df['gross_weight_kg'] / (df['net_weight_kg'] + 1e-6)
# A missing item count is treated as a single item.
df['item_count'] = df['item_count'].fillna(1)
# item_count == 0 produces +/-inf here (NaN for 0/0); those values are
# neutralised below, before scaling.
df['weight_per_item'] = df['net_weight_kg'] / df['item_count']

# Frequency encoding of origin country and HS code: each category is replaced
# by its share of rows in this dataset, so rare values get small numbers.
df['origin_country'] = df['origin_country'].fillna('UNKNOWN').astype(str)
df['hs_cod'] = df['hs_cod'].fillna('UNK').astype(str)

origin_freq = df['origin_country'].value_counts(normalize=True).to_dict()
hs_freq = df['hs_cod'].value_counts(normalize=True).to_dict()
df['origin_country_freq'] = df['origin_country'].map(origin_freq)
df['hs_cod_freq'] = df['hs_cod'].map(hs_freq)

# Numeric features fed to the model
num_features = ['net_weight_kg', 'gross_net_ratio', 'weight_per_item',
                'origin_country_freq', 'hs_cod_freq']

# Turn infinities into NaN first, then fill every NaN with 0: fillna alone
# leaves +/-inf in place, and StandardScaler rejects infinite values.
df[num_features] = (
    df[num_features]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Feature matrix: one row per row of df, one column per entry of num_features
X = df[num_features].to_numpy()

# Standardise each column; the fitted scaler is kept for later cells
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# 3) Isolation Forest to detect anomalies
# contamination=0.05 sets the expected share of outliers; random_state pins
# the randomness so results are reproducible.
iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42)

# fit_predict returns -1 for outliers and 1 for inliers; recode that into a
# fraud flag where 1 = flagged as anomalous and 0 = normal.
raw_labels = iso.fit_predict(X_scaled)
df['fraude_spec'] = (raw_labels == -1).astype('int64')

# Anomaly score from decision_function: the lower the value, the more
# abnormal the row; negative values correspond to the flagged rows.
df['score_fraude_spec'] = iso.decision_function(X_scaled)

In [10]:
# 4) Persist the fitted model together with its scaler.
# Both objects go into one file as an (iso, scaler) tuple.
# NOTE(review): the frequency maps (origin_freq, hs_freq) are not stored here;
# scoring new data would presumably need them as well - confirm.
joblib.dump(
    value=(iso, scaler),
    filename="model_spec.joblib",
)

['model_spec.joblib']

In [11]:
# 5) Result: preview the identifier, the fraud flag and the anomaly score
preview_columns = ['instanceid', 'fraude_spec', 'score_fraude_spec']
df[preview_columns].head()

Unnamed: 0,instanceid,fraude_spec,score_fraude_spec
0,33095,0,0.086913
1,33096,1,-0.073261
2,33096,0,0.028229
3,33097,1,-0.083298
4,33098,0,0.068808
