In [59]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix
from numpy import where
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
PROJECT_PATH = "C:/Users/HP/Documents/PIP2022/ProjectInterPromo/"

In [11]:
def evaluation(pred: np.ndarray, y: np.ndarray, display=True):
    """[summary]

    Args:
        pred (np.ndarray): [description]
    """
    confusion_mat = confusion_matrix(y, pred)
    
    tn, fp, fn, tp = confusion_mat.ravel()
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * precision * recall / (precision + recall)
    
    if display:
        print("TN / FP \nFN / TP")
        print(confusion_mat)
        print(f"Precision : {precision}")
        print(f"Recall : {recall}")
        print(f"F1_Score : {f1_score}")
    
    return tn, fp, fn, tp

In [33]:
# Lecture du fichier de donnée
data_v0 = pd.read_csv(PROJECT_PATH + "data/SORTIE/Donnees_v0/Donnees_V0_NAN_supprimes.csv")

print("Nombre de fraude :", sum(data_v0["TOP_FRAUDE"]))
print("Nombre de données : ", len(data_v0))
print("Ratio : ", sum(data_v0["TOP_FRAUDE"]) / len(data_v0) * 100)

# Suppression des variables qualitative
quali_col = []
count = 0
for i, name in zip(data_v0.dtypes, data_v0.columns):
    if i == "object":
        quali_col.append(name)

data_quanti = data_v0.drop(columns=quali_col + ["Unnamed: 0"])

# 
data_quanti = data_quanti.sample(10000)

print("------------ Sur un echantillion ------------")
print("Nombre de fraude :", sum(data_quanti["TOP_FRAUDE"]))
print("Nombre de données : ", len(data_quanti))
print("Ratio : ", sum(data_quanti["TOP_FRAUDE"]) / len(data_quanti) * 100)

Nombre de fraude : 1479
Nombre de données :  644348
Ratio :  0.22953435100287423
------------ Sur un echantillion ------------
Nombre de fraude : 19
Nombre de données :  10000
Ratio :  0.19


# Modéle SVM

In [61]:
X = data_quanti.drop(columns = ["TOP_FRAUDE", "TOP_FRAUDE_CARTE", "TOP_FRAUDE_VIREMENT"])
model = OneClassSVM(kernel = 'rbf', gamma = 0.01, nu = 0.20/100, verbose = True).fit(X)

[LibSVM]

In [62]:
y_pred = model.predict(X)
len(y_pred)

10000

In [63]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

In [64]:
set(y_pred)

{0, 1}

In [48]:
tn, fp, fn, tp = evaluation(y_pred, data_quanti["TOP_FRAUDE"], display=True)

# L'air sous la courbe
print("true negative : ", tn)
print("false positive : ", fp)
print("false negative : ", fn)
print("true positive : ", tp)

TN / FP 
FN / TP
[[9439  542]
 [  17    2]]
Precision : 0.003676470588235294
Recall : 0.10526315789473684
F1_Score : 0.007104795737122557
true negative :  9439
false positive :  542
false negative :  17
true positive :  2


In [43]:
# filter outlier index
outlier_index = where(y_pred == 1)

# filter outlier values
outlier_values = data_quanti.iloc[outlier_index]
outlier_values

Unnamed: 0,TOP_FRAUDE,TOP_FRAUDE_CARTE,TOP_FRAUDE_VIREMENT,TOP_E_RELEVE,MESSAGERIE_WEB_ACTIVE,FIAB_TEL_DOMICILE,FIAB_TEL_TRAVAIL,FIAB_TEL_PORTABLE,FIAB_EMAIL,REFUS_SEA,...,top_produit_Credit,top_produit_Epargne,top_produit_Services,mt_factu_ttc_M,mt_factu_ttc_M6,mt_reduc_ttc_M,mt_reduc_ttc_M6,mt_rist_ttc_M,mt_rist_ttc_M6,Flag_bq_principale
382404,0,0,0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,14.0,6.0,0.0,0.0,0.0,0.0,1.0
406110,0,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,6.0,6.0,0.0,0.0,0.0,0.0,1.0
599534,0,0,0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,14.0,18.0,0.0,0.0,0.0,0.0,1.0
352262,0,0,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,12.0,6.0,0.0,0.0,0.0,0.0,1.0
79411,0,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,9.0,10.0,0.0,0.0,0.0,0.0,1.0
580019,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,8.0,8.0,0.0,0.0,0.0,0.0,1.0
237408,0,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,6.0,6.0,0.0,0.0,0.0,0.0,1.0
402141,0,0,0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0
417182,0,0,0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,10.0,9.0,0.0,0.0,0.0,0.0,1.0
107746,0,0,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,14.0,55.0,0.0,0.0,0.0,43.0,1.0


Index(['TOP_E_RELEVE', 'MESSAGERIE_WEB_ACTIVE', 'FIAB_TEL_DOMICILE',
       'FIAB_TEL_TRAVAIL', 'FIAB_TEL_PORTABLE', 'FIAB_EMAIL', 'REFUS_SEA',
       'NBJ_CNT_AGENCE_DEPOT_6M', 'NBJ_CNT_AGENCE_VENTE_6M',
       'NBJ_CNT_RDV_AGENCE_6M'],
      dtype='object')