In [163]:
import pandas as pd
import numpy as np

from math import radians, sin, cos, sqrt, atan2

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [164]:
df = pd.read_csv("src/fraud test.csv")
df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719,555719.0,555719,555719,555719.0,555719,555719,555719,555719,...,555719.0,555719.0,555719.0,555719,555719,555719,555719.0,555719.0,555719.0,555719.0
unique,,226976,,693,14,,341,471,2,924,...,,,,478,910,555719,,,,
top,,15/12/2020 21:26,,fraud_Kilback LLC,gas_transport,,Christopher,Smith,F,444 Robert Mews,...,,,,Film/video editor,23/03/1977,2da90c7d74bd46a0caf3777415b3ebd3,,,,
freq,,16,,1859,56370,,11443,12146,304886,1474,...,,,,4119,2408,1,,,,
mean,277859.0,,4.178387e+17,,,69.39281,,,,,...,38.543253,-90.231325,88221.89,,,,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,,1.309837e+18,,,156.745941,,,,,...,5.061336,13.72178,300390.9,,,,5201104.0,5.095829,13.733071,0.062008
min,0.0,,60416210000.0,,,1.0,,,,,...,20.0271,-165.6723,23.0,,,,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,,180043000000000.0,,,9.63,,,,,...,34.6689,-96.798,741.0,,,,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,,3521420000000000.0,,,47.29,,,,,...,39.3716,-87.4769,2408.0,,,,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,,4635330000000000.0,,,83.01,,,,,...,41.8948,-80.1752,19685.0,,,,1385867000.0,41.954163,-80.264637,0.0


In [165]:
# Séparation des données en deux dataset selon le coté frauduleux de la transaction
df_frauds = df[df['is_fraud'] == 1]
sample = df[df['is_fraud'] == 0]

# Échantillonnage du dataset de transactions non frauduleuses pour avoir la même taille que le dataset de transactions frauduleuses
dsam = sample.sample(2145, random_state=0)

# Fusion des deux datasets
frames = [df_frauds, dsam]
result = pd.concat(frames)

# Mélange des données dans le dataset
result = result.sample(frac = 1)
result.shape

(4290, 23)

In [166]:
result.describe(include="all")

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
count,4290.0,4290,4290.0,4290,4290,4290.0,4290,4290,4290,4290,...,4290.0,4290.0,4290.0,4290,4290,4290,4290.0,4290.0,4290.0,4290.0
unique,,4237,,680,14,,309,430,2,798,...,,,,445,786,4290,,,,
top,,30/06/2020 23:16,,fraud_Kilback LLC,grocery_pos,,Christopher,Williams,F,444 Robert Mews,...,,,,Science writer,22/09/1997,140105375e97c577a1a0d363e4cedcd4,,,,
freq,,3,,26,693,,119,106,2352,24,...,,,,44,24,1,,,,
mean,261787.03007,,3.810071e+17,,,299.209186,,,,,...,38.783911,-90.376413,82394.83,,,,1380186000.0,38.784218,-90.380031,0.5
std,149535.273281,,1.261528e+18,,,374.387217,,,,,...,5.025407,13.864927,276430.7,,,,4910323.0,5.052313,13.871188,0.500058
min,370.0,,60416210000.0,,,1.02,,,,,...,20.0271,-165.6723,23.0,,,,1371824000.0,19.161782,-166.033127,0.0
25%,135552.0,,38859500000000.0,,,20.39,,,,,...,34.9298,-96.7456,861.0,,,,1375913000.0,34.975099,-96.777858,0.0
50%,259887.0,,3500170000000000.0,,,88.58,,,,,...,39.5994,-87.8235,2676.0,,,,1380066000.0,39.589137,-87.703727,0.5
75%,385890.5,,4452370000000000.0,,,467.1275,,,,,...,42.0765,-80.158,15647.0,,,,1384815000.0,42.049737,-80.108471,1.0


In [167]:
# Définition des colonnes à enlever pour le modèle de clustering
to_drop = ['cc_num', 'first', 'last', 'street', 'city' ,'state', 'zip', 'trans_num']
clean_data = result.drop(columns=to_drop)

In [168]:
# Définition de la fonction de calcul de la distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371.0  # Rayon de la Terre en kilomètres

    # Convertir les latitudes et longitudes en radians
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    # Différences de latitude et longitude
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Formule haversine
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    # Distance en kilomètres
    distance = R * c
    
    return distance
    # Assurez-vous que cette fonction est correctement indentée ici

# Calcul de la distance pour chaque ligne du DataFrame
clean_data['distance'] = clean_data.apply(lambda row: haversine_distance(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

# Affichage des premières lignes du DataFrame avec la nouvelle colonne
print(clean_data.head())

        Unnamed: 0 trans_date_trans_time               merchant  \
18509        18509      27/06/2020 22:55      fraud_Spencer PLC   
463092      463092      11/12/2020 17:13  fraud_Kirlin and Sons   
401268      401268      25/11/2020 18:57      fraud_Berge-Hills   
221271      221271      07/09/2020 22:36      fraud_Kerluke Inc   
122277      122277      03/08/2020 04:44     fraud_Miller-Hauck   

             category     amt gender      lat     long  city_pop  \
18509   entertainment  614.72      F  45.7205 -98.5534        63   
463092  personal_care   43.45      F  34.0326 -82.2027      1523   
401268      kids_pets   21.55      M  43.4512 -71.4890      7430   
221271       misc_net  788.07      M  42.2619 -94.5566       695   
122277    grocery_pos   61.95      M  44.4477 -93.4252      5211   

                                           job         dob   unix_time  \
18509                        Systems developer  30/10/1969  1372373713   
463092  Research scientist (physical sci

In [169]:
clean_data.describe(include="all")

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud,distance
count,4290.0,4290,4290,4290,4290.0,4290,4290.0,4290.0,4290.0,4290,4290,4290.0,4290.0,4290.0,4290.0,4290.0
unique,,4237,680,14,,2,,,,445,786,,,,,
top,,30/06/2020 23:16,fraud_Kilback LLC,grocery_pos,,F,,,,Science writer,22/09/1997,,,,,
freq,,3,26,693,,2352,,,,44,24,,,,,
mean,261787.03007,,,,299.209186,,38.783911,-90.376413,82394.83,,,1380186000.0,38.784218,-90.380031,0.5,76.41612
std,149535.273281,,,,374.387217,,5.025407,13.864927,276430.7,,,4910323.0,5.052313,13.871188,0.500058,29.209701
min,370.0,,,,1.02,,20.0271,-165.6723,23.0,,,1371824000.0,19.161782,-166.033127,0.0,1.959344
25%,135552.0,,,,20.39,,34.9298,-96.7456,861.0,,,1375913000.0,34.975099,-96.777858,0.0,56.089232
50%,259887.0,,,,88.58,,39.5994,-87.8235,2676.0,,,1380066000.0,39.589137,-87.703727,0.5,78.884395
75%,385890.5,,,,467.1275,,42.0765,-80.158,15647.0,,,1384815000.0,42.049737,-80.108471,1.0,99.072109


In [170]:
mean_distance_fraudulent = clean_data.loc[clean_data["is_fraud"] == 1, "distance"].mean()

print("Moyenne de la distance pour les transactions frauduleuses :", mean_distance_fraudulent)

Moyenne de la distance pour les transactions frauduleuses : 76.21390498710234


In [171]:
# Sélection des features nécessaires pour le clustering
data_geo = clean_data[["merch_lat", "merch_long", "is_fraud"]]

In [172]:
# Première répartition des transactions

fig = px.scatter_mapbox(
        data_geo,
        lat="merch_lat",
        lon="merch_long",
        color="is_fraud",
        mapbox_style="carto-positron"
)

fig.show()

# Standardisation des données

In [173]:
num_feat = [0, 1, 2]
num_trans = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_trans, num_feat)])

X= preprocessor.fit_transform(data_geo)
print(X[0:5, :])

[[ 1.21473115 -0.6405943   1.        ]
 [-0.79824683  0.55277806 -1.        ]
 [ 0.754653    1.30109585  1.        ]
 [ 0.72773902 -0.26499423  1.        ]
 [ 1.25403733 -0.29055781 -1.        ]]


In [174]:
# Création du DBSCAN
db = DBSCAN(eps=0.7, min_samples=200, metric="manhattan")
db.fit(X)

data_geo['cluster'] = db.labels_



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [175]:
np.unique(db.labels_, return_counts=True)

(array([-1,  0,  1], dtype=int64), array([ 838, 1701, 1751], dtype=int64))

In [176]:
fig = px.scatter_mapbox(
        data_geo[data_geo.cluster != -1],
        lat="merch_lat",
        lon="merch_long",
        color="is_fraud",
        mapbox_style="carto-positron"
)

fig.show()

In [177]:
import plotly.express as px

# Créer une palette de couleurs personnalisée avec le rouge pour les valeurs de is_fraud égales à 1
color_scale = px.colors.diverging.RdYlBu_r

# Modifier la couleur pour les valeurs de is_fraud égales à 1 en rouge
color_scale[0] = 'blue'

# Créer la carte avec les couleurs modifiées
fig = px.scatter_mapbox(
    data_geo.loc[data_geo.cluster != -1, :],
    lat="merch_lat",
    lon="merch_long",
    color="is_fraud",
    mapbox_style="carto-positron",
    color_continuous_scale=color_scale
)

# Afficher la carte
fig.show()


In [178]:
df_fr = data_geo[data_geo['is_fraud']==1]

## Silhouette

In [179]:
from sklearn.metrics import silhouette_score

# Computer mean silhouette score
sil = []
k = []

## Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels
for i in range (2,31):
    kmeans = KMeans(n_clusters= i, random_state = 0, n_init = 'auto')
    kmeans.fit(df_fr)
    sil.append(silhouette_score(df_fr, kmeans.predict(df_fr)))
    k.append(i)

# Create a data frame
cluster_scores=pd.DataFrame(sil)
k_frame = pd.Series(k)

# Create figure
fig = px.bar(data_frame=cluster_scores,
             x=k,
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)
fig.show() # if using workspace

In [180]:
best_k_index = np.argmax(sil)

# Valeur de k correspondant au score de silhouette le plus élevé
best_k = k[best_k_index]  # Remplacer par le nombre optimal de clusters déterminé par votre analyse de silhouette
kmeans = KMeans(n_clusters=best_k, random_state=0)
kmeans.fit(df_fr)

# Prédire les clusters pour chaque échantillon
df_fr['cluster'] = kmeans.labels_

# Définir le centre de la carte et le niveau de zoom pour afficher toutes les données
center_longitude = df_fr['merch_long'].mean()  # Utiliser la moyenne des longitudes comme centre
center_latitude = df_fr['merch_lat'].mean()  # Utiliser la moyenne des latitudes comme centre
zoom_level = 2  # Niveau de zoom pour afficher toutes les données

# Créer une carte interactive avec Plotly Express en spécifiant le centre et le zoom
fig = px.scatter_mapbox(df_fr, lon='merch_long', lat='merch_lat', color='cluster', 
                        mapbox_style="carto-positron", title='Clusters des données géographiques selon la méthode Silhouette',
                        center=dict(lon=center_longitude, lat=center_latitude), zoom=zoom_level)

# Afficher la carte interactive
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



On peut voir grâce à la méthode Silhouette que la répartition en clusters s'est faite de manière arbitraire en coupant les données en deux. Cependant ça ne nous permet pas de déduire quelque chose 

## ELBOW

In [181]:
# Instanciate KMeans with k=3 and initialisation with k-means++
# You should always use k-means++ as it alleviate the problem of local minimum convergence
kmeans = KMeans(n_clusters=10, random_state=0)

# Fit kmeans to our dataset
kmeans.fit(df_fr)
# Let's create a loop that will collect the Within-sum-of-square (wcss) for each value K
# Let's use .inertia_ parameter to get the within sum of square value for each value K
wcss =  []
k = []
for i in range (1,21):
    kmeans = KMeans(n_clusters= i, random_state = 0, n_init = 'auto')
    kmeans.fit(df_fr)
    wcss.append(kmeans.inertia_)
    k.append(i)
# Create DataFrame
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Create figure
fig= px.line(
    wcss_frame,
    x=k_frame,
    y=wcss_frame.iloc[:,-1]
)

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)

# Render
#fig.show(renderer="notebook")
fig.show() # if using workspace

In [182]:
# Recherche du meilleur k (nombre optimal de clusters)
diffs = np.diff(wcss)  # Calculer les différences entre les valeurs de l'inertie consécutives
diffs_ratio = diffs[:-1] / diffs[1:]  # Calculer les ratios de différences

# Trouver l'indice du point où le ratio de différence commence à diminuer significativement
elbow_index = np.argmax(diffs_ratio < 0.5)  # Choisir un seuil approprié pour déterminer le point du coude

# Valeur de k correspondant au point du coude
best_k_elbow = k[elbow_index]

kmeans = KMeans(n_clusters=best_k, random_state=0)
kmeans.fit(df_fr)

# Prédire les clusters pour chaque échantillon
df_fr['cluster'] = kmeans.labels_

# Définir le centre de la carte et le niveau de zoom pour afficher toutes les données
center_longitude = df_fr['merch_long'].mean()  # Utiliser la moyenne des longitudes comme centre
center_latitude = df_fr['merch_lat'].mean()  # Utiliser la moyenne des latitudes comme centre
zoom_level = 2  # Niveau de zoom pour afficher toutes les données

# Créer une carte interactive avec Plotly Express en spécifiant le centre et le zoom
fig = px.scatter_mapbox(df_fr, lon='merch_long', lat='merch_lat', color='cluster', 
                        mapbox_style="carto-positron", title='Clusters des données géographiques selon la méthode Elbow',
                        center=dict(lon=center_longitude, lat=center_latitude), zoom=zoom_level)

# Afficher la carte interactive
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



On peut voir grâce à la méthode Elbow, que la répartition en cluster n'est pas optimale et peut s'avérer anarchique. Bien que les zones de villes ne soient découpées entre les deux clusters, cette méthode manque de précision

## DBSCAN

In [190]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) > 1:  # Vérifier s'il y a plus d'un cluster
        return silhouette_score(X, labels)
    else:
        return 0.0

# Définir la grille de paramètres à explorer
param_grid = {
    'eps': [0.005, 0.05, 0.5, 0.9, 1, 2, 3, 4],
    'min_samples': [3, 4, 5, 10, 20, 30]
}

# Créer un objet DBSCAN
dbscan = DBSCAN()

# Créer un objet GridSearchCV
grid_search = GridSearchCV(dbscan, param_grid, cv=5, scoring=silhouette_scorer)

# Exécuter la recherche de grille
grid_search.fit(df_fr)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'eps': 2, 'min_samples': 3}


In [191]:
db = DBSCAN(eps=2, min_samples=3, metric="manhattan")
db.fit(df_fr)
np.unique(db.labels_, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122], dtype=int64),
 array([838,  13,  11,  33,   4,  51,  24,  38,  20,  11,   4,   7,   7,
         39,   9,   5,  10,  11,  97,  11,  21,   5,   5,  55,   4,  12,
         28,   5,  15,   4,   6,   8,   4,   5,  25,   5,  12,   7,  19,
         16,  47,   4,   9,  13,  11,   4,   8,   4,   5,  14,   7,   7

In [193]:
# Ajouter les labels de cluster à votre DataFrame
df_fr['cluster'] = db.labels_

# Définir le centre de la carte et le niveau de zoom pour afficher toutes les données
center_longitude = df_fr['merch_long'].mean()  # Utiliser la moyenne des longitudes comme centre
center_latitude = df_fr['merch_lat'].mean()  # Utiliser la moyenne des latitudes comme centre
zoom_level = 2.5  # Niveau de zoom pour afficher toutes les données

# Créer une carte interactive avec Plotly Express en spécifiant le centre et le zoom
fig = px.scatter_mapbox(df_fr[df_fr.cluster != -1], lon='merch_long', lat='merch_lat', color='cluster', 
                        mapbox_style="carto-positron", title='Clusters des zones de transactions frauduleuses selon la méthode DBSCAN',
                        center=dict(lon=center_longitude, lat=center_latitude), zoom=zoom_level)

# Afficher la carte interactive
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



On peut voir, grâce au dbscan, que les transactions frauduleuses sont réparties principalement autour de grandes villes. Cependant la côte Est est beaucoup plus touchée par le phénomène que la côte Ouest.

## Prédiction de l'emplacement géographique des transactions frauduleuses

In [186]:
clean_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud,distance
18509,18509,27/06/2020 22:55,fraud_Spencer PLC,entertainment,614.72,F,45.7205,-98.5534,63,Systems developer,30/10/1969,1372373713,44.920705,-99.264799,1,104.893208
463092,463092,11/12/2020 17:13,fraud_Kirlin and Sons,personal_care,43.45,F,34.0326,-82.2027,1523,Research scientist (physical sciences),03/06/1984,1386782005,34.751695,-82.713236,0,92.671257
401268,401268,25/11/2020 18:57,fraud_Berge-Hills,kids_pets,21.55,M,43.4512,-71.489,7430,"Scientist, research (medical)",13/01/1955,1385405834,42.596517,-72.334389,1,117.279049
221271,221271,07/09/2020 22:36,fraud_Kerluke Inc,misc_net,788.07,M,42.2619,-94.5566,695,Administrator,15/07/1954,1378593388,42.460555,-94.055387,1,46.731454
122277,122277,03/08/2020 04:44,fraud_Miller-Hauck,grocery_pos,61.95,M,44.4477,-93.4252,5211,Chief Strategy Officer,27/01/1987,1375505095,45.119269,-94.409943,0,107.778843


In [187]:
to_drop = ["Unnamed: 0", "trans_date_trans_time", "unix_time", "dob", "merch_lat", "merch_long"]
data_pred = clean_data.drop(to_drop, axis=1)


In [188]:
data_pred.head()

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,is_fraud,distance
18509,fraud_Spencer PLC,entertainment,614.72,F,45.7205,-98.5534,63,Systems developer,1,104.893208
463092,fraud_Kirlin and Sons,personal_care,43.45,F,34.0326,-82.2027,1523,Research scientist (physical sciences),0,92.671257
401268,fraud_Berge-Hills,kids_pets,21.55,M,43.4512,-71.489,7430,"Scientist, research (medical)",1,117.279049
221271,fraud_Kerluke Inc,misc_net,788.07,M,42.2619,-94.5566,695,Administrator,1,46.731454
122277,fraud_Miller-Hauck,grocery_pos,61.95,M,44.4477,-93.4252,5211,Chief Strategy Officer,0,107.778843


In [189]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Supposons que vous ayez déjà défini vos caractéristiques numériques et catégorielles
num_feat = [2, 4, 5, 6, 8, 9, 10]
cat_feat = [0, 1, 3, 7]

# Créer les transformateurs pour les caractéristiques numériques et catégorielles
num_trans = StandardScaler()
cat_trans = OneHotEncoder()

# Créer le préprocesseur en utilisant ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_trans, num_feat),
        ("cat", cat_trans, cat_feat)])

# Appliquer le préprocesseur sur les données
X = preprocessor.fit_transform(data_pred)

# Afficher les premières lignes de X pour vérifier le résultat
print(X[0:5, :])

ValueError: all features must be in [0, 9] or [-10, 0]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Supposons que vous avez déjà préparé vos données avec des caractéristiques géographiques et la variable cible (catégorie de transaction)
y = clean_data.is_fraud  # Variable cible

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer un modèle de forêt aléatoire
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Entraîner le modèle
model.fit(X_train, y_train)

# Faire des prédictions sur l'ensemble de test
predictions = model.predict(X_test)

# Évaluer les performances du modèle
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
precision = precision_score(y_test, predictions)
print("Précision :", precision)
f1 = f1_score(y_test, predictions)
print("F1-score :", f1)
conf_matrix = confusion_matrix(y_test, predictions)
print("Matrice de confusion :\n", conf_matrix)

Accuracy: 0.9265734265734266
Précision : 0.964824120603015
F1-score : 0.924187725631769
Matrice de confusion :
 [[411  14]
 [ 49 384]]


In [None]:
predictions

array([1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,