In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from scipy.stats import skew, zscore
from datetime import datetime

In [2]:
data = pd.read_csv("Dataset_Fraude_Bancaire_complete.csv")
df = data.copy()


# Exploration des donn√©es

In [3]:
df.head()

Unnamed: 0,transaction_id,client_id,date,heure,montant,devise,type_transaction,lieu,canal,moyen_paiement,categorie_commerce,device_utilise,score_risque_client,is_fraud
0,T000001,C8270,01/07/2025,22:38:02,9662.17,USD,paiement en ligne,F√®s,carte,Visa,√©lectronique,Android,5,0
1,T000002,C1860,01/07/2025,09:09:18,7622.33,MAD,retrait,Tanger,application mobile,Visa,voyage,ATM,9,0
2,T000003,C6390,01/07/2025,13:33:32,9510.19,MAD,retrait,Casablanca,application mobile,Visa,voyage,ATM,9,0
3,T000004,C6191,01/07/2025,22:54:27,7040.28,MAD,virement,Casablanca,guichet,Amex,sant√©,Android,6,0
4,T000005,C6734,01/07/2025,17:06:28,3015.62,MAD,virement,F√®s,guichet,Amex,√©lectronique,iOS,1,0


In [4]:
df.shape

(1000, 14)

In [5]:
print('Class distribution of is_fraud:')
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

fraud_percentages = df['is_fraud'].value_counts(normalize=True) * 100
print('\nClass distribution (percentages):')
print(fraud_percentages)

Class distribution of is_fraud:
is_fraud
0    950
1     50
Name: count, dtype: int64

Class distribution (percentages):
is_fraud
0    95.0
1     5.0
Name: proportion, dtype: float64


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   transaction_id       1000 non-null   object 
 1   client_id            1000 non-null   object 
 2   date                 1000 non-null   object 
 3   heure                1000 non-null   object 
 4   montant              950 non-null    float64
 5   devise               950 non-null    object 
 6   type_transaction     1000 non-null   object 
 7   lieu                 948 non-null    object 
 8   canal                1000 non-null   object 
 9   moyen_paiement       950 non-null    object 
 10  categorie_commerce   1000 non-null   object 
 11  device_utilise       1000 non-null   object 
 12  score_risque_client  1000 non-null   int64  
 13  is_fraud             1000 non-null   int64  
dtypes: float64(1), int64(2), object(11)
memory usage: 109.5+ KB


In [7]:
df.describe()

Unnamed: 0,montant,score_risque_client,is_fraud
count,950.0,1000.0,1000.0
mean,5019.591295,5.468,0.05
std,2883.15995,2.862371,0.218054
min,50.12,1.0,0.0
25%,2511.585,3.0,0.0
50%,5064.445,5.5,0.0
75%,7548.5475,8.0,0.0
max,9988.98,10.0,1.0


In [8]:
df['client_id'].value_counts()
df.nunique()

transaction_id         1000
client_id               947
date                     93
heure                   993
montant                 949
devise                    3
type_transaction          3
lieu                      5
canal                     4
moyen_paiement            4
categorie_commerce        5
device_utilise            4
score_risque_client      10
is_fraud                  2
dtype: int64

In [9]:
df['devise'].value_counts()
df['device_utilise'].value_counts()
# df['categorie_commerce'].value_counts()
# df['moyen_paiement'].value_counts()
# df['canal'].value_counts()
# df['lieu'].value_counts()
# df['type_transaction'].value_counts()

device_utilise
ATM        270
Web        256
iOS        247
Android    227
Name: count, dtype: int64

# D√âTECTION AUTOMATIQUE DES TYPES DE VARIABLES
---




In [10]:
def detect_variable_types(df):
    """
    Cette fonction d√©tecte automatiquement les types de variables
    dans le DataFrame fourni.

    Elle distingue :
    - les identifiants (colonnes contenant 'id')
    - les variables temporelles (date, time)
    - les variables num√©riques
    - les variables cat√©gorielles
    """

    # Colonnes contenant "id" dans leur nom ‚Üí identifiants
    id_cols = [c for c in df.columns if 'id' in c.lower() ]

    # Colonnes temporelles : type datetime ou nom contenant "date" ou "time" et de type 'object'
    # Ensure we only pick actual datetime dtypes OR object dtypes with date/time keywords
    time_cols = [c for c in df.columns
                 if (pd.api.types.is_datetime64_any_dtype(df[c]))  # Use pandas specific datetime dtype check
                 or (df[c].dtype == 'object' and ('date' in c.lower() or 'time' in c.lower() or 'temps' in c.lower() or 'heure' in c.lower()))]

    # Colonnes num√©riques
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Colonnes cat√©gorielles (objets ou cat√©gories)
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # On enl√®ve les colonnes d√©j√† class√©es ailleurs
    num_cols = [c for c in num_cols if c not in id_cols + time_cols]
    cat_cols = [c for c in cat_cols if c not in id_cols + time_cols]

    # Retour sous forme de dictionnaire
    return {
        'identifiants': id_cols,
        'temporelles': time_cols,
        'num√©riques': num_cols,
        'cat√©gorielles': cat_cols
    }

In [11]:
types= detect_variable_types(df)
types

{'identifiants': ['transaction_id', 'client_id'],
 'temporelles': ['date', 'heure'],
 'num√©riques': ['montant', 'score_risque_client', 'is_fraud'],
 'cat√©gorielles': ['devise',
  'type_transaction',
  'lieu',
  'canal',
  'moyen_paiement',
  'categorie_commerce',
  'device_utilise']}

In [12]:
lenght_cols = 0
for key, value in types.items():
    print(f"{key}: {value}")
    lenght_cols += len(value)
print(f"Nombre total de colonnes : {lenght_cols}")

identifiants: ['transaction_id', 'client_id']
temporelles: ['date', 'heure']
num√©riques: ['montant', 'score_risque_client', 'is_fraud']
cat√©gorielles: ['devise', 'type_transaction', 'lieu', 'canal', 'moyen_paiement', 'categorie_commerce', 'device_utilise']
Nombre total de colonnes : 14


# ANALYSE DES VALEURS MANQUANTES
---


In [13]:
def analyse_nan(df):
    """
    Calcule le pourcentage de valeurs manquantes pour chaque colonne
    et sugg√®re une m√©thode de traitement adapt√©e.
    """

    # Calcul du taux de valeurs manquantes
    nan_info = pd.DataFrame({
        'taux_manquant_%': df.isna().mean() * 100
    })

    # Here we add a new column to the DataFrame called 'suggestion', & we give it a default value 'OK'
    nan_info['suggestion'] = 'OK'

    # Boucle sur chaque colonne pour donner une recommandation

    # .loc : "locate by label", find a specific cell using its .loc[row_label, column_label]
    for col in df.columns:
      taux = nan_info.loc[col, 'taux_manquant_%']  # Extracting the percentage of missing values of the current column
      if df[col].dtype in [np.float64, np.int64]:
            nan_info.loc[col, 'suggestion'] = 'Imputer (moyenne / m√©diane / KNN)'
      else:
            nan_info.loc[col, 'suggestion'] = 'Imputer (mode / KNN)'

    return nan_info

In [14]:
analyse_nan(df)

Unnamed: 0,taux_manquant_%,suggestion
transaction_id,0.0,Imputer (mode / KNN)
client_id,0.0,Imputer (mode / KNN)
date,0.0,Imputer (mode / KNN)
heure,0.0,Imputer (mode / KNN)
montant,5.0,Imputer (moyenne / m√©diane / KNN)
devise,5.0,Imputer (mode / KNN)
type_transaction,0.0,Imputer (mode / KNN)
lieu,5.2,Imputer (mode / KNN)
canal,0.0,Imputer (mode / KNN)
moyen_paiement,5.0,Imputer (mode / KNN)


# ENCODAGE DES VARIABLES CAT√âGORIELLES
---



In [15]:
def encode_categorical(df, cat_cols):
    """
    Encode automatiquement les variables cat√©gorielles selon leur nature :
    - One-Hot Encoding : pour les variables nominales (pas d‚Äôordre)
    - Ordinal Encoding : pour les variables ordinales (avec un ordre logique)
    - Frequency Encoding : pour les variables avec beaucoup de cat√©gories
    """
    df_encoded = df.copy()

    for col in cat_cols:
        n_unique = df[col].nunique()
        col_lower = col.lower()

        # ---- 1Ô∏è‚É£ D√©tection des variables ordinales (avec ordre)  ----

        # On cherche les mots-cl√©s pour detecter les vars ordinales.
        if any(word in col_lower for word in ['niveau', 'score', 'grade', 'rang']):
            print(f"üî¢ Encodage ordinal appliqu√© √† {col}")
            df_encoded[col] = df[col].astype('category').cat.codes  # Encodage ordinal

        # ---- 2Ô∏è‚É£ Si peu de modalit√©s ‚Üí One-Hot Encoding ----
        elif n_unique <= 10:
            print(f"üè∑Ô∏è One-Hot Encoding appliqu√© √† {col}")
            df_encoded = pd.get_dummies(df_encoded, columns=[col], drop_first=True, dtype = int)

        # ---- 3Ô∏è‚É£ Si beaucoup de modalit√©s ‚Üí Frequency Encoding ----
        else:
            print(f"üìä Frequency Encoding appliqu√© √† {col}")
            freq = df[col].value_counts(normalize=True)  # fr√©quence relative
            df_encoded[col] = df[col].map(freq)

    return df_encoded

In [16]:
encode_categorical(df,cat_cols=detect_variable_types(df)['cat√©gorielles'])

üè∑Ô∏è One-Hot Encoding appliqu√© √† devise
üè∑Ô∏è One-Hot Encoding appliqu√© √† type_transaction
üè∑Ô∏è One-Hot Encoding appliqu√© √† lieu
üè∑Ô∏è One-Hot Encoding appliqu√© √† canal
üè∑Ô∏è One-Hot Encoding appliqu√© √† moyen_paiement
üè∑Ô∏è One-Hot Encoding appliqu√© √† categorie_commerce
üè∑Ô∏è One-Hot Encoding appliqu√© √† device_utilise


Unnamed: 0,transaction_id,client_id,date,heure,montant,score_risque_client,is_fraud,devise_MAD,devise_USD,type_transaction_retrait,...,moyen_paiement_Cash,moyen_paiement_MasterCard,moyen_paiement_Visa,categorie_commerce_mode,categorie_commerce_sant√©,categorie_commerce_voyage,categorie_commerce_√©lectronique,device_utilise_Android,device_utilise_Web,device_utilise_iOS
0,T000001,C8270,01/07/2025,22:38:02,9662.17,5,0,0,1,0,...,0,0,1,0,0,0,1,1,0,0
1,T000002,C1860,01/07/2025,09:09:18,7622.33,9,0,1,0,1,...,0,0,1,0,0,1,0,0,0,0
2,T000003,C6390,01/07/2025,13:33:32,9510.19,9,0,1,0,1,...,0,0,1,0,0,1,0,0,0,0
3,T000004,C6191,01/07/2025,22:54:27,7040.28,6,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,T000005,C6734,01/07/2025,17:06:28,3015.62,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T000996,C6232,30/09/2025,16:57:22,1562.88,4,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
996,T000997,C6797,30/09/2025,00:26:24,4610.32,9,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
997,T000998,C5926,30/09/2025,22:09:46,,10,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
998,T000999,C7016,30/09/2025,22:20:20,983.08,2,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


# EXTRACTION DES CARACT√âRISTIQUES TEMPORELLES
---



In [17]:
def extract_time_features(df):
    """
    √Ä partir des colonnes 'date' et 'heure', cr√©e de nouvelles colonnes :
    - jour, mois, jour_semaine, is_weekend, heure_num, p√©riode_jour
    """
    df_time = df.copy()

    # Combine 'date' and 'heure' columns and convert to datetime
    # Handle potential errors during conversion
    df_time['datetime'] = pd.to_datetime(df_time['date'] + ' ' + df_time['heure'], errors='coerce', format='%d/%m/%Y %H:%M:%S')

    # Drop original 'date' and 'heure' columns if they exist
    df_time.drop(columns=['date', 'heure'], inplace=True, errors='ignore')

    # Extract new time features
    df_time['jour'] = df_time['datetime'].dt.day
    df_time['mois'] = df_time['datetime'].dt.month
    df_time['jour_semaine'] = df_time['datetime'].dt.weekday
    df_time['is_weekend'] = (df_time['datetime'].dt.weekday >= 5).astype(int) # Convert boolean to int
    df_time['heure_num'] = df_time['datetime'].dt.hour

    # Define time periods
    bins = [0, 6, 12, 18, 24]
    labels = ['Nuit', 'Matin', 'Apr√®s-midi', 'Soir']
    df_time['p√©riode_jour'] = pd.cut(df_time['heure_num'], bins=bins, labels=labels, right=False, include_lowest=True)

    # Drop the intermediate 'datetime' column
    df_time.drop(columns=['datetime'], inplace=True, errors='ignore')

    return df_time

In [18]:
time_ft= extract_time_features(df)
time_ft.head()

Unnamed: 0,transaction_id,client_id,montant,devise,type_transaction,lieu,canal,moyen_paiement,categorie_commerce,device_utilise,score_risque_client,is_fraud,jour,mois,jour_semaine,is_weekend,heure_num,p√©riode_jour
0,T000001,C8270,9662.17,USD,paiement en ligne,F√®s,carte,Visa,√©lectronique,Android,5,0,1,7,1,0,22,Soir
1,T000002,C1860,7622.33,MAD,retrait,Tanger,application mobile,Visa,voyage,ATM,9,0,1,7,1,0,9,Matin
2,T000003,C6390,9510.19,MAD,retrait,Casablanca,application mobile,Visa,voyage,ATM,9,0,1,7,1,0,13,Apr√®s-midi
3,T000004,C6191,7040.28,MAD,virement,Casablanca,guichet,Amex,sant√©,Android,6,0,1,7,1,0,22,Soir
4,T000005,C6734,3015.62,MAD,virement,F√®s,guichet,Amex,√©lectronique,iOS,1,0,1,7,1,0,17,Apr√®s-midi


In [19]:
time_ft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   transaction_id       1000 non-null   object  
 1   client_id            1000 non-null   object  
 2   montant              950 non-null    float64 
 3   devise               950 non-null    object  
 4   type_transaction     1000 non-null   object  
 5   lieu                 948 non-null    object  
 6   canal                1000 non-null   object  
 7   moyen_paiement       950 non-null    object  
 8   categorie_commerce   1000 non-null   object  
 9   device_utilise       1000 non-null   object  
 10  score_risque_client  1000 non-null   int64   
 11  is_fraud             1000 non-null   int64   
 12  jour                 1000 non-null   int32   
 13  mois                 1000 non-null   int32   
 14  jour_semaine         1000 non-null   int32   
 15  is_weekend           1

In [20]:
time_ft.nunique()

transaction_id         1000
client_id               947
montant                 949
devise                    3
type_transaction          3
lieu                      5
canal                     4
moyen_paiement            4
categorie_commerce        5
device_utilise            4
score_risque_client      10
is_fraud                  2
jour                     31
mois                      4
jour_semaine              7
is_weekend                2
heure_num                23
p√©riode_jour              4
dtype: int64

In [21]:
time_ft['jour'].value_counts()
time_ft['jour_semaine'].value_counts()
time_ft['heure_num'].value_counts()
time_ft['mois'].value_counts()

mois
7     337
8     337
9     325
10      1
Name: count, dtype: int64

# CHOIX AUTOMATIQUE DU SCALING
---



In [22]:
def choisir_scaling_auto(df, col):
    """
    Choisit automatiquement le type de mise √† l‚Äô√©chelle (scaling) :
    - MinMaxScaler si la variable est fortement asym√©trique (skewness > 1)
    - StandardScaler sinon
    """
    data = df[col].dropna()
    sk = skew(data)

    if abs(sk) > 0.8:
        return MinMaxScaler()
    else: #distribution normal
        return StandardScaler()

In [23]:
choisir_scaling_auto(df,'montant')

# D√âTECTION ET SUPPRESSION DES OUTLIERS (VALEURS ABERRANTES)
---



In [24]:
df.shape

(1000, 14)

In [25]:
def remove_outliers(df, num_cols, z_thresh=3):
    """
    Supprime les valeurs extr√™mes (outliers) selon le z-score.
    Si la valeur est √† plus de 3 √©carts-types de la moyenne, on la supprime.
    """
    df_clean = df.copy()

    for col in num_cols:
        if df_clean[col].dtype in [np.float64, np.int64]:
            df_clean = df_clean[(np.abs(zscore(df_clean[col].fillna(0))) < z_thresh)]

    return df_clean


In [26]:
remove_outliers(df,detect_variable_types(df)['num√©riques'])

Unnamed: 0,transaction_id,client_id,date,heure,montant,devise,type_transaction,lieu,canal,moyen_paiement,categorie_commerce,device_utilise,score_risque_client,is_fraud
0,T000001,C8270,01/07/2025,22:38:02,9662.17,USD,paiement en ligne,F√®s,carte,Visa,√©lectronique,Android,5,0
1,T000002,C1860,01/07/2025,09:09:18,7622.33,MAD,retrait,Tanger,application mobile,Visa,voyage,ATM,9,0
2,T000003,C6390,01/07/2025,13:33:32,9510.19,MAD,retrait,Casablanca,application mobile,Visa,voyage,ATM,9,0
3,T000004,C6191,01/07/2025,22:54:27,7040.28,MAD,virement,Casablanca,guichet,Amex,sant√©,Android,6,0
4,T000005,C6734,01/07/2025,17:06:28,3015.62,MAD,virement,F√®s,guichet,Amex,√©lectronique,iOS,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,T000995,C6083,30/09/2025,11:50:36,7515.14,MAD,retrait,Rabat,application mobile,Amex,mode,Web,7,0
995,T000996,C6232,30/09/2025,16:57:22,1562.88,MAD,retrait,Casablanca,application mobile,Visa,sant√©,Web,4,0
996,T000997,C6797,30/09/2025,00:26:24,4610.32,EUR,paiement en ligne,Marrakech,carte,MasterCard,alimentaire,iOS,9,0
997,T000998,C5926,30/09/2025,22:09:46,,EUR,paiement en ligne,F√®s,en ligne,Amex,sant√©,ATM,10,0


# PIPELINE COMPLET DE PR√âTRAITEMENT
---



In [27]:
def preprocess_pipeline(df):
    """
    Fonction principale : ex√©cute toutes les √©tapes de pr√©traitement
    dans le bon ordre pour produire un dataset propre et pr√™t √† l‚Äôemploi.

    """

    processed_df = df.copy()
    print("Starting preprocessing pipeline...")

    # Initial type detection
    types = detect_variable_types(processed_df)
    print(f"Initial variable types detected: {types}")
    print(f"Number of columns: {len(processed_df.columns)}")

    # Store ID columns before dropping them (optional, if IDs are needed later)
    id_cols_data = processed_df[types['identifiants']].copy()

    # ---- 1. Supprimer les colonnes identifiants ----
    if types['identifiants']:
        print(f"Dropping identifier columns: {types['identifiants']}")
        processed_df.drop(columns=types['identifiants'], inplace=True, errors='ignore')

    # ---- 2. Extraire les caract√©ristiques temporelles ----
    print("Extracting temporal features...")
    processed_df = extract_time_features(processed_df)

    # Update variable types after adding time features
    types = detect_variable_types(processed_df)
    print(f"Updated variable types after temporal feature extraction: {types}")

    # ---- 3. Analyse et imputation des valeurs manquantes ----
    print("Imputing missing values...")
    for col in processed_df.columns:
        if processed_df[col].isna().any():
            if processed_df[col].dtype in [np.float64, np.int64]:
                processed_df[col] = processed_df[col].fillna(processed_df[col].median())
                # print(f"  Filled missing numerical values in '{col}' with median.")
            else:
                processed_df[col] = processed_df[col].fillna(processed_df[col].mode().iloc[0])
                # print(f"  Filled missing categorical values in '{col}' with mode.")

    # # Ensure all numerical columns are considered for outlier removal, excluding the target
    # numerical_cols_for_outliers = [col for col in types['num√©riques'] if col not in ['is_weekend']]

    # # # ---- 4. Suppression des outliers ----
    # if numerical_cols_for_outliers:
    #     print(f"Removing outliers from numerical columns: {numerical_cols_for_outliers}...")
    #     initial_rows = processed_df.shape[0]
    #     processed_df = remove_outliers(processed_df, numerical_cols_for_outliers)
    #     print(f"  Removed {initial_rows - processed_df.shape[0]} rows due to outliers. New shape: {processed_df.shape}")

    # ---- 5. Encodage des variables cat√©gorielles ----
    if types['cat√©gorielles']:
        print(f"Encoding categorical columns: {types['cat√©gorielles']}...")
        processed_df = encode_categorical(processed_df, types['cat√©gorielles'])

    # Re-detect numerical types after encoding, as some might have been created (e.g., one-hot encoded cols are numeric)
    types_after_encoding = detect_variable_types(processed_df)

    # Define columns for scaling. These are the current numerical columns, excluding the target and any IDs
    cols_to_exclude_from_scaling = ['is_fraud', 'is_weekend']
    features_to_scale_numerically = [col for col in types['num√©riques'] if col not in cols_to_exclude_from_scaling]
    # features_to_scale_numerically = [col for col in types_after_encoding['num√©riques'] if col not in cols_to_exclude_from_scaling]

    # ---- 6. Mise √† l‚Äô√©chelle (scaling) ----
    if features_to_scale_numerically:
        print(f"Scaling numerical columns: {', '.join(features_to_scale_numerically)}...")
        for col in features_to_scale_numerically:
            if col in processed_df.columns: # Check if column still exists after previous steps
                scaler = choisir_scaling_auto(processed_df, col)
                processed_df[col] = scaler.fit_transform(processed_df[[col]])
            else:
                print(f"Warning: Column '{col}' not found in DataFrame for scaling.")
    else:
        print("No numerical columns to scale.")

    print("\n‚úÖ Preprocessing completed successfully.")
    return processed_df

In [28]:
df_clean_prep = preprocess_pipeline(df)

print("Taille du dataset pr√©trait√© apr√®s correction et suppression des outliers :\n")
print(df_clean_prep.shape)
print("\nAper√ßu du jeu de donn√©es transform√© :")
print("\nDataFrame saved to clean_preprocessed_data_fraude.csv")
print("\nAper√ßu du jeu de --/ transform√© :")
df_clean_prep.head()

Starting preprocessing pipeline...
Initial variable types detected: {'identifiants': ['transaction_id', 'client_id'], 'temporelles': ['date', 'heure'], 'num√©riques': ['montant', 'score_risque_client', 'is_fraud'], 'cat√©gorielles': ['devise', 'type_transaction', 'lieu', 'canal', 'moyen_paiement', 'categorie_commerce', 'device_utilise']}
Number of columns: 14
Dropping identifier columns: ['transaction_id', 'client_id']
Extracting temporal features...
Updated variable types after temporal feature extraction: {'identifiants': [], 'temporelles': [], 'num√©riques': ['montant', 'score_risque_client', 'is_fraud', 'jour', 'mois', 'jour_semaine', 'is_weekend', 'heure_num'], 'cat√©gorielles': ['devise', 'type_transaction', 'lieu', 'canal', 'moyen_paiement', 'categorie_commerce', 'device_utilise', 'p√©riode_jour']}
Imputing missing values...
Encoding categorical columns: ['devise', 'type_transaction', 'lieu', 'canal', 'moyen_paiement', 'categorie_commerce', 'device_utilise', 'p√©riode_jour']...


Unnamed: 0,montant,score_risque_client,is_fraud,jour,mois,jour_semaine,is_weekend,heure_num,devise_MAD,devise_USD,...,categorie_commerce_mode,categorie_commerce_sant√©,categorie_commerce_voyage,categorie_commerce_√©lectronique,device_utilise_Android,device_utilise_Web,device_utilise_iOS,p√©riode_jour_Matin,p√©riode_jour_Apr√®s-midi,p√©riode_jour_Soir
0,1.652133,-0.163583,0,-1.671261,-1.213195,-1.001877,0,1.651392,0,1,...,0,0,0,1,1,0,0,0,0,1
1,0.925874,1.234559,0,-1.671261,-1.213195,-1.001877,0,-0.299189,1,0,...,0,0,1,0,0,0,0,1,0,0
2,1.598022,1.234559,0,-1.671261,-1.213195,-1.001877,0,0.30099,1,0,...,0,0,1,0,0,0,0,0,1,0
3,0.718642,0.185953,0,-1.671261,-1.213195,-1.001877,0,1.651392,1,0,...,0,1,0,0,1,0,0,0,0,1
4,-0.714287,-1.561725,0,-1.671261,-1.213195,-1.001877,0,0.901169,1,0,...,0,0,0,1,0,0,1,0,1,0


In [29]:
print('Class distribution of is_fraud:')
fraud_counts = df['is_fraud'].value_counts()
print(fraud_counts)

fraud_percentages = df['is_fraud'].value_counts(normalize=True) * 100
print('\nClass distribution (percentages):')
print(fraud_percentages)
#After preprocessing, inckuding outliers removal:

print('\nClass distribution of is_fraud after preprocessing:')
fraud_counts = df_clean_prep['is_fraud'].value_counts()
print(fraud_counts)

fraud_percentages = df_clean_prep['is_fraud'].value_counts(normalize=True) * 100
print('\nClass distribution (percentages) after preprocessing:')
print(fraud_percentages)




Class distribution of is_fraud:
is_fraud
0    950
1     50
Name: count, dtype: int64

Class distribution (percentages):
is_fraud
0    95.0
1     5.0
Name: proportion, dtype: float64

Class distribution of is_fraud after preprocessing:
is_fraud
0    950
1     50
Name: count, dtype: int64

Class distribution (percentages) after preprocessing:
is_fraud
0    95.0
1     5.0
Name: proportion, dtype: float64


In [30]:
df_clean_prep.to_csv('clean_preprocessed_data_fraude.csv', index = False)
print("DataFrame saced to clean_preprocessed_data_fraude.csv")

DataFrame saced to clean_preprocessed_data_fraude.csv
