In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, roc_auc_score

from imblearn.over_sampling import SMOTE

## Carregando os dados

In [6]:
df = pd.read_csv('fraud_dataset.csv')

df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [7]:
df.shape

(1000000, 8)

In [10]:
df.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [14]:
df.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

Verificando o desiquilibrio de classes (Fraud)

In [17]:
round(df['fraud'].value_counts(normalize=True) * 100, 2)

fraud
0.0    91.26
1.0     8.74
Name: proportion, dtype: float64

### Padronização da escala da features 

Separando primeiro a base em treino e teste para evitar data leak durante a padronização dos dados.  
Hold out (80% treino e 20% teste)

In [21]:
X = df.drop('fraud', axis=1)
y = df[['fraud']]

X.shape, y.shape

((1000000, 7), (1000000, 1))

In [23]:
from sklearn.model_selection import train_test_split

# Separação de treino e teste
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y)

print(X_treino.shape, X_teste.shape, y_treino.shape, y_teste.shape)

(750000, 7) (250000, 7) (750000, 1) (250000, 1)


O hiperparâmetro stratify da função train_test_split do scikit-learn é utilizado para garantir que a divisão dos dados em treino e teste mantenha a mesma proporção das classes presentes na variável de destino. Isso é especialmente útil em problemas de classificação quando há desbalanceamento de classes

Aplicanado padronização

In [27]:
scaler = StandardScaler()

# aplicando o fit somente na base de treinamento

scaler.fit(X_treino)

In [29]:
X_train_pad = pd.DataFrame(scaler.transform(X_treino))
X_test_pad = pd.DataFrame(scaler.transform(X_teste))

X_train_pad.shape, X_test_pad.shape

((750000, 7), (250000, 7))

### Aplicação do oversampling e validação cruzada

Não se deve balancer as classes antes do split.   
Pois os splits criados no cross_val_score estão com a validação BALANCEADA.  
E isso é distorcer a realidade 



In [31]:
# função de validação cruzada NA MÃO
def validacao_cruzada(modelo, X, y, oversampling=False):
    # KFold do scikit-learn
    kfold = KFold(n_splits=5, shuffle=True, random_state= 1991)

    # lista de acuracias de cada split
    acuracias_split = []
    
    # iterando sobre os splits
    for idx, (idx_treino, idx_validacao) in enumerate(kfold.split(X)):
        X_split_treino = X.iloc[idx_treino, :]
        y_split_treino = y.iloc[idx_treino, :]
    
        # oversampling, só no split de treino!!
        if oversampling:
            sm = SMOTE(random_state=1991)
            X_split_treino, y_split_treino = sm.fit_resample(X_split_treino, y_split_treino)
        
        # Com os dados balenceados SÓ NO TREINO, vamos treinar o nosso modelo
        modelo.fit(X_split_treino, y_split_treino.values.flatten())
    
        X_split_validacao = X.iloc[idx_validacao, :]
        y_split_validacao = y.iloc[idx_validacao, :]
        
        # Validação SEM oversampling, como a cartilha ensina :)
        # Amostra do mundo real, ou seja, com dados DESBALANCEADOS
        predicoes_validacao = modelo.predict(X_split_validacao)
        
        acuracia_split = accuracy_score(y_split_validacao, predicoes_validacao)
        
        acuracias_split.append(acuracia_split)
        
        print(f'Acurácia do split {idx}: {acuracia_split}')
    
    return acuracias_split


In [33]:
# Criação do modelo conforme hiperparametro do enunciado penalty = 'l2'

modelo = LogisticRegression(penalty= 'l2', random_state= 1991)

O hiperparâmetro penalty='l2' na função LogisticRegression do scikit-learn refere-se à regularização aplicada ao modelo. Especificamente, a regularização L2, também conhecida como Ridge Regularization.  
  
penalty='l2' aplica regularização L2 (Ridge), que reduz, mas não anula, os coeficientes do modelo.

In [35]:
media_acuracia_com_smote = np.mean(validacao_cruzada(modelo, X_train_pad, y_treino, oversampling=True)) * 100

Acurácia do split 0: 0.93418
Acurácia do split 1: 0.9334933333333333
Acurácia do split 2: 0.9340266666666667
Acurácia do split 3: 0.9337266666666667
Acurácia do split 4: 0.9350133333333334


In [36]:
media_acuracia_com_smote.round(2)

93.41

In [37]:
# Agora sim aplicando oversampling

sm = SMOTE(random_state=1991)
X_treino_resampled, y_treino_resampled = sm.fit_resample(X_train_pad, y_treino)

In [38]:
# Com os dados balenceados SÓ NO TREINO, vamos treinar o nosso modelo
modelo.fit(X_treino_resampled, y_treino_resampled.values.flatten())

In [43]:
y_pred = modelo.predict(X_test_pad)

In [45]:
# Calculando a curva roc

auc_score = roc_auc_score(y_teste, y_pred)

print(classification_report(y_teste, y_pred))

print(auc_score.mean())

              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96    228346
         1.0       0.57      0.95      0.71     21654

    accuracy                           0.93    250000
   macro avg       0.78      0.94      0.84    250000
weighted avg       0.96      0.93      0.94    250000

0.9405157069996989


Aqui terminou todos os pedidos feitos pelo time 

## Implementando um ensemble LightGBM para tentar melhorar a precisão da predição de valores Fraud

In [47]:
import lightgbm as lgb

In [49]:
classificador_lgbm = lgb.LGBMClassifier()

In [51]:
media_acuracia_com_smote = np.mean(validacao_cruzada(classificador_lgbm, X_train_pad, y_treino, oversampling=True)) * 100

[LightGBM] [Info] Number of positive: 547410, number of negative: 547410
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 1094820, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Acurácia do split 0: 0.9986866666666666
[LightGBM] [Info] Number of positive: 547451, number of negative: 547451
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012282 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 896
[LightGBM] [Info] Number of data points in the train set: 1094902, number of used features: 7
[LightGBM] [Info] [binary:Boo

In [53]:
# classificador_lgbm

media_acuracia_com_smote

99.82506666666667

In [55]:
# O RandomizedSearchCV realiza uma busca aleatória em um conjunto de hiperparâmetros

from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'num_leaves': np.arange(20, 60, 10),  # Intervalo reduzido
    'learning_rate': [0.01, 0.1, 0.2],    # Apenas 3 valores testados
    'n_estimators': [100, 200, 300],       # Quantidade limitada de árvores
    'subsample': [0.8, 1],                # Menos valores de amostras por árvore
    'max_depth': [4, 6, 8]                # Apenas 3 valores de profundidade
}

In [57]:
random_search = RandomizedSearchCV(classificador_lgbm, param_distributions=param_dist, 
                                   n_iter=32, cv=5, verbose=1, n_jobs=-1, random_state=1991)

In [59]:
random_search.fit(X_treino_resampled, y_treino_resampled)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 684251, number of negative: 684251
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of data points in the train set: 1368502, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [60]:
random_search.best_params_

{'subsample': 1,
 'num_leaves': 50,
 'n_estimators': 300,
 'max_depth': 8,
 'learning_rate': 0.2}

In [63]:
classificador_lgbm_tunado = lgb.LGBMClassifier(subsample=1, num_leaves=50, n_estimators=300, max_depth= 8, learning_rate= 0.2)

In [65]:
media_acuracia_com_smote = np.mean(validacao_cruzada(classificador_lgbm_tunado, X_train_pad, y_treino, oversampling=True)) * 100

[LightGBM] [Info] Number of positive: 547410, number of negative: 547410
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 1094820, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Acurácia do split 0: 0.9987866666666667
[LightGBM] [Info] Number of positive: 547451, number of negative: 547451
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 896
[LightGBM] [Info] Number of data points in the train set: 1094902, number of used features: 7
[LightGBM] [Info] [binary:Boo

In [66]:
media_acuracia_com_smote

99.85626666666667

In [72]:
# # Com o melhor modelo, posso utilizar a base toda de treino

classificador_lgbm_tunado.fit(X_treino_resampled, y_treino_resampled)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 684251, number of negative: 684251
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of data points in the train set: 1368502, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [74]:
y_pred = classificador_lgbm_tunado.predict(X_test_pad)

In [76]:
# Calculando a curva roc

auc_score = roc_auc_score(y_teste, y_pred)

print(classification_report(y_teste, y_pred))

print(auc_score.mean())

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    228346
         1.0       0.99      0.99      0.99     21654

    accuracy                           1.00    250000
   macro avg       0.99      1.00      0.99    250000
weighted avg       1.00      1.00      1.00    250000

0.9964138040212077


# Comparação de resultados

Após rodar dois modelos de classificação na mesma base de dados, um utilizando Regressão Logística e outro com LightGBM, os resultados indicam uma performance superior do LightGBM em termos de precisão, recall e f1-score, especialmente para a classe minoritária.

### Regressão Logística:
- Classe 0.0: Precisão de 99%, recall de 93% e f1-score de 96%, com 228.346 instâncias.
- Classe 1.0: Precisão de 57%, recall de 95% e f1-score de 71%, com 21.654 instâncias.
- Acurácia global: 93%  

### LightGBM:
- Classe 0.0: Precisão de 100%, recall de 100% e f1-score de 100%, com 228.346 instâncias.
- Classe 1.0: Precisão de 99%, recall de 99% e f1-score de 99%, com 21.654 instâncias.
- Acurácia global: 100%  

Em termos de métricas, o LightGBM não apenas melhorou o recall da classe 1.0 de 95% para 99%, mas também trouxe uma precisão significativamente mais alta para essa classe, de 57% para 99%. Isso resultou em um f1-score de 99%, mostrando que o modelo é capaz de identificar com muito mais precisão as instâncias da classe minoritária sem sacrificar a qualidade das predições da classe majoritária.

O desempenho aprimorado do LightGBM faz com que ele seja uma escolha mais robusta e eficiente para este problema, principalmente devido à sua capacidade de lidar melhor com o desbalanceamento entre classes.