# **Variáveis do dataset**

**step** - mapeia uma unidade de tempo no mundo real. Neste caso, 1 passo é 1 hora de tempo. Total de etapas 744 (simulação de 30 dias).

**type** - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER. 
(caixa-de-entrada, caixa-de-saida, débito, pagamento e transferência)

**amount** - valor da transação em moeda local.

**nameOrig** - cliente que iniciou a transação

**oldbalanceOrg** - saldo inicial antes da transação

**newbalanceOrig** - novo saldo após a transação

**nameDest** - cliente que é o destinatário da transação

**oldbalanceDest** - destinatário do saldo inicial antes da transação. 
Observe que não há informações para clientes que começam com M (Comerciantes).

**newbalanceDest** - novo destinatário do saldo após a transação. Observe que não há informações para clientes que começam com M (Comerciantes).

**isFraud** - São as transações feitas pelos agentes fraudulentos dentro da simulação. Neste conjunto de dados específico, o comportamento fraudulento dos agentes visa lucrar ao assumir o controle das contas dos clientes e tentar esvaziar os fundos transferindo para outra conta e depois sacando do sistema.

**isFlaggedFraud** - O modelo de negócios visa controlar transferências massivas de uma conta para outra e sinaliza tentativas ilegais. Uma tentativa ilegal neste conjunto de dados é uma tentativa de transferir mais de 200.000 em uma única transação


# Importando os dados

In [81]:
import pandas as pd 
import numpy as np

In [82]:
# Importando dados com pandas
df = pd.read_csv("fraud_dataset_example.csv")

# Monstrando as primeiras linhas
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Tratamento de dados

### Renomeando as colunas

In [83]:
df.columns = ["tempo", 
              "tipo",
              "valor",
              "cliente_1",
              "saldo_inicial_c1",
              "novo_saldo_c1",
              "cliente_2",
              "saldo_inicial_c2",
              "novo_saldo_c2",
              "fraude",
              "possivel_fraude"]

df

Unnamed: 0,tempo,tipo,valor,cliente_1,saldo_inicial_c1,novo_saldo_c1,cliente_2,saldo_inicial_c2,novo_saldo_c2,fraude,possivel_fraude
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
101608,10,PAYMENT,7477.02,C513257306,785.00,0.00,M524833426,0.00,0.00,0,0
101609,10,CASH_OUT,282252.35,C210473293,220339.29,0.00,C1172042998,95156.01,91286.97,0,0
101610,10,PAYMENT,17289.01,C807582280,0.00,0.00,M334249577,0.00,0.00,0,0
101611,10,TRANSFER,1347146.45,C1315779140,1605.00,0.00,C1631408038,541639.43,2186953.43,0,0


### Nulos

In [None]:
df.isnull().sum()

### Na

In [None]:
df.isna().sum()

### Duplicados

In [None]:
df.duplicated().sum()

### Removendo variáveis

In [8]:
df = df.drop(columns=['cliente_1', 'cliente_2', 'possivel_fraude'])

### Get Dummies em tipo

In [None]:
df = pd.get_dummies(data=df, columns=['tipo'])

df.head()

# Analisando algumas informações

In [None]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, explorative=True)

profile.to_file("relatorio.html")

# Variáveis

### X

In [None]:
# Variáveis explicativas
x = df.drop(columns="fraude")

x

### Y

In [None]:
# Variável alvo
y = df["fraude"]

y

# Treino e Teste

In [13]:
# Importação
from sklearn.model_selection import train_test_split

# Treino e Teste
X_treino, X_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.25, stratify=y, random_state=0)

# Função para validação de modelo

In [14]:
def avaliacao(y_real, y_pred):
    from sklearn.metrics import classification_report
    from sklearn.metrics import ConfusionMatrixDisplay
    from sklearn.metrics import RocCurveDisplay
    
    print(classification_report(y_pred, y_real))
    RocCurveDisplay.from_predictions(y_pred, y_real)
    ConfusionMatrixDisplay.from_predictions(y_pred, y_real)

# Regressão logística com SMOTE

In [15]:
from imblearn.pipeline import Pipeline  # Importação correta do Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Criando pipeline corretamente
pipeline_SMOTE = Pipeline([
    ('scaler', StandardScaler()),  # Normaliza os dados
    ('oversample', SMOTE()),       # Aplica SMOTE corretamente dentro do pipeline
    ('log_reg', LogisticRegression(solver='liblinear', C=0.1))  # Modelo de regressão logística
])

# Treinando o pipeline
pipeline_SMOTE.fit(X_treino, y_treino)

previsao_LR_SMOTE = pipeline_SMOTE.predict(X_teste)

### Métricas

In [None]:
avaliacao(previsao_LR_SMOTE, y_teste)

# Regressão com undersample

In [17]:
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import StandardScaler

# Criando pipeline corretamente
pipeline_NearMiss = Pipeline([
    ('scaler', StandardScaler()),  # Normaliza os dados
    ('oversample', NearMiss()),       # Aplica SMOTE corretamente dentro do pipeline
    ('log_reg', LogisticRegression(solver='liblinear', C=0.1))  # Modelo de regressão logística
])

# Treinando o pipeline
pipeline_NearMiss.fit(X_treino, y_treino)

previsao_LR_NearMiss = pipeline_NearMiss.predict(X_teste)

### Métricas

In [None]:
avaliacao(previsao_LR_NearMiss, y_teste)

# Árvore de decisão

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Modelo
pipeline_DTC = Pipeline([("SMOTE", SMOTE()), ("StanderScaler", StandardScaler()), ("DecisionTree", DecisionTreeClassifier(max_depth=5))]).fit(X_treino, y_treino)

# Previsão
previsao_DTC_SMOTE = pipeline_DTC.predict(X_teste)

# Métricas

In [None]:
avaliacao(previsao_DTC_SMOTE, y_teste)

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Modelo
pipeline_Rf = Pipeline([("SMOTE", SMOTE()), ("StanderScaler", StandardScaler()), ("Random Forest", RandomForestClassifier(max_depth=10))]).fit(X_treino, y_treino)

# Previsão
previsao_rf = pipeline_Rf.predict(X_teste)

In [None]:
avaliacao(previsao_rf, y_teste)

# Randomized Search CV

In [23]:
from sklearn.model_selection import RandomizedSearchCV, cross_validate, StratifiedKFold

### SKF

In [24]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

### Parametros a serem testados

In [36]:
# parametros
param_dist = {
    'n_estimators': [50, 100, 200],  # Número de árvores (não colocar valores muito altos)
    'max_depth': [10, 20, 30],  # Profundidade da árvore
    'min_samples_split': [2, 5, 10],  # Mínimo de amostras para dividir um nó
    'min_samples_leaf': [1, 2, 4],  # Mínimo de amostras em um nó folha
    'max_features': ['sqrt', 'log2'],  # Número de features consideradas na divisão
    'bootstrap': [True, False]  # Se deve usar amostragem com reposição
}

### Randomized Search CV

In [None]:
# Definição do Pipeline com SMOTE + RandomizedSearchCV
randomized = Pipeline([
    ("SMOTE", SMOTE()),
    ("RandomizedSearch", RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        param_distributions=param_dist,
        n_jobs=-1,
        cv=skf,
        n_iter=5
    ))
])

# Ajuste do modelo nos dados de treino
randomized.fit(X_treino, y_treino)

### Melhores parametros encontrados

In [84]:
params = {'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [70]:
pd.DataFrame(randomized.named_steps["RandomizedSearch"].cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,45.357444,0.269049,0.514853,0.108898,100,5,1,sqrt,,False,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.999639,0.999573,0.999245,0.999376,0.999672,0.999501,0.000164,1
1,43.623409,0.1611,0.519337,0.100604,100,10,4,sqrt,30.0,False,"{'n_estimators': 100, 'min_samples_split': 10,...",0.999442,0.999507,0.999015,0.999343,0.999573,0.999376,0.000196,3
2,17.125389,0.201477,0.198778,0.005255,50,10,4,sqrt,30.0,True,"{'n_estimators': 50, 'min_samples_split': 10, ...",0.999376,0.999113,0.998325,0.998949,0.999277,0.999008,0.000371,5
3,33.291672,0.52813,0.37448,0.029977,100,10,4,sqrt,,True,"{'n_estimators': 100, 'min_samples_split': 10,...",0.99931,0.999179,0.998555,0.999015,0.99931,0.999074,0.000281,4
4,19.002685,1.6982,0.1506,0.028211,50,10,1,sqrt,20.0,False,"{'n_estimators': 50, 'min_samples_split': 10, ...",0.99954,0.999573,0.999245,0.999376,0.999737,0.999494,0.00017,2


# Random Forest com os melhores parametros

In [79]:
# Modelo
random_forest_final = Pipeline([("SMOTE", SMOTE()), ("Random Forest", RandomForestClassifier(**params, random_state=0))])

# Treino
random_forest_final.fit(X_treino, y_treino)

# Previsão
previsao_final = random_forest_final.predict(X_teste)

In [None]:
avaliacao(y_teste, previsao_final)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25375
           1       0.62      0.62      0.62        29

    accuracy                           1.00     25404
   macro avg       0.81      0.81      0.81     25404
weighted avg       1.00      1.00      1.00     25404

