In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset CSVs
# loaded further down are read from that mount point.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Folder holding fraudTrain.csv / fraudTest.csv on the mounted Google Drive.
# For a local run, point this at the directory containing the CSVs (e.g. '.').
DATA_DIR = '/content/drive/MyDrive/fraud_dataset'
df_test = pd.read_csv(f'{DATA_DIR}/fraudTest.csv')
df_train = pd.read_csv(f'{DATA_DIR}/fraudTrain.csv')

def pre_processing(X):
    """Build the model-ready feature frame from a raw transactions frame.

    Steps:
      1. One-hot encode ``category`` into ``category_<value>`` columns.
      2. Encode ``gender`` as an integer: 1 for ``'M'``, 0 for anything else.
      3. Drop the identifier, free-text, date and location columns that are
         not used as features.

    Args:
        X: DataFrame containing at least ``category``, ``gender`` and every
            column listed in ``unused_columns`` below.

    Returns:
        A new DataFrame with the remaining columns; ``X`` itself is not
        modified.

    Raises:
        KeyError: if any of the expected columns is missing from ``X``.
    """
    # Columns that are not used as model features.
    unused_columns = [
        'merchant', 'first', 'last', 'street', 'zip', 'unix_time',
        'Unnamed: 0', 'trans_num', 'cc_num', 'city',
        'trans_date_trans_time', 'state', 'job', 'merch_lat', 'merch_long',
        'lat', 'long', 'dob',
    ]

    # get_dummies returns a new frame, so the caller's data stays untouched.
    encoded = pd.get_dummies(X, columns=['category'], prefix='category')

    # Vectorised comparison instead of a per-row Python lambda.
    encoded['gender'] = encoded['gender'].eq('M').astype(int)

    return encoded.drop(columns=unused_columns)

df_train_pp = pre_processing(df_train.copy())
df_test_pp = pre_processing(df_test.copy())

# The two splits are one-hot encoded independently, so a category that is
# absent from one of them would yield different column sets. Align the test
# frame to the training columns (missing dummies become 0, unseen ones are
# dropped); this is a no-op when the columns already match.
df_test_pp = df_test_pp.reindex(columns=df_train_pp.columns, fill_value=0)


In [None]:
# Separate the features from the `is_fraud` target for each split.
x_train = df_train_pp.drop(columns='is_fraud')
y_train = df_train_pp['is_fraud']
x_test = df_test_pp.drop(columns='is_fraud')
y_test = df_test_pp['is_fraud']

# Over-sample the minority class of the training split only (the test split is
# left as-is). sampling_strategy=0.4 asks for a minority/majority ratio of 0.4
# after resampling. random_state is fixed so the synthetic samples -- and every
# downstream result -- are reproducible across runs.
# NOTE(review): SMOTE interpolates between neighbours on the unscaled features
# (scaling happens in a later cell) -- confirm this ordering is intended.
smt = SMOTE(sampling_strategy=0.4, random_state=42)
x_train, y_train = smt.fit_resample(x_train, y_train)



In [None]:
%%time
# Under-sample the training data with NearMiss (default settings), applied on
# top of the SMOTE-resampled x_train / y_train; the test split is not touched.
# NOTE(review): this cell overwrites x_train / y_train, so re-running it on its
# own resamples already-resampled data -- re-run from the split/SMOTE cell.
# NOTE(review): the recorded run took ~54 min wall time; consider caching the
# resampled arrays to keep a full re-run feasible.
nr=NearMiss()
x_train, y_train = nr.fit_resample(x_train, y_train)

CPU times: user 1h 30min 8s, sys: 4.64 s, total: 1h 30min 13s
Wall time: 54min 23s


In [None]:
# Standardise the features: the scaler learns its statistics from the training
# data only and the same fitted transformation is then applied to the test data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
%%time
param_grid = {
    "criterion": ['entropy', 'gini'],
    'n_estimators': [50 , 100, 150],
    'max_depth': [3, 5, 15],
    'min_samples_split': [5, 10, 20],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid, cv=3, scoring='accuracy')

grid_search.fit(x_train, y_train)
print("Melhores Hiperparâmetros:", grid_search.best_params_)

best_rf = grid_search.best_estimator_
best_rf.fit(x_train, y_train)
y_pred_best_rf = best_rf.predict(x_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print("Acurácia com Melhores Hiperparâmetros:", accuracy_best_rf)
print("\nClassificação por Random Forest:\n", classification_report(y_test, y_pred_best_rf))

Melhores Hiperparâmetros: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 50}
Acurácia com Melhores Hiperparâmetros: 0.9918969839073345

Classificação por Random Forest:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.30      0.85      0.45      2145

    accuracy                           0.99    555719
   macro avg       0.65      0.92      0.72    555719
weighted avg       1.00      0.99      0.99    555719

CPU times: user 5h 55min 46s, sys: 30.4 s, total: 5h 56min 17s
Wall time: 5h 57min 19s
