## Detecção de fraudes utilizando base de dados do Kaggle

In [1]:
# Importando bibliotecas

import pandas as pd

In [2]:
df = pd.read_csv('./dados/fraud_dataset_example.csv')

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
colunas = {
    'isFraud': 'fraude',
    'isFlaggedFraud':'super_fraude',
    'step':'tempo',
    'type':'tipo',
    'amount':'valor',
    'nameOrig':'cliente1',
    'oldbalanceOrg':'saldo_inicial_c1',
    'newbalanceOrig':'novo_saldo_c1',
    'nameDest':'cliente2',
    'oldbalanceDest':'saldo_inicial_c2',
    'newbalanceDest':'novo_saldo_c2',
}
df = df.rename(columns = colunas)
df.head()


Unnamed: 0,tempo,tipo,valor,cliente1,saldo_inicial_c1,novo_saldo_c1,cliente2,saldo_inicial_c2,novo_saldo_c2,fraude,super_fraude
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tempo,101613.0,8.523457,1.820681,1.0,8.0,9.0,10.0,10.0
valor,101613.0,174090.1,345019.9,0.32,10016.59,53385.41,212498.4,10000000.0
saldo_inicial_c1,101613.0,907175.3,2829575.0,0.0,0.0,20190.47,194715.0,38939424.03
novo_saldo_c1,101613.0,923499.2,2867319.0,0.0,0.0,0.0,219217.76,38946233.02
saldo_inicial_c2,101613.0,881042.8,2399949.0,0.0,0.0,21058.0,591921.7,34008736.98
novo_saldo_c2,101613.0,1183998.0,2797761.0,0.0,0.0,51783.43,1063121.64,38946233.02
fraude,101613.0,0.001141586,0.03376824,0.0,0.0,0.0,0.0,1.0
super_fraude,101613.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.shape

(101613, 11)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101613 entries, 0 to 101612
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   tempo             101613 non-null  int64  
 1   tipo              101613 non-null  object 
 2   valor             101613 non-null  float64
 3   cliente1          101613 non-null  object 
 4   saldo_inicial_c1  101613 non-null  float64
 5   novo_saldo_c1     101613 non-null  float64
 6   cliente2          101613 non-null  object 
 7   saldo_inicial_c2  101613 non-null  float64
 8   novo_saldo_c2     101613 non-null  float64
 9   fraude            101613 non-null  int64  
 10  super_fraude      101613 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 8.5+ MB


In [8]:
df.groupby('fraude').tempo.count()

fraude
0    101497
1       116
Name: tempo, dtype: int64

In [9]:
df.isnull().values.any()

False

In [10]:
df = pd.get_dummies(data=df, columns=['tipo'])
df.head()

Unnamed: 0,tempo,valor,cliente1,saldo_inicial_c1,novo_saldo_c1,cliente2,saldo_inicial_c2,novo_saldo_c2,fraude,super_fraude,tipo_CASH_IN,tipo_CASH_OUT,tipo_DEBIT,tipo_PAYMENT,tipo_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,False,False,False,True,False
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,False,False,False,True,False
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,False,False,False,False,True
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,False,True,False,False,False
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,False,False,False,True,False


In [11]:
df = df.drop(['cliente1', 'cliente2', 'super_fraude'], axis = 1)

## Regressão logística

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics


In [13]:
x = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

In [14]:
from sklearn.model_selection import train_test_split

SEED = 42

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state = SEED)

In [15]:
dt =  DecisionTreeClassifier(max_depth = 5, random_state=SEED)

In [19]:
model = dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

print("Acurácia:",metrics.accuracy_score(y_test, y_pred))
print("Precisão:", metrics.precision_score(y_test, y_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='micro')) 
print("F1:",metrics.f1_score(y_test, y_pred, average='micro'))

Acurácia: 0.38379782711384036
Precisão: 0.38379782711384036
Recall: 0.38379782711384036
F1: 0.38379782711384036
