In [14]:
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

except Exception as e:
    print(f"Error al importar pandas: {e}")

In [15]:
dfs = [pd.read_csv(f'parte_{i}.csv') for i in range(1, 6)]

# Unir todas las partes en un solo DataFrame y guardarlo en la variable 'frauds'
frauds = pd.concat(dfs, ignore_index=True)
frauds.sample(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4118116,302,CASH_OUT,242219.7,C1594071807,0.0,0.0,C1304812371,1490424.05,1732643.75,0,0
4367895,310,CASH_OUT,521588.45,C1915349952,1090.08,0.0,C199394671,1404953.92,1926542.37,0,0
2930804,229,CASH_IN,335159.94,C796225233,3145913.96,3481073.9,C1245522854,339202.55,4042.62,0,0
1908292,166,CASH_IN,73683.26,C1304398733,2630090.8,2703774.05,C2124792966,1522605.34,1448922.09,0,0
1247794,134,PAYMENT,16342.17,C392608934,135596.35,119254.17,M900772919,0.0,0.0,0,0


In [17]:
BETTER_COLUMN_NAMES = {
    'step': 'step',
    'type': 'type',
    'amount': 'amount',
    'nameOrig': 'name_origen',
    'oldbalanceOrg': 'old_origen',
    'newbalanceOrig': 'new_origen',
    'nameDest': 'name_destino',
    'oldbalanceDest': 'old_destino',
    'newbalanceDest': 'new_destino',
    'isFraud': 'is_fraud',
    'isFlaggedFraud': 'is_flag_fraud',
}

frauds.rename(columns=BETTER_COLUMN_NAMES, inplace=True)
frauds.sample(5)

Unnamed: 0,step,type,amount,name_origen,old_origen,new_origen,name_destino,old_destino,new_destino,is_fraud,is_flag_fraud
4468564,324,CASH_OUT,247748.46,C385585462,25041.0,0.0,C1459856999,12574.0,260322.46,0,0
758514,38,CASH_OUT,46.64,C1508977213,0.0,0.0,C167092361,2476836.86,2476883.5,0,0
82444,10,PAYMENT,24623.12,C1056260641,0.0,0.0,M1156389446,0.0,0.0,0,0
3757044,279,PAYMENT,686.86,C1183202098,30738.0,30051.14,M269352807,0.0,0.0,0,0
5015759,353,CASH_OUT,20111.4,C392946204,0.0,0.0,C292814238,239694.58,259805.99,0,0


In [18]:
#dropeamos algunas columnas

frauds = frauds.drop(columns=["is_flag_fraud", "name_origen", "name_destino"])
frauds.sample(3)

Unnamed: 0,step,type,amount,old_origen,new_origen,old_destino,new_destino,is_fraud
1817650,163,CASH_IN,217473.34,5037103.62,5254576.96,1005138.48,787665.14,0
3752102,279,PAYMENT,20532.46,238279.34,217746.88,0.0,0.0,0
1677972,159,PAYMENT,6877.03,120663.63,113786.6,0.0,0.0,0


In [19]:
frauds.isnull().sum()

step           0
type           0
amount         0
old_origen     0
new_origen     0
old_destino    0
new_destino    0
is_fraud       0
dtype: int64

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper

In [21]:
# 60% train, 20% test, 20% validation
train, not_train = train_test_split(frauds, test_size=0.4, random_state=42)
validation, test = train_test_split(not_train, test_size=0.5, random_state=42)

train.shape, validation.shape, test.shape

((3817572, 8), (1272524, 8), (1272524, 8))

In [25]:
# Una primer version...
# Definimos el mapper. Recibe una lista de (columna/s, transformers)
mapper = DataFrameMapper([
    (['step'], [StandardScaler()]),
    (['amount'], [StandardScaler()]),
    (['old_origen'], [StandardScaler()]),
    (['new_origen'], [StandardScaler()]),
    (['old_destino'], [StandardScaler()]),
    (['new_destino'], [StandardScaler()]),
    (['type'], [OneHotEncoder()])
])

# Lo entrenamos con train
mapper.fit(train)



In [26]:
# vemos como transforma un sample:
sample = train.sample(5, random_state=42)

# Sample original:
sample

Unnamed: 0,step,type,amount,old_origen,new_origen,old_destino,new_destino,is_fraud
3015412,232,TRANSFER,345647.05,402710.17,57063.12,631070.29,976717.34,0
2546572,206,CASH_IN,237929.67,272293.0,510222.67,0.0,0.0,0
1209164,133,TRANSFER,225431.33,0.0,0.0,3007235.91,3232667.24,0
4979336,352,PAYMENT,14443.18,231110.39,216667.21,0.0,0.0,0
5706116,398,CASH_OUT,73901.36,61842.0,0.0,0.0,73901.36,0


In [27]:
# Sample transformado
mapper.transform(sample)

array([[-0.07993222,  0.27856995, -0.1493829 , -0.2730523 , -0.13761967,
        -0.06705362,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.26262466,  0.09792145, -0.19455857, -0.11800269, -0.32318701,
        -0.33297502,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.77556881,  0.07696099, -0.28887911, -0.29257659,  0.56109604,
         0.54715207,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.76326365, -0.27687878, -0.20882396, -0.21844339, -0.32318701,
        -0.33297502,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 1.08648873, -0.17716385, -0.26745744, -0.29257659, -0.32318701,
        -0.31285461,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ]])

In [28]:
#mostramos los nombres de las columnas que genera el maper
mapper.transformed_names_

['step',
 'amount',
 'old_origen',
 'new_origen',
 'old_destino',
 'new_destino',
 'type_0',
 'type_1',
 'type_2',
 'type_3',
 'type_4']

In [29]:
lr_model_si = Pipeline([
    ('mapper', mapper),
    ('classifier', LogisticRegression(random_state=42)),
])

lr_model_si.fit(train, train.is_fraud)

y_pred = lr_model_si.predict(validation)

y_pred



array([0, 0, 0, ..., 0, 0, 0])

In [31]:
from sklearn import metrics
metrics.f1_score(validation.is_fraud, y_pred)

#print(metrics.classification_report(validation.is_fraud, y_pred))

np.float64(0.5427937915742794)