In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import sklearn as sk
import seaborn as sb
import matplotlib.pyplot as pp
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, make_scorer

from numpy import mean

In [2]:
train_id = pd.read_csv( 'train_identity.csv' )
train_tran = pd.read_csv( 'train_transaction.csv' )
FraudDetection = pd.merge( train_tran, train_id, on='TransactionID', how='left')

In [3]:
FraudDetection.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,3282270.0,0.03499,7372311.0,135.027176,9898.734658,362.555488,153.194925,199.278897,290.733794,86.80063,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
std,170474.4,0.183755,4617224.0,239.162522,4901.170153,157.793246,11.336444,41.244453,101.741072,2.690623,...,30.37536,1.561302,141.095343,152.160327,198.847038,6.897665,2.372447,97.461089,32.101995,3.737502
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3134635.0,0.0,3027058.0,43.321,6019.0,214.0,150.0,166.0,204.0,87.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,3282270.0,0.0,7306528.0,68.769,9678.0,361.0,150.0,226.0,299.0,87.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,3577539.0,1.0,15811130.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [4]:
for col in FraudDetection.columns:
    Column = FraudDetection[col]
    if Column.dtype == 'object':
        if FraudDetection[col].isnull().mean(): FraudDetection = FraudDetection.drop( columns = col )
        else: FraudDetection[col].fillna( Column.mode()[0], inplace=True )
    else:
        FraudDetection[col].fillna( Column.mean(), inplace=True )
FraudDetection

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card5,addr1,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
0,2987000,0,86400,68.50,W,13926,362.555488,150.0,142.0,315.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
1,2987001,0,86401,29.00,W,2755,404.000000,150.0,102.0,325.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
2,2987002,0,86469,59.00,W,4663,490.000000,150.0,166.0,330.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
3,2987003,0,86499,50.00,W,18132,567.000000,150.0,117.0,476.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
4,2987004,0,86506,50.00,H,4497,514.000000,150.0,102.0,420.0,...,166.000000,14.237337,542.000000,144.000000,368.26982,16.002708,12.800927,329.608924,149.070308,32.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,362.555488,150.0,226.0,272.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
590536,3577536,0,15811049,39.50,W,10444,225.000000,150.0,224.0,204.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
590537,3577537,0,15811079,30.95,W,12037,595.000000,150.0,224.0,231.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
590538,3577538,0,15811088,117.00,W,7826,481.000000,150.0,224.0,387.0,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597


In [5]:
for col in FraudDetection.select_dtypes( include = ['object'] ).columns:
    encoder = OneHotEncoder( sparse_output = False ).set_output( transform = 'pandas' )
    transformed = encoder.fit_transform(FraudDetection[[col]])
    encoded = pd.DataFrame(transformed, columns = encoder.get_feature_names_out([col]))
    FraudDetection = pd.concat([FraudDetection, encoded], axis=1).drop(columns=[col])
FraudDetection

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_22,id_24,id_25,id_26,id_32,ProductCD_C,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W
0,2987000,0,86400,68.50,13926,362.555488,150.0,142.0,315.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
1,2987001,0,86401,29.00,2755,404.000000,150.0,102.0,325.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
2,2987002,0,86469,59.00,4663,490.000000,150.0,166.0,330.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
3,2987003,0,86499,50.00,18132,567.000000,150.0,117.0,476.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
4,2987004,0,86506,50.00,4497,514.000000,150.0,102.0,420.0,87.0,...,16.002708,12.800927,329.608924,149.070308,32.000000,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,6550,362.555488,150.0,226.0,272.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
590536,3577536,0,15811049,39.50,10444,225.000000,150.0,224.0,204.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
590537,3577537,0,15811079,30.95,12037,595.000000,150.0,224.0,231.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0
590538,3577538,0,15811088,117.00,7826,481.000000,150.0,224.0,387.0,87.0,...,16.002708,12.800927,329.608924,149.070308,26.508597,0.0,0.0,0.0,0.0,1.0


In [6]:
cor = FraudDetection.corr()
cor

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_22,id_24,id_25,id_26,id_32,ProductCD_C,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W
TransactionID,1.000000,0.014166,0.998280,0.012025,0.010122,-0.019757,-0.010079,-0.023822,-0.000351,0.054247,...,5.214336e-03,-3.644656e-03,2.038864e-03,1.354242e-03,-2.527309e-02,-0.008696,-0.170191,-0.120515,0.013181,0.159403
isFraud,0.014166,1.000000,0.013103,0.011320,-0.013640,0.003353,0.154004,-0.033410,0.004449,-0.024159,...,1.657863e-02,-2.587865e-04,4.719024e-03,1.392775e-02,2.834478e-02,0.161442,0.016784,0.004030,0.018515,-0.135549
TransactionDT,0.998280,0.013103,1.000000,0.011920,0.010625,-0.019062,-0.011208,-0.024051,-0.000048,0.049232,...,5.056965e-03,-2.953697e-03,1.993611e-03,1.748668e-03,-2.685596e-02,-0.010214,-0.163499,-0.117985,0.015869,0.154717
TransactionAmt,0.012025,0.011320,0.011920,1.000000,-0.005725,0.015960,-0.109595,0.003041,-0.007337,0.027991,...,-4.442348e-03,3.210801e-03,6.846694e-04,9.806548e-04,-7.430793e-03,-0.139600,-0.062948,0.036336,-0.044301,0.129420
card1,0.010122,-0.013640,0.010625,-0.005725,1.000000,0.004919,0.002961,-0.093323,0.019124,-0.000056,...,1.646926e-03,5.395147e-03,-9.934759e-04,7.669902e-03,-3.544305e-03,0.004041,-0.000025,-0.012091,0.003722,0.002637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ProductCD_C,-0.008696,0.161442,-0.010214,-0.139600,0.004041,0.057082,0.780768,-0.167281,0.002495,-0.194218,...,6.982903e-02,1.206892e-03,1.660946e-02,2.219096e-02,4.932749e-15,1.000000,-0.088175,-0.094608,-0.051346,-0.618477
ProductCD_H,-0.170191,0.016784,-0.163499,-0.062948,-0.000025,0.044051,-0.069288,0.029223,0.009902,-0.017697,...,-3.537861e-02,5.935602e-03,-1.922975e-02,2.078313e-03,9.706035e-02,-0.088175,1.000000,-0.063555,-0.034493,-0.415478
ProductCD_R,-0.120515,0.004030,-0.117985,0.036336,-0.012091,0.057485,-0.075613,-0.009858,0.007326,0.030820,...,-3.769537e-02,1.722255e-02,-6.435468e-03,-1.101469e-02,-1.611702e-02,-0.094608,-0.063555,1.000000,-0.037009,-0.445786
ProductCD_S,0.013181,0.018515,0.015869,-0.044301,0.003722,0.025896,-0.039743,-0.029775,0.030454,0.011329,...,-3.612626e-02,-4.290262e-02,4.839384e-03,-3.520851e-02,-1.321600e-01,-0.051346,-0.034493,-0.037009,1.000000,-0.241940


In [20]:
x = FraudDetection.drop( columns = 'isFraud' )
y = FraudDetection['isFraud']

y_corrs = cor['isFraud'].drop('isFraud').abs()
columns = y_corrs[ y_corrs < 0.2 ].index
x.drop( columns = columns, inplace = True )
x

Unnamed: 0,V44,V45,V52,V86,V87,V188,V189,V200,V201,V242,V244,V246,V257,V258
0,1.083891,1.120779,0.182695,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
1,1.000000,1.000000,0.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
2,1.000000,1.000000,0.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
3,1.000000,1.000000,0.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
4,1.083891,1.120779,0.182695,1.064885,1.099456,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,1.083891,1.120779,0.182695,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
590536,1.000000,1.000000,0.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
590537,1.000000,1.000000,0.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351
590538,1.000000,1.000000,1.000000,1.000000,1.000000,1.014755,1.038314,1.119977,1.159106,1.113463,1.118562,1.183723,1.250993,1.34351


In [24]:
import pickle

with open( 'variables.pkl', 'wb' ) as f:
    pickle.dump( FraudDetection, f )
    pickle.dump( x, f )
    pickle.dump( cor, f )

In [15]:
x

Unnamed: 0,V189
1773,1.038314
1774,1.038314
1775,1.038314
1776,1.038314
1777,1.038314
...,...
590535,1.038314
590536,1.038314
590537,1.038314
590538,1.038314


In [23]:
while True:
    if len(x.columns) == 1: break
    vif = pd.DataFrame()
    vif['Variable'] = x.columns
    vif['Value'] = [ variance_inflation_factor( x.values, i ) for i in range(x.shape[1]) ]
    if vif.max()['Value'] < 5: break
    var = vif.max()['Variable']
    val = vif.max()['Value']
    print( f'Se elimina {var} con un VIF: {val}' )
    x = x.drop( columns = vif.max()['Variable'] )
x

Unnamed: 0,V188
0,1.014755
1,1.014755
2,1.014755
3,1.014755
4,1.000000
...,...
590535,1.014755
590536,1.014755
590537,1.014755
590538,1.014755


## Modelos de Clasificación

### k-NN

Dada la naturaleza de los modelos, se partirá el set en entrenamiento y test con tal de encontrar los mejores parámetros para ambas regresiones.

In [32]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
x_train, x_test, y_train, y_test = train_test_split( x, y )

In [34]:
pipeline = Pipeline([('classifier', KNeighborsClassifier())])

param_grid = [
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [ i for i in range( 1, 7 ) ]
    },
    {
        'classifier ': [LogisticRegression()],
        'classifier__C': [ 10**i for i in range(-2,3) ]
    }
]

gs = GridSearchCV( pipeline, param_grid, cv = 3, scoring = 'roc_auc' )
gs.fit( x_train, y_train )

print(gs.best_params_)
print(gs.best_score_)
print(f'Score: {gs.score( x_test, y_test )}')

{'classifier': KNeighborsClassifier(), 'classifier__n_neighbors': 3}
0.5681841357317405
Score: 0.583483503900651


De aquí obtemenos que el mejor modelo es un 3-nn con un AUC medio de 0.57 y 0.58 para el set de prueba. Ahora bien, analizemos mejor el modelo realizado.

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

y_pred = gs.predict( x_test )

pp.figure( figsize = (16,9) )
sb.heatmap( confusion_matrix( y_test, y_pred, annot = True, fmt = 'd', cbar = False ) )
pp.xlabel('Categoría predicha')
pp.ylabel('Categoría real')
pp.title('Matriz de confusión del modelo 3-NN')
pp.show()

In [None]:
gs