In [None]:
%pip install imblearn


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import imblearn
from imblearn import under_sampling, over_sampling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, precision_recall_curve, auc, mean_absolute_error ,mean_squared_error, median_absolute_error 
from sklearn.svm import LinearSVC
from sklearn.svm import SVC 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE


## Importamos y exploramos el Dataset

In [None]:
# Load the raw transactions CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='DataSet_Fraud.csv')


### Column Name |   Description

step	|   represents a unit of time where 1 step equals 1 hour.

type	|   type of online transaction.

amount	|   the amount of the transaction.

nameOrig	|   customer starting the transaction.

oldbalanceOrg	 |   balance before the transaction.

newbalanceOrig	|   balance after the transaction.

nameDest	|   recipient of the transaction.

oldbalanceDest	|   initial balance of recipient before the transaction.

newbalanceDest	 |   the new balance of recipient after the transaction.

isFraud	|   fraud transaction.

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Number of rows per value of the target column.
df.value_counts(subset=['isFraud'])

In [None]:
# Print the shape of the dataset as a (rows, columns) tuple

print("Nro. de Filas y columnas ", df.shape )

In [None]:
# Check the dtypes of all columns

df.info()

In [None]:
# Look for missing values: count of nulls per column.
df.isnull().sum()

Eliminaríamos la columna `Id`, que no nos interesa; el paso está deshabilitado (la celda siguiente está comentada).

In [None]:
# Disabled step: the drop of an 'Id' column is commented out, so this cell does nothing.
#df.drop(['Id'], axis=1, inplace=True)

In [None]:
# Keep only the rows labelled as fraud, then summarise their numeric columns
# rounded to whole units.
fraudes = df.loc[df['isFraud'] == 1]
fraudes.describe().round()


In [None]:
# How many fraud rows there are for each transaction type.
fraud_type_counts = fraudes['type'].value_counts()
print('De los fraudes se detectaron: \n\n', fraud_type_counts)

## Analizamos y modificamos los datos

In [None]:
# One-hot encode the transaction type into type_* indicator columns.
df_dummy = pd.get_dummies(df, columns=['type'])

In [None]:
# Preview the first five encoded rows (head() defaults to 5).
df_dummy.head()

Se ve que en muy pocos casos los destinatarios se repiten 2 veces y nunca se da una 3.ª vez. Es información con alta cardinalidad y no aporta gran valor, salvo que se quiera armar un registro de cuentas fraudulentas; en ese caso sí aplican los nombres de cuenta.

In [None]:
# Remove the account-name columns and the isFlaggedFraud column.
unused_columns = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df_dummy_drop = df_dummy.drop(columns=unused_columns)

In [None]:
# Annotated heatmap of the pairwise correlations between all remaining columns,
# with the colour scale pinned to the full [-1, 1] range.
fig, axes = plt.subplots(figsize=(18, 8))
corr_matrix = df_dummy_drop.corr()
sns.heatmap(corr_matrix, annot=True, ax=axes, cmap='icefire', vmin=-1, vmax=1)

In [None]:
# Correlation of every column with the target, largest value first.
corr_with_target = df_dummy_drop.corr()['isFraud']
corr_with_target.sort_values(ascending=False)

Con estas variables es muy difícil predecir si es fraude, ya que no tienen correlación lineal con la variable de salida.

In [None]:
# Engineered features:
#   emptied  - 1 when the origin balance after the transaction is exactly 0, else 0
#   origDiff - old minus new balance of the origin account
#   destDiff - new minus old balance of the destination account
origin_is_empty = df_dummy_drop['newbalanceOrig'].eq(0)
df_dummy_drop['emptied'] = np.where(origin_is_empty, 1, 0)
df_dummy_drop['origDiff'] = df_dummy_drop['oldbalanceOrg'].sub(df_dummy_drop['newbalanceOrig'])
df_dummy_drop['destDiff'] = df_dummy_drop['newbalanceDest'].sub(df_dummy_drop['oldbalanceDest'])

In [None]:
# Current column names as a plain Python list.
list(df_dummy_drop.columns)

In [None]:
# Subset with only the fraud rows, used for the distribution plots.
DF = df_dummy_drop.loc[df_dummy_drop['isFraud'] == 1]

In [None]:
# Put the fraud subset's columns in a fixed order.
# reindex (unlike plain selection) creates any listed column that is missing, filled with NaN.
ordered_columns = [
    'step',
    'amount',
    'emptied',
    'origDiff',
    'destDiff',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
    'type_CASH_IN',
    'type_CASH_OUT',
    'type_DEBIT',
    'type_PAYMENT',
    'type_TRANSFER',
]
DF = DF.reindex(columns=ordered_columns)


In [None]:
# For each selected feature of the fraud subset, draw one figure with a boxplot
# on the left and a histogram with a KDE overlay on the right.
columns = ['step', 'amount', 'emptied', 'origDiff', 'destDiff']
for column in columns:
    label = column.title()
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 4))
    box_axis, hist_axis = ax
    sns.boxplot(data=DF, x=column, ax=box_axis)
    box_axis.set_title(f'{label} Boxplot')
    sns.histplot(data=DF, x=column, kde=True, ax=hist_axis)
    hist_axis.set_title(f'{label} Histogram')
    fig.suptitle(f'{label} Distribution'.replace('_', ' '), fontsize=15, color='brown')

In [None]:
# Fraud rows where the transaction amount differs from the balance removed at the origin.
DF[DF.amount != DF.origDiff].head(60)

In [None]:
# Alternative view (disabled): share of fraud rows whose 'emptied' flag is 1.
#(DF.emptied==1).value_counts().plot.pie(autopct='%1.1f%%')
# Share of fraud rows where the amount equals the balance removed at the origin.
amount_matches_diff = DF.amount.eq(DF.origDiff)
amount_matches_diff.value_counts().plot(kind='pie', autopct='%1.1f%%')

Analizo los casos fraudulentos:

*   Están distribuidos de forma uniforme a lo largo del mes (743 h o steps ≈ 31 días)
*   En el 98.1% de los casos la cuenta origen es vaciada
*   Menos del 1% de las veces el monto de la transacción y el saldo descontado no coinciden


In [None]:
# Share of fraud vs. non-fraud rows in the full dataset.
is_fraud = df.isFraud.eq(1)
is_fraud.value_counts().plot(kind='pie', autopct='%1.3f%%')

Se está trabajando con una variable de salida muy desbalanceada; por lo tanto, se buscará balancear las clases para entrenar el modelo y detectar con mayor precisión la clase fraude = 1.

In [None]:
# Drop the four raw balance columns; step, amount, the engineered features,
# the target and the type_* indicators are kept.
raw_balance_columns = [
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
df_dummy_drop = df_dummy_drop.drop(columns=raw_balance_columns)

In [None]:
# Separate the target from the features and hold out 10% of the rows for testing.
# stratify=y keeps the class proportions equal in both splits; the fixed seed
# makes the split repeatable.
y = df_dummy_drop['isFraud']
x = df_dummy_drop.drop(columns='isFraud')
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=1000,
    stratify=y,
)
print(f'Train data shape : {x_train.shape}')
print(f'Distribución de la variable salida : \n{y_train.value_counts(normalize=True)}')
print(f'Test data shape : {x_test.shape}')
print(f'Distribución de la variable salida : \n{y_test.value_counts(normalize=True)}')

Transformamos los datos eliminando la mediana y escalamos de acuerdo con el método utilizado:
*   MaxAbsScaler
*   MinMaxScaler
*   RobustScaler - Utilizamos este escalador por no estar influenciado por valores atípicos
*   StandardScaler

Para más info, ver: Comparar el efecto de diferentes escaladores en los datos con valores atípicos

https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#maxabsscaler

In [None]:
# Fit the scaler on the training split only, so no test-set statistics leak into it,
# then apply that same fitted transform to the test split. Without the second step
# the model would be trained on scaled features and evaluated on unscaled ones.
# Both results are NumPy arrays (the DataFrame column labels are not kept).
rs = RobustScaler()
x_train = rs.fit_transform(x_train)
x_test = rs.transform(x_test)


## Entrenando el modelo

In [None]:
# Sanity check right before training: size of each split and class proportions.
print("X Train : ", x_train.shape)
print("X Test  : ", x_test.shape)
print("Y Train : ", y_train.shape)
print("Y Test  : ", y_test.shape)
train_balance = y_train.value_counts(normalize=True)
test_balance = y_test.value_counts(normalize=True)
print('\nY las etiquetas se distribuyen entre train y test de la siguiente forma:\n')
print('Train: \n', train_balance, '\n Test: \n', test_balance)

In [None]:
# Baseline decision tree. A fixed random_state makes the fitted tree, and therefore
# the scores below, repeatable across runs.
# NOTE(review): x_test must be in the same feature scale as x_train for these
# numbers to be meaningful.
dt = DecisionTreeClassifier(random_state=1000)
dt.fit(x_train, y_train)

# Plain accuracy, kept for reference. With a heavily imbalanced target it is close
# to 1 even for a model that never predicts fraud, so it says little on its own.
print(dt.score(x_test, y_test))

# Per-class precision, recall and F1: the figures that show how well the
# minority (fraud = 1) class is actually detected.
print(classification_report(y_test, dt.predict(x_test), digits=4))