In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

Primero hacemos una lectura de la tabla, la cual es el resultado del proceso de EDA y ETL de nuestro equipo.

In [2]:
df = pd.read_excel('tabla_completa_.xlsx')

In [3]:
df.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,FraudIndicator,TransactionAmount,Category,AnomalyScore,Amount,CustomerID,Age
0,1,2022-01-01 00:00:00,2701,0,79.413607,Other,0.686699,55.530334,1952,50
1,2,2022-01-01 01:00:00,2070,0,12.053087,Online,0.081749,12.88118,1027,46
2,3,2022-01-01 02:00:00,2238,0,33.310357,Travel,0.023857,50.176322,1955,34
3,4,2022-01-01 03:00:00,2879,0,46.121117,Travel,0.876994,41.634001,1796,33
4,5,2022-01-01 04:00:00,2966,0,54.051618,Other,0.034059,78.122853,1946,18


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   TransactionID      1000 non-null   int64         
 1   Timestamp          1000 non-null   datetime64[ns]
 2   MerchantID         1000 non-null   int64         
 3   FraudIndicator     1000 non-null   int64         
 4   TransactionAmount  1000 non-null   float64       
 5   Category           1000 non-null   object        
 6   AnomalyScore       1000 non-null   float64       
 7   Amount             1000 non-null   float64       
 8   CustomerID         1000 non-null   int64         
 9   Age                1000 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 78.2+ KB


Analizando la información de nuestra tabla, podemos observar que no todos, los valores son numericos. Necesitamos valores numericos en nuestros features, para que se adecuen bien al momento de ser analizados por los algoritmos.

In [5]:
df['Category'].unique()

array(['Other', 'Online', 'Travel', 'Food', 'Retail'], dtype=object)

Para la columna Category podemos hacer One-Hot encoding. Esto distribuirá nuestros datos de tal forma que solo aquellos valores que correspondan a la categoria tengan un valor de 1, y el resto 0.

In [6]:
df = pd.get_dummies(df)

In [7]:
df.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,FraudIndicator,TransactionAmount,AnomalyScore,Amount,CustomerID,Age,Category_Food,Category_Online,Category_Other,Category_Retail,Category_Travel
0,1,2022-01-01 00:00:00,2701,0,79.413607,0.686699,55.530334,1952,50,0,0,1,0,0
1,2,2022-01-01 01:00:00,2070,0,12.053087,0.081749,12.88118,1027,46,0,1,0,0,0
2,3,2022-01-01 02:00:00,2238,0,33.310357,0.023857,50.176322,1955,34,0,0,0,0,1
3,4,2022-01-01 03:00:00,2879,0,46.121117,0.876994,41.634001,1796,33,0,0,0,0,1
4,5,2022-01-01 04:00:00,2966,0,54.051618,0.034059,78.122853,1946,18,0,0,1,0,0


In [8]:
df['Timestamp']

0     2022-01-01 00:00:00
1     2022-01-01 01:00:00
2     2022-01-01 02:00:00
3     2022-01-01 03:00:00
4     2022-01-01 04:00:00
              ...        
995   2022-02-11 11:00:00
996   2022-02-11 12:00:00
997   2022-02-11 13:00:00
998   2022-02-11 14:00:00
999   2022-02-11 15:00:00
Name: Timestamp, Length: 1000, dtype: datetime64[ns]

Ahora, para la columna Timestamp capturamos el valor de la hora y podemos descartar esta columna, ya que la hora, en este dataset, es lo mas significativo que cambia con cada fila.

In [9]:
df['Hour'] = df['Timestamp'].apply(lambda x : x.time().hour)

In [10]:
df.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,FraudIndicator,TransactionAmount,AnomalyScore,Amount,CustomerID,Age,Category_Food,Category_Online,Category_Other,Category_Retail,Category_Travel,Hour
0,1,2022-01-01 00:00:00,2701,0,79.413607,0.686699,55.530334,1952,50,0,0,1,0,0,0
1,2,2022-01-01 01:00:00,2070,0,12.053087,0.081749,12.88118,1027,46,0,1,0,0,0,1
2,3,2022-01-01 02:00:00,2238,0,33.310357,0.023857,50.176322,1955,34,0,0,0,0,1,2
3,4,2022-01-01 03:00:00,2879,0,46.121117,0.876994,41.634001,1796,33,0,0,0,0,1,3
4,5,2022-01-01 04:00:00,2966,0,54.051618,0.034059,78.122853,1946,18,0,0,1,0,0,4


Definimos nuestros features, y nuestros targets.

In [11]:
X = df.drop(['FraudIndicator', 'Timestamp'], axis = 1)
y = df['FraudIndicator']

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   TransactionID      1000 non-null   int64  
 1   MerchantID         1000 non-null   int64  
 2   TransactionAmount  1000 non-null   float64
 3   AnomalyScore       1000 non-null   float64
 4   Amount             1000 non-null   float64
 5   CustomerID         1000 non-null   int64  
 6   Age                1000 non-null   int64  
 7   Category_Food      1000 non-null   uint8  
 8   Category_Online    1000 non-null   uint8  
 9   Category_Other     1000 non-null   uint8  
 10  Category_Retail    1000 non-null   uint8  
 11  Category_Travel    1000 non-null   uint8  
 12  Hour               1000 non-null   int64  
dtypes: float64(3), int64(5), uint8(5)
memory usage: 67.5 KB


In [13]:
y.value_counts()

0    955
1     45
Name: FraudIndicator, dtype: int64

Ahora que todos nuestros valores son de tipo numérico podemos empezar a hacer el análisis mediante distintos algoritmos. Pero primeramente debemos observar que hay un gran desbalance entre los targets que son 0 y aquellos que son 1. Este desbalanace impedirá que nuestros algoritmos puedan aprender a detectar los casos sospechosos (1). Para eso aplicamos un resample de nuestros datos, para crear ese balance.

In [14]:
smote = SMOTE(random_state=42)

In [15]:
X_r, y_r = smote.fit_resample(X, y)

In [16]:
y_r.value_counts()

0    955
1    955
Name: FraudIndicator, dtype: int64

Procedemos a separar nuestro dataset en train y test sets. Esta vez no requeriremos del set de cross validation, ya que no necesitamos un manejo de hiperparametros en nuestros algoritmos.

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_r, y_r, test_size=0.30, random_state=1)

In [18]:
print(f"the shape of the training set (input) is: {X_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the test set (input) is: {X_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")

the shape of the training set (input) is: (1337, 13)
the shape of the training set (target) is: (1337,)

the shape of the test set (input) is: (573, 13)
the shape of the test set (target) is: (573,)


Ahora necesitamos Nomalizar los datos ya que hay una gran diferencia de rango, entre las distintas columnas de nuestros features. Por ejemplo MerchantID esta en el rango de los miles, mientras que otros valores estan muy alejados de este rango.

In [19]:
scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)

scaler_test = StandardScaler().fit(X_test)
x_test = scaler.transform(X_test)

DATA MODELLING

Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
lr = LogisticRegression()
lr_model = lr.fit(x_train, y_train)

In [22]:
y_pred_lr = lr_model.predict(x_test)

In [23]:
print("Classification Report for Logistic Regression: \n", classification_report(y_test, y_pred_lr))

Classification Report for Logistic Regression: 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       286
           1       1.00      0.82      0.90       287

    accuracy                           0.91       573
   macro avg       0.92      0.91      0.91       573
weighted avg       0.92      0.91      0.91       573



NAIVE BAYES

In [24]:
from sklearn.naive_bayes import GaussianNB

In [25]:
gnb = GaussianNB()
gnb_model = gnb.fit(x_train, y_train)

In [26]:
y_pred_gnb = gnb_model.predict(x_test)

In [27]:
print("Classification Report for Naive Bayes: \n", classification_report(y_test, y_pred_gnb))

Classification Report for Naive Bayes: 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       286
           1       1.00      0.82      0.90       287

    accuracy                           0.91       573
   macro avg       0.92      0.91      0.91       573
weighted avg       0.92      0.91      0.91       573



K Nearest Neighborg

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)

In [30]:
y_pred_knn = neigh.predict(x_test)

In [31]:
print("Classification Report for KNN: \n", classification_report(y_test, y_pred_knn))

Classification Report for KNN: 
               precision    recall  f1-score   support

           0       0.98      0.89      0.93       286
           1       0.90      0.98      0.94       287

    accuracy                           0.94       573
   macro avg       0.94      0.94      0.94       573
weighted avg       0.94      0.94      0.94       573



Support Vector Machine

In [32]:
from sklearn.svm import SVC

In [33]:
svm = SVC()
svm_model = svm.fit(x_train, y_train)

In [34]:
y_pred_svc = svm_model.predict(x_test)

In [35]:
print("Classification Report for Support Vector Machine: \n", classification_report(y_test, y_pred_svc))

Classification Report for Support Vector Machine: 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       286
           1       0.97      0.93      0.95       287

    accuracy                           0.95       573
   macro avg       0.95      0.95      0.95       573
weighted avg       0.95      0.95      0.95       573



Decision Trees

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

In [38]:
y_pred_dt=dt_model.predict(x_test)

In [39]:
print("Classification Report for Decision Trees: \n", classification_report(y_test, y_pred_dt))

Classification Report for Decision Trees: 
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       286
           1       0.94      0.95      0.94       287

    accuracy                           0.94       573
   macro avg       0.94      0.94      0.94       573
weighted avg       0.94      0.94      0.94       573



Después de comparar distintos algoritmos, escogemos aquel que tenga la mayor accuracy y f1-score. Finalmente realizamos pruebas en el mismo, con casos individuales para confirmar si realmente esta haciendo una buena predicción.

In [40]:
alg_result_desc = {'Algorithm': ['Log Regression', 'Naive Bayes', 'KNN', 'SVM', 'Decision Trees'],
                  'Accuracy': [accuracy_score(y_test, y_pred_lr), accuracy_score(y_test, y_pred_gnb), accuracy_score(y_test, y_pred_knn), accuracy_score(y_test, y_pred_svc), accuracy_score(y_test, y_pred_dt)],
                   'F1-Score': [f1_score(y_test, y_pred_lr), f1_score(y_test, y_pred_gnb), f1_score(y_test, y_pred_knn), f1_score(y_test, y_pred_svc), f1_score(y_test, y_pred_dt)]}
compare_algorithms = pd.DataFrame(data=alg_result_desc)

In [41]:
print(compare_algorithms)

        Algorithm  Accuracy  F1-Score
0  Log Regression  0.910995  0.902486
1     Naive Bayes  0.910995  0.902486
2             KNN  0.937173  0.940000
3             SVM  0.947644  0.946619
4  Decision Trees  0.944154  0.944828


Segun los resultados, escogemos el algoritmo SVM, el cual nos entrega las mejores métricas.

PRUEBA SINGULAR

In [48]:
x_try = X_test.iloc[2]
print(x_try)
print(f'The target should be: {y_test.iloc[2]}')
arr = x_try.values
reshaped_arr = arr.reshape((1, -1))

TransactionID         150.000000
MerchantID           2727.000000
TransactionAmount      24.965357
AnomalyScore            0.084458
Amount                 18.914684
CustomerID           1607.000000
Age                    29.000000
Category_Food           0.000000
Category_Online         0.000000
Category_Other          0.000000
Category_Retail         0.000000
Category_Travel         0.000000
Hour                    5.000000
Name: 1420, dtype: float64
The target should be: 1


In [49]:
x_try_norm = scaler_test.transform(reshaped_arr)

In [50]:
y_pred_try = svm_model.predict(x_try_norm)
print(f'La prediccion del algoritmo es: {y_pred_try[0]}')

La prediccion del algoritmo es: 1
