## Machine Learning

#### Importando Librerías

In [54]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [55]:
# Load the full transactions table from the Excel workbook (path is relative
# to the notebook's working directory).
df = pd.read_excel('tabla_completa.xlsx')

In [56]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,FraudIndicator,TransactionAmount,Category,AnomalyScore,Amount,CustomerID
0,1,2022-01-01 00:00:00,2701,0,79.413607,Other,0.686699,55.530334,1952
1,2,2022-01-01 01:00:00,2070,0,12.053087,Online,0.081749,12.88118,1027
2,3,2022-01-01 02:00:00,2238,0,33.310357,Travel,0.023857,50.176322,1955
3,4,2022-01-01 03:00:00,2879,0,46.121117,Travel,0.876994,41.634001,1796
4,5,2022-01-01 04:00:00,2966,0,54.051618,Other,0.034059,78.122853,1946


In [57]:
# Inspect column dtypes and non-null counts before any preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   TransactionID      1000 non-null   int64         
 1   Timestamp          1000 non-null   datetime64[ns]
 2   MerchantID         1000 non-null   int64         
 3   FraudIndicator     1000 non-null   int64         
 4   TransactionAmount  1000 non-null   float64       
 5   Category           1000 non-null   object        
 6   AnomalyScore       1000 non-null   float64       
 7   Amount             1000 non-null   float64       
 8   CustomerID         1000 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 70.4+ KB


Comprobamos los tipos de datos: todas las columnas son numéricas excepto `Timestamp` (fecha y hora) y `Category` (texto).

In [58]:
# List the distinct labels of the only text column before encoding it.
df['Category'].unique()

array(['Other', 'Online', 'Travel', 'Food', 'Retail'], dtype=object)

Para la columna Category hacemos One-Hot encoding

In [59]:
# One-hot encode the Category column. Naming the column explicitly documents
# the intent and prevents any other text column from being expanded silently;
# the resulting Category_* columns are the same as with the implicit call.
df = pd.get_dummies(df, columns=['Category'])

In [60]:
# Confirm that Category was replaced by one indicator column per label.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   TransactionID      1000 non-null   int64         
 1   Timestamp          1000 non-null   datetime64[ns]
 2   MerchantID         1000 non-null   int64         
 3   FraudIndicator     1000 non-null   int64         
 4   TransactionAmount  1000 non-null   float64       
 5   AnomalyScore       1000 non-null   float64       
 6   Amount             1000 non-null   float64       
 7   CustomerID         1000 non-null   int64         
 8   Category_Food      1000 non-null   uint8         
 9   Category_Online    1000 non-null   uint8         
 10  Category_Other     1000 non-null   uint8         
 11  Category_Retail    1000 non-null   uint8         
 12  Category_Travel    1000 non-null   uint8         
dtypes: datetime64[ns](1), float64(3), int64(4), uint8(5)
memory usag

Capturamos el valor de la hora y podemos descartar el Timestamp, ya que la hora es lo único que cambia.

In [61]:
# Extract the hour of day with the vectorized datetime accessor instead of a
# per-row Python lambda.
df['Hour'] = df['Timestamp'].dt.hour

In [62]:
# Preview the table again to check the new Hour column.
df.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,FraudIndicator,TransactionAmount,AnomalyScore,Amount,CustomerID,Category_Food,Category_Online,Category_Other,Category_Retail,Category_Travel,Hour
0,1,2022-01-01 00:00:00,2701,0,79.413607,0.686699,55.530334,1952,0,0,1,0,0,0
1,2,2022-01-01 01:00:00,2070,0,12.053087,0.081749,12.88118,1027,0,1,0,0,0,1
2,3,2022-01-01 02:00:00,2238,0,33.310357,0.023857,50.176322,1955,0,0,0,0,1,2
3,4,2022-01-01 03:00:00,2879,0,46.121117,0.876994,41.634001,1796,0,0,0,0,1,3
4,5,2022-01-01 04:00:00,2966,0,54.051618,0.034059,78.122853,1946,0,0,1,0,0,4


In [63]:
# Target: the fraud flag. Features: every remaining column except the raw
# timestamp (its hour has already been extracted into Hour).
# NOTE(review): TransactionID, MerchantID and CustomerID remain in X — confirm
# that identifier columns are really meant to be used as predictors.
y = df['FraudIndicator']
X = df.drop(columns=['FraudIndicator', 'Timestamp'])

In [64]:
# Verify that the feature matrix contains only numeric columns.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   TransactionID      1000 non-null   int64  
 1   MerchantID         1000 non-null   int64  
 2   TransactionAmount  1000 non-null   float64
 3   AnomalyScore       1000 non-null   float64
 4   Amount             1000 non-null   float64
 5   CustomerID         1000 non-null   int64  
 6   Category_Food      1000 non-null   uint8  
 7   Category_Online    1000 non-null   uint8  
 8   Category_Other     1000 non-null   uint8  
 9   Category_Retail    1000 non-null   uint8  
 10  Category_Travel    1000 non-null   uint8  
 11  Hour               1000 non-null   int64  
dtypes: float64(3), int64(4), uint8(5)
memory usage: 59.7 KB


In [65]:
smote = SMOTE(random_state=42)

In [66]:
X_r, y_r = smote.fit_resample(X, y)

In [67]:
y_r.value_counts()

0    955
1    955
Name: FraudIndicator, dtype: int64

Necesitamos Nomalizar los datos ya que hay una gran diferencia de rango

In [68]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_r)

In [69]:
x_train, x_test, y_train, y_test = train_test_split(X_norm, y_r, test_size=0.40, random_state=1)

In [70]:
# Report the dimensions of the train/test inputs and targets as a sanity check
# that rows and labels line up in each partition.
print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")

the shape of the training set (input) is: (1146, 12)
the shape of the training set (target) is: (1146,)

the shape of the test set (input) is: (764, 12)
the shape of the test set (target) is: (764,)


DATA MODELLING

Logistic Regression

In [71]:
from sklearn.linear_model import LogisticRegression

In [72]:
# Fit a logistic-regression classifier with default hyperparameters on the
# training partition; fit() returns the fitted estimator itself.
lr = LogisticRegression()
lr_model = lr.fit(x_train, y_train)

In [73]:
# Predict class labels for the held-out test inputs.
y_pred_lr = lr_model.predict(x_test)

In [74]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for Logistic Regression: \n", classification_report(y_test, y_pred_lr))

Classification Report for Logistic Regression: 
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       391
           1       0.99      0.83      0.90       373

    accuracy                           0.91       764
   macro avg       0.92      0.91      0.91       764
weighted avg       0.92      0.91      0.91       764



XGB

In [75]:
from xgboost import XGBClassifier

In [76]:
# Gradient-boosted classifier limited to 2 trees of depth 3 with learning rate 1.
# NOTE(review): these look like quick-start values rather than tuned settings — confirm.
bst = XGBClassifier(n_estimators=2, max_depth=3, learning_rate=1, objective='binary:logistic')
bst.fit(x_train, y_train)

In [77]:
# Predict class labels for the held-out test inputs.
y_pred_bst = bst.predict(x_test)

In [78]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for XGB: \n", classification_report(y_test, y_pred_bst))

Classification Report for XGB: 
               precision    recall  f1-score   support

           0       0.87      0.98      0.93       391
           1       0.98      0.85      0.91       373

    accuracy                           0.92       764
   macro avg       0.93      0.92      0.92       764
weighted avg       0.93      0.92      0.92       764



NAIVE BAYES

In [79]:
from sklearn.naive_bayes import GaussianNB

In [80]:
# Fit a Gaussian naive Bayes classifier with default settings on the training
# partition; fit() returns the fitted estimator itself.
gnb = GaussianNB()
gnb_model = gnb.fit(x_train, y_train)

In [81]:
# Predict class labels for the held-out test inputs.
y_pred_gnb = gnb_model.predict(x_test)

In [82]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for Naive Bayes: \n", classification_report(y_test, y_pred_gnb))

Classification Report for Naive Bayes: 
               precision    recall  f1-score   support

           0       0.85      0.97      0.91       391
           1       0.96      0.82      0.89       373

    accuracy                           0.90       764
   macro avg       0.91      0.90      0.90       764
weighted avg       0.91      0.90      0.90       764



K Nearest Neighbors

In [83]:
from sklearn.neighbors import KNeighborsClassifier

In [84]:
# k-nearest-neighbours classifier that votes among the 3 closest training points.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)

In [85]:
# Predict class labels for the held-out test inputs.
y_pred_knn = neigh.predict(x_test)

In [86]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for KNN: \n", classification_report(y_test, y_pred_knn))

Classification Report for KNN: 
               precision    recall  f1-score   support

           0       0.97      0.88      0.92       391
           1       0.89      0.97      0.93       373

    accuracy                           0.93       764
   macro avg       0.93      0.93      0.93       764
weighted avg       0.93      0.93      0.93       764



Support Vector Machine

In [87]:
from sklearn.svm import SVC

In [88]:
# Fit a support-vector classifier with default hyperparameters on the training
# partition; fit() returns the fitted estimator itself.
svm = SVC()
svm_model = svm.fit(x_train, y_train)

In [89]:
# Predict class labels for the held-out test inputs.
y_pred_svc = svm_model.predict(x_test)

In [90]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for Support Vector Machine: \n", classification_report(y_test, y_pred_svc))

Classification Report for Support Vector Machine: 
               precision    recall  f1-score   support

           0       0.91      0.96      0.93       391
           1       0.96      0.90      0.93       373

    accuracy                           0.93       764
   macro avg       0.93      0.93      0.93       764
weighted avg       0.93      0.93      0.93       764



Decision Trees

In [91]:
from sklearn.tree import DecisionTreeClassifier

In [92]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

In [93]:
y_pred_dt=dt_model.predict(x_test)

In [94]:
print("Classification Report for Decision Trees: \n", classification_report(y_test, y_pred_dt))

Classification Report for Decision Trees: 
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       391
           1       0.92      0.95      0.94       373

    accuracy                           0.94       764
   macro avg       0.94      0.94      0.94       764
weighted avg       0.94      0.94      0.94       764



Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [96]:
# Random forest of 100 trees, each capped at depth 5, with a fixed seed so the
# fitted ensemble is reproducible.
model_rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model_rf.fit(x_train, y_train)

In [97]:
# Predict class labels for the held-out test inputs.
y_pred_rf = model_rf.predict(x_test)

In [98]:
# Per-class precision, recall and F1 on the test partition.
print("Classification Report for Random Forest: \n", classification_report(y_test, y_pred_rf))

Classification Report for Random Forest: 
               precision    recall  f1-score   support

           0       0.91      0.84      0.88       391
           1       0.85      0.91      0.88       373

    accuracy                           0.88       764
   macro avg       0.88      0.88      0.88       764
weighted avg       0.88      0.88      0.88       764

