In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [40]:
columns = [
    "date_created", "e-mail", "external_reference", "operation_id",
    "status", "status_detail", "transaction_amount", "installments",
    "payment_type", "hour_created", "DOW_created", "month_created","Shp_Carrier",
    "Shipping", "bill_zipcode", "shp_zipcode", "fraud_flag"
]

target = ["fraud_flag"]

In [41]:
# Load the data
file_path = Path('Resources/Whole_Collection.csv')
df = pd.read_csv(file_path)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.head()

Unnamed: 0,date_created,e-mail,external_reference,operation_id,status,status_detail,transaction_amount,installments,payment_type,hour_created,DOW_created,month_created,Shp_Carrier,Shipping,bill_zipcode,shp_zipcode,fraud_flag
0,17/05/2022 17:02,34,375285,22439390268,approved,accredited,4549.0,1,debit_card,17,1,5,DHL - Servicio Express (Entrega promedio de 1 ...,139,88660,88660,Not Fraud
1,17/05/2022 17:00,35,375284,22439310665,approved,accredited,4549.0,1,debit_card,17,1,5,DHL - Servicio Express (Entrega promedio de 1 ...,139,88660,88660,Not Fraud
2,17/05/2022 13:23,36,375213,22433076416,rejected,cc_rejected_high_risk,6378.0,3,credit_card,13,1,5,DHL - Servicio Express (Entrega promedio de 1 ...,139,89350,89350,Not Fraud
3,17/05/2022 13:26,37,375217,22433120457,rejected,cc_rejected_high_risk,6378.0,1,credit_card,13,1,5,DHL - Servicio Express (Entrega promedio de 1 ...,139,89350,89350,Not Fraud
4,17/05/2022 15:01,38,375235,22435577370,approved,accredited,6689.0,9,credit_card,15,1,5,Federal Express - Servicio Económico (Entrega ...,0,55067,55067,Not Fraud


In [42]:
df.columns



Index(['date_created', 'e-mail', 'external_reference', 'operation_id',
       'status', 'status_detail', 'transaction_amount', 'installments',
       'payment_type', 'hour_created', 'DOW_created', 'month_created',
       'Shp_Carrier', 'Shipping', 'bill_zipcode', 'shp_zipcode', 'fraud_flag'],
      dtype='object')

In [43]:
# Create our features
X = df.drop("fraud_flag", axis=1)
X = pd.get_dummies(X)

# Create our target
y = df[target]

In [44]:
# Check the balance of our target values
y['fraud_flag'].value_counts()

Not Fraud    19296
Fraud          332
Name: fraud_flag, dtype: int64

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [46]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print('Original train y: ')
print(y_train['fraud_flag'].value_counts())
print('Resampled train y: ')
print(y_resampled['fraud_flag'].value_counts())

Counter(y_resampled)

Original train y: 
Not Fraud    14472
Fraud          249
Name: fraud_flag, dtype: int64
Resampled train y: 
Not Fraud    14472
Fraud        14472
Name: fraud_flag, dtype: int64


Counter({'fraud_flag': 1})

In [47]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [48]:
y_pred = model.predict(X_test)


# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5

In [50]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fraud       0.00      0.00      1.00      0.00      0.00      0.00        83
  Not Fraud       0.98      1.00      0.00      0.99      0.00      0.00      4824

avg / total       0.97      0.98      0.02      0.97      0.00      0.00      4907



In [51]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   0,   83],
       [   0, 4824]], dtype=int64)