### ITESM Instituto Tecnológico de Estudios Superiores de Monterrey
### Course:     MLOps Machine Learning Operations
#### Teacher:   Carlos Mejia
#### Student:   Francisco Javier Torres Zenón  A01688757

### References
* Dataset and baseline notebook copied from 
[Online Payments Fraud Detection Dataset | Kaggle](https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset) 
* Dataset: 
https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset/download?datasetVersionNumber=1
* Original Baseline notebook: https://www.kaggle.com/code/nehahatti/online-payments-fraud-detection-project/notebook

### About this notebook
This notebook was taken from [Kaggle](http://www.kaggle.com) and has been rewriten 
for better reading and understanding.

1. The task is to predict online payment fraud, given a number of features from online 
transfer/deposits transactions.
2. On Kaggle there were several notebooks related to this dataset
(Decision Tree, Logistic Regresion, KNN, Gradient Boosting Classifier).
3. As a Baseline I choose one of the most accurated and simpler one, a notebook using
 the Decision Tree algorithm.

#### 1. SETUP 
Imports and configuration files needed to run this notebook


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import joblib
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# import os # This is an unused import

#### 2. Data Load
Instructions to load the dataset.

Please unload the dataset from: 
* Dataset: https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset/download?datasetVersionNumber=1

and unzip the file ```PS_20174392719_1491204439457_log.csv``` in the folder ```../files/```

### BEFORE

```python
INPUT_FILES_PATH = "./Refactor/mlops_project/mlops_project/data/"
INPUT_FILE_NAME = "PS_20174392719_1491204439457_log.csv"
INPUT_FILE = INPUT_FILES_PATH + INPUT_FILE_NAME
Original_python_DIRECTORY='/Users/francisco.torres/Documents/GitHub/MLOps_project/'

#Change location to the refactored directory
print(os.getcwd())
os.chdir(Original_python_DIRECTORY)
print(os.getcwd())

Transactions_df = pd.read_csv(INPUT_FILE)
```
### NOW

In [None]:
# root folder

# Add the parent directory to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)


class RetrieveURLZIP_ExtractFile:
    """
    A class for retrieving a ZIP file from a given URL, UNZIP it on a ./Data/ directory for further analysis.
    Result

    Parameters:
        url (str): The URL from which the data will be loaded.
    Attributes:
        url (str): The URL from which the data will be loaded.

    Example usage:
    ```
    URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
    data_retriever = RetrieveURLZIP_ExtractFile(URL)
    result = data_retriever.retrieve_data()
    print(result)
    ```
    """

    DATASETS_DIR = "./data/"  # Directory where data will be saved.
    RETRIEVED_DATA = "retrieved_data.csv"  # File name for the retrieved data.

    def __init__(self, url):
        self.url = url
        self.logfile = MyLogger("RetrieveURLZIP_ExtractFile", logging.DEBUG, __name__)

    def retrieve_data(self):
        """
        Retrieves data from the specified URL, processes it, and stores it in a CSV file.

        Returns:
            str: A message indicating the location of the stored data.
        """

        DATASETS_DIR = "./data/"  # Directory where data will be unzip.

        # Create directory if it does not exist
        if not os.path.exists(DATASETS_DIR):
            os.makedirs(DATASETS_DIR)
            #    print(f"Directory '{DATASETS_DIR}' created successfully.")
            self.logfile.debug(f"Directory '{DATASETS_DIR}' created successfully.")
        else:
            self.logfile.debug(f"Directory '{DATASETS_DIR}' already exists.")

        # currentdir= os.curdir()

        # Retrieve zip file from specific URL
        # Unzip file to DATASET_DIR directory

        with urlopen(self.url) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(DATASETS_DIR)
        ret = f"Data unzipped in {self.DATASETS_DIR}"
        self.logfile.info(ret)
        return ret


# Usage Example:
# URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# data_retriever = DataRetriever(URL)
# result = data_retriever.retrieve_data()
# print(result)


class DataRetriever:
    """
    A class for retrieving data from a given FILE and processing it for further analysis.

    Parameters:
        csv_filename (str): The path+filename from which the data will be loaded.

    Attributes:
        csv_filename (str): The path+filename from which the data will be loaded.

    Example usage:
    ```
    CSV_FILENAME = './data/PS_log.csv'
    data_retriever = DataRetriever(CSV_FILENAME)
    result = data_retriever.retrieve_data()
    print(result)
    ```
    """

    DROP_COLS = [
        "step",
        "nameOrig",
        "nameDest",
        "oldbalanceDest",
        "newbalanceDest",
        "isFlaggedFraud",
    ]

    DATASETS_DIR = "./data/"  # Directory where data will be saved.
    RETRIEVED_DATA = "retrieved_data.csv"  # File name for the retrieved data.

    def __init__(self, csvfile):
        self.csvfile = csvfile

    def retrieve_data(self):
        """
        Retrieves data from the specified URL, processes it, and stores it in a CSV file.

        Returns:
            str: A message indicating the location of the stored data.
        """
        # Loading data from specific URL
        data = pd.read_csv(self.csvfile)

        # Drop irrelevant columns
        data.drop(self.DROP_COLS, axis=1, inplace=True)

        # Transform categorical attribute 'type'
        # data["type"] = data["type"].map(
        #    {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
        # )

        # Create directory if it does not exist
        if not os.path.exists(self.DATASETS_DIR):
            os.makedirs(self.DATASETS_DIR)
            print(f"Directory '{self.DATASETS_DIR}' created successfully.")
        else:
            print(f"Directory '{self.DATASETS_DIR}' already exists.")

        # Save data to CSV file
        data.to_csv(self.DATASETS_DIR + self.RETRIEVED_DATA, index=False)

        return f"Data stored in {self.DATASETS_DIR + self.RETRIEVED_DATA}"


# Usage Example:
# URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# data_retriever = DataRetriever(URL)
# result = data_retriever.retrieve_data()
# print(result)

In [8]:
Transactions_df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [9]:
Transactions_df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [10]:
Transactions_df.type.value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [11]:
type = Transactions_df["type"].value_counts()
type

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [12]:
# print transaction type counts
transaction = type.index
print(transaction)
quantity = type.value_counts()
print(quantity)

Index(['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT'], dtype='object', name='type')
count
2237500    1
2151495    1
1399284    1
532909     1
41432      1
Name: count, dtype: int64


In [13]:
# show dataframe
Transactions_df.describe

<bound method NDFrame.describe of          step      type      amount     nameOrig  oldbalanceOrg  \
0           1   PAYMENT     9839.64  C1231006815      170136.00   
1           1   PAYMENT     1864.28  C1666544295       21249.00   
2           1  TRANSFER      181.00  C1305486145         181.00   
3           1  CASH_OUT      181.00   C840083671         181.00   
4           1   PAYMENT    11668.14  C2048537720       41554.00   
...       ...       ...         ...          ...            ...   
6362615   743  CASH_OUT   339682.13   C786484425      339682.13   
6362616   743  TRANSFER  6311409.28  C1529008245     6311409.28   
6362617   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
6362618   743  TRANSFER   850002.52  C1685995037      850002.52   
6362619   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
0             160296.36  M1979787155            0.00            0.00        0   

#### 4. Data transformation
Data cleaning, data transformation.

#### BEFORE
```python
# Transform categorical attribute 'type'
Transactions_df["type"] = Transactions_df["type"].map(
    {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
)
```
#### NOW

In [14]:
class Change_TransactionType(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer to change Transaction type from categorical to numerical

    Parameters:
        None

    Attributes:
        variable (str): The name of the column (variable) from which letters will be extracted.

    Methods:
        fit(X, y=None):
            This method does not perform any actual training or fitting.
            It returns the transformer instance itself.

        transform(X):
            This will change from categorical to numerical

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer
    extractor = ExtractLetters()

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('extractor', extractor),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """

    def __init__(self):
        """
        Initialize the ExtractLetters transformer.

        Parameters:
            None
        """
        self.variable = "type"
        self.logfile = MyLogger("Change_TransactionType", logging.DEBUG, __name__)

    def fit(self, X, y=None):
        """
        This method does not perform any actual training or fitting, as it is not necessary for this transformer.
        It returns the transformer instance itself.

        Parameters:
            X (pd.DataFrame): Input data to be transformed. Not used in this method.
            y (pd.Series or np.array, optional): Target variable. Not used in this method.

        Returns:
            self (type): The transformer instance.
        """
        return self

    def transform(self, X):
        """


        Parameters:
            X (pd.DataFrame): Input data to be transformed.

        Returns:
            X_transformed (pd.DataFrame): Transformed DataFrame with letters extracted from the specified variable.
        """
        self.logfile.debug("Class transformation for transaction TYPE")
        X = X.copy()
        X[self.variable] = X[self.variable].map(
            {"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5}
        )
        return X



In [15]:
# Transactions_df['isFraud'] = Transactions_df['isFraud'].map({0:'No_Fraud', 1:'Fraud'})

# select the numerical attributes

Transactions_df = Transactions_df[
    [
        "type",
        "amount",
        "oldbalanceOrg",
        "newbalanceOrig",
        "oldbalanceDest",
        "newbalanceDest",
        "isFraud",
    ]
]

In [16]:
Transactions_df.head(10)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,2,9839.64,170136.0,160296.36,0.0,0.0,0
1,2,1864.28,21249.0,19384.72,0.0,0.0,0
2,4,181.0,181.0,0.0,0.0,0.0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1
4,2,11668.14,41554.0,29885.86,0.0,0.0,0
5,2,7817.71,53860.0,46042.29,0.0,0.0,0
6,2,7107.77,183195.0,176087.23,0.0,0.0,0
7,2,7861.64,176087.23,168225.59,0.0,0.0,0
8,2,4024.36,2671.0,0.0,0.0,0.0,0
9,5,5337.77,41720.0,36382.23,41898.0,40348.79,0


In [17]:
correlation_summary = Transactions_df.corr()
correlation_summary

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
type,1.0,0.198987,0.260418,0.270669,0.066255,0.079111,0.016171
amount,0.198987,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688
oldbalanceOrg,0.260418,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154
newbalanceOrig,0.270669,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148
oldbalanceDest,0.066255,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885
newbalanceDest,0.079111,0.459304,0.042029,0.041837,0.976569,1.0,0.000535
isFraud,0.016171,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0


In [18]:
correlation_summary["isFraud"].sort_values(ascending=False)

isFraud           1.000000
amount            0.076688
type              0.016171
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

#### 5. Model training
* Load sklearn Decision Tree Algorithm
* Split the dataset
* Train the model

#### BEFORE
```python
numerical_features = np.array(
    Transactions_df[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]]
)
var_objective = np.array(Transactions_df[["isFraud"]])

X_train, X_test, y_train, y_test = train_test_split(
    numerical_features, var_objective, test_size=0.2, random_state=3
)
DecisionTree_model = DecisionTreeClassifier()

DecisionTree_model.fit(X_train, y_train)
```

#### NOW

In [None]:
class FraudDetectionPipeline:
    """
    A class representing the Online Fraud Detection data processing and modeling pipeline.

    Attributes:
        NUMERICAL_VARS (list): A list of numerical variables in the dataset.
        CATEGORICAL_VARS_WITH_NA (list): A list of categorical variables with missing values.
        NUMERICAL_VARS_WITH_NA (list): A list of numerical variables with missing values.
        CATEGORICAL_VARS (list): A list of categorical variables in the dataset.
        SEED_MODEL (int): A seed value for reproducibility.

    Methods:
        create_pipeline(): Create and return the Titanic data processing pipeline.
    """

    def __init__(
        self,
        seed_model,
        numerical_vars,
        categorical_vars_with_na,
        numerical_vars_with_na,
        categorical_vars,
        selected_features,
    ):
        self.SEED_MODEL = seed_model
        self.NUMERICAL_VARS = numerical_vars
        self.CATEGORICAL_VARS_WITH_NA = categorical_vars_with_na
        self.NUMERICAL_VARS_WITH_NA = numerical_vars_with_na
        self.CATEGORICAL_VARS = categorical_vars
        self.SEED_MODEL = seed_model
        self.SELECTED_FEATURES = selected_features
        # logging instance
        self.logfile = MyLogger("FraudDetectionPipeline", logging.DEBUG, __name__)

    def create_pipeline(self):
        """
        Create and return the DataFraud data processing pipeline.

        Returns:
            Pipeline: A scikit-learn pipeline for data processing and modeling.
        """
        self.logfile.debug("Create data pipeline")
        self.PIPELINE = Pipeline(
            [
                ("Change transaction type", Change_TransactionType()),
            ]
        )
        return self.PIPELINE

    def fit_DecisionTree(self, X_train, y_train):
        """
        Fit a Decision Tree model using the predefined data preprocessing pipeline.

        Parameters:
        - X_train (pandas.DataFrame or numpy.ndarray): The training input data.
        - y_train (pandas.Series or numpy.ndarray): The target values for training.

        Returns:
        - Decision Tree _model (DecisionTree): The fitted Decision Tree model.
        """
        self.logfile.debug("Train Decision Model")
        Decision_tree_model = DecisionTreeClassifier()
        pipeline = self.create_pipeline()
        pipeline.fit(X_train.values, y_train)
        Decision_tree_model.fit(pipeline.transform(X_train).values, y_train)
        return Decision_tree_model

    def transform_test_data(self, X_test):
        """
        Apply the data preprocessing pipeline on the test data.

        Parameters:
        - X_test (pandas.DataFrame or numpy.ndarray): The test input data.

        Returns:
        - transformed_data (pandas.DataFrame or numpy.ndarray): The preprocessed test data.
        """
        self.logfile.debug("Transform raw data w/transformations needed in Model")
        pipeline = self.create_pipeline()
        return pipeline.transform(X_test)


In [24]:
DecisionTree_model.score(X_test, y_test)

0.9996785915236176

In [25]:
y_pred = DecisionTree_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1270690     180]
 [    229    1425]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270870
           1       0.89      0.86      0.87      1654

    accuracy                           1.00   1272524
   macro avg       0.94      0.93      0.94   1272524
weighted avg       1.00      1.00      1.00   1272524



In [28]:
DecisionTree_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

### 6. Save Model

In [26]:
TRAINED_MODEL_DIR = "../trained_models/"
PIPELINE_NAME = "decision_tree"
PIPELINE_SAVE_FILE = f"{PIPELINE_NAME}_output.pkl"

# Save the model using joblib
MODEL_FILED = TRAINED_MODEL_DIR + PIPELINE_SAVE_FILE
joblib.dump(DecisionTree_model, MODEL_FILED)

FileNotFoundError: [Errno 2] No such file or directory: '../trained_models/decision_tree_output.pkl'

### 7. Load saved model and predict data

In [67]:
# Load saved model
DecisionTree_loaded = joblib.load(MODEL_FILED)

In [68]:
# NO_FRAUD test
test_data = np.array([[5, 7880, 7880, 0.0]])
DecisionTree_loaded.predict(test_data)[0]

0

In [69]:
# FRAUD test
test_data = np.array([[6, 80000, 80000, 0.0]])
DecisionTree_loaded.predict(test_data)[0]

1