# Amex fraud prediciton project:

#### In this project American Express provided tons of data in order to build a fraud prediciton model, the itention of this notebook attachded to Kedro is to do more free data exploration and test "pre-node" scripts, like data pipelines, models, etc;

# Imports

In [18]:
# Basic imports
import pandas as pd
import numpy as np

# Visuals
import seaborn as sns
from matplotlib import pyplot as plt

# Ml and FE imports
from scipy import stats
import lightgbm as lgb

# Magic load kedro
%load_ext kedro.extras.extensions.ipython

The kedro.extras.extensions.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.extras.extensions.ipython


# Functions

In [6]:
# Correlation betwen binary and continuous distributions functions
def bisserialCorr(data, x, binarY):
  # convert both variables to arrays
  xArray = np.array(data[x], dtype=int).ravel()
  binarYArray = np.array(data[binarY], dtype=int).ravel()
  
  # return biseralCorr
  return stats.pointbiserialr(binarYArray, xArray)

def kruskalDiffTest(data, x, binarY):
  # take the two numpy arrays, one where the variable is one, and one where the variable is 0
  xClass_0 = np.array(data.query(f"{binarY} == 0")[x], dtype=int).ravel()
  xClass_1 = np.array(data.query(f"{binarY} == 1")[x], dtype=int).ravel()
  
  return stats.kruskal(xClass_0, xClass_1)

def bivariateTest(data, x, binarY, showOnly=False):
  # Make biserial correlation and kruskal hypthothesis test
  biserial = biserialCorr(data, x, binarY)
  kruskal = kruskalDiffTest(data, x, binarY)
  
  # if the user only wants toi print the values
  if showOnly:
    print(f"Biserial Correlation Result: {biserial}")
    print(f"Kruskal Diff Result: {kruskal}")
    
    return None
  
  return biserial, kruskal

def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def fillna_with_lgb(data:pd.DataFrame, train_set_cols: list, col_to_fill: str) -> pd.DataFrame:
    # Instantiate lightgbm
    lg = lgb.LGBMRegressor(max_depth=-1, learning_rate=0.1, n_estimators=300)

    # Filter dataframe where variable to fill is not null
    dataNotNull = data.loc[~data[col_to_fill].isnull()]

    # Make X and Y
    X = dataNotNull[train_set_cols]
    Y = dataNotNull[col_to_fill]

    # Train the model
    lg.fit(X, Y)

    # Predict null values
    X_VAL = data[train_set_cols].loc[data[col_to_fill].isnull()]
    data.loc[data[col_to_fill].isnull(), col_to_fill] = lg.predict(X_VAL)

    return data

# Pre pipelines test area

## Logistic Regression testing

In [6]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score, log_loss
import lightgbm as lgb
import mlflow
from global_functions import amex_metric

fe_train = catalog.load("fe_train")
lr_regression = catalog.load("tuned_lr")
robust_scaler_features_names = catalog.load("robust_scaler_features_names")
min_max_scaler_features_names = catalog.load("min_max_scaler_features_names")

2022-07-26 18:35:51,114 - kedro.io.data_catalog - INFO - Loading data from `fe_train` (FeatherDataSet)...
2022-07-26 18:35:51,660 - kedro.io.data_catalog - INFO - Loading data from `tuned_lr` (PickleDataSet)...
2022-07-26 18:35:51,894 - kedro.io.data_catalog - INFO - Loading data from `robust_scaler_features_names` (CSVDataSet)...
2022-07-26 18:35:51,938 - kedro.io.data_catalog - INFO - Loading data from `min_max_scaler_features_names` (CSVDataSet)...


In [15]:
# Substitue infinities with specific value
fe_train.replace([np.inf, -np.inf], 0, inplace=True)

# Instantiate stratofoed kfold
sf = StratifiedKFold(n_splits=10, shuffle=True, random_state=32)

# Dict of metrics
metrics = {
    "amex": [],
    "f1": [],
    "auc": [],
}

# Dict to save predictions of the model
predictions = {"lr_yhat": []}

# Define X and Y
x = fe_train[[col for col in fe_train.columns if col != "target"]]
y = fe_train["target"]

for tr_index, val_index in sf.split(x, y):
    # Separate train and test
    xtr, xval = x.iloc[tr_index], x.iloc[val_index]
    ytr, yval = y.iloc[tr_index], y.iloc[val_index]

    # Fit scalers at train | Use columns defined in the past node
    r = RobustScaler()
    r.fit(xtr[robust_scaler_features_names["features"]])

    m = MinMaxScaler()
    m.fit(xtr[min_max_scaler_features_names["features"]])

    # Apply scalers on validation
    xval[robust_scaler_features_names["features"]] = r.transform(xval[robust_scaler_features_names["features"]])
    xval[min_max_scaler_features_names["features"]] = m.transform(xval[min_max_scaler_features_names["features"]])

    # Make yhat of logistic regresion
    yhat_proba_lr = lr_regression.predict_proba(xval[lr_regression.features])[:, 1]
    yhat_lr = (yhat_proba_lr > 0.5).astype(int)

    # Save predictions at a xval column
    xval["target"] = yhat_proba_lr
    xval["prediction"] = yval

    # Eval metrics
    auc = roc_auc_score(yval, yhat_proba_lr)
    f1 = f1_score(yval, yhat_lr)
    amex = amex_metric(pd.DataFrame(xval["target"]), pd.DataFrame(xval["prediction"]))

    # Save metrics and predictions
    metrics["amex"].append(amex)
    metrics["auc"].append(auc)
    metrics["f1"].append(f1)
    predictions["Yhat"] += [i for i in yhat_proba_lr]


# Convert predictions and metrics dicts to datafarames
metrics = pd.DataFrame(metrics)
predictions = pd.DataFrame(predictions)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xval[robust_scaler_features_names["features"]] = r.transform(xval[robust_scaler_features_names["features"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xval[min_max_scaler_features_names["features"]] = m.transform(xval[min_max_scaler_features_names["features"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydat

## Ensemble testing

### Model and data loading | Imports

In [3]:
# Load fe_train data
fe_train = catalog.load("fe_train")

# Load tuned Models that go into the ensemble
lr = catalog.load("tuned_lr")
lgbm = catalog.load("tuned_lgbm")

# Load robust and min max sclers features names
robust_scaler_features_names = catalog.load("robust_scaler_features_names")
min_max_scaler_features_names = catalog.load("min_max_scaler_features_names")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score
import lightgbm as lgb

2022-07-14 21:43:28,259 - kedro.io.data_catalog - INFO - Loading data from `fe_train` (FeatherDataSet)...
2022-07-14 21:43:28,561 - kedro.io.data_catalog - INFO - Loading data from `tuned_lr` (PickleDataSet)...
2022-07-14 21:43:28,626 - kedro.io.data_catalog - INFO - Loading data from `tuned_lgbm` (PickleDataSet)...
2022-07-14 21:43:28,644 - kedro.io.data_catalog - INFO - Loading data from `robust_scaler_features_names` (CSVDataSet)...
2022-07-14 21:43:28,650 - kedro.io.data_catalog - INFO - Loading data from `min_max_scaler_features_names` (CSVDataSet)...


### Ensemble tests

In [7]:
# Substitue infinities with specific value
fe_train.replace([np.inf, -np.inf], 0, inplace=True)

# Instantiate kfold
kf = KFold(n_splits=10, shuffle=True, random_state=32)

# Iterate doing kfold | Stratified Kfold | Save metrics | And 
metrics = {
    "entity": [],
    "amex": [],
    "f1": [],
    "auc": []
}

corrs = []

# Define X and Y
x = fe_train[[col for col in fe_train.columns if col != "target"]]
y = fe_train["target"]

# Instantiate matrix of zeros to use ensemble
secondLevel = np.zeros((x.shape[0], 2))

for tr_index, val_index in kf.split(x):
    # Separate train and test
    xtr, xval = x.iloc[tr_index], x.iloc[val_index]
    ytr, yval = y.iloc[tr_index], y.iloc[val_index]

    if True:
        # Fit scalers at train | Use columns defined in the past node
        r = RobustScaler()
        r.fit(xtr[robust_scaler_features_names["features"]])

        m = MinMaxScaler()
        m.fit(xtr[min_max_scaler_features_names["features"]])

        # Apply scalers on validation
        xval[robust_scaler_features_names["features"]] = r.transform(xval[robust_scaler_features_names["features"]])
        xval[min_max_scaler_features_names["features"]] = m.transform(xval[min_max_scaler_features_names["features"]])

        # Make yhat of all models
        yhat_lgbm = lgbm.predict_proba(xval)[:, 1]
        yhat_lgbm_not_proba = (yhat_lgbm > 0.5).astype(int)

        yhat_lr = lr.predict_proba(xval[lr.features])[:, 1]
        yhat_lr_not_proba = (yhat_lr > 0.5).astype(int)

        # Save yhat at matrice
        secondLevel[val_index, 0] = yhat_lgbm
        secondLevel[val_index, 1] = yhat_lr

        # Evaluate individuals models metrics
        auc_lgbm = roc_auc_score(yval, yhat_lgbm)
        auc_lr = roc_auc_score(yval, yhat_lr)

        f1_lgbm = f1_score(yval, yhat_lgbm_not_proba)
        f1_lr = f1_score(yval, yhat_lr_not_proba)

        xval["target"] = yval

        xval["prediction"] = yhat_lgbm
        amex_lgbm = amex_metric(pd.DataFrame(xval["target"]), pd.DataFrame(xval["prediction"]))
        
        xval["prediction"] = yhat_lr
        amex_lr = amex_metric(pd.DataFrame(xval["target"]), pd.DataFrame(xval["prediction"]))

        metrics["entity"].append("lgbm")
        metrics["amex"].append(amex_lgbm)
        metrics["auc"].append(auc_lgbm)
        metrics["f1"].append(f1_lgbm)

        metrics["entity"].append("lr")
        metrics["amex"].append(amex_lr)
        metrics["auc"].append(auc_lr)
        metrics["f1"].append(f1_lr)

        corrs.append(np.corrcoef(yhat_lgbm, yhat_lr))


for tr_index, val_index in kf.split(x):
    if True:
        # take xtr, xval, try, yval of matrice
        xtr, xval = secondLevel[tr_index], secondLevel[val_index]
        ytr, yval = y.iloc[tr_index], y.iloc[val_index]

        # Make secondLevelLogisticRegression | and fit it
        secondLevellr = LogisticRegression()
        secondLevellr.fit(xtr, ytr)
        yhat_proba = secondLevellr.predict_proba(xval)[:, 1]
        yhat_not_proba = (yhat_proba > 0.5).astype(int)

        # Def predictions
        xval = pd.DataFrame(xval, columns=["lgbm_preds", "lr_preds"])
        xval["prediction"] = yhat_proba
        xval["target"] = np.array(yval)

        # Eval metrics
        auc = roc_auc_score(yval, yhat_proba)
        f1 = f1_score(yval, yhat_not_proba)
        amex = amex_metric(pd.DataFrame(xval["target"]), pd.DataFrame(xval["prediction"]))

        # Save metrics
        metrics["entity"].append("ensemble")
        metrics["amex"].append(amex)
        metrics["auc"].append(auc)
        metrics["f1"].append(f1)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xval[robust_scaler_features_names["features"]] = r.transform(xval[robust_scaler_features_names["features"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xval[min_max_scaler_features_names["features"]] = m.transform(xval[min_max_scaler_features_names["features"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydat