# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz

# Importing data

In [None]:
# Read the transaction log and give the four balance columns consistently
# spelled camelCase names.
df = pd.read_csv(
    '/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv'
).rename(columns={
    'oldbalanceOrg': 'oldBalanceOrig',
    'newbalanceOrig': 'newBalanceOrig',
    'oldbalanceDest': 'oldBalanceDest',
    'newbalanceDest': 'newBalanceDest',
})
print(df.head())

# Data Cleaning

In [None]:
# Keep only TRANSFER and CASH_OUT rows, then split the 'isFraud' label column
# off from the features.
X = df.loc[df['type'].isin(['TRANSFER', 'CASH_OUT'])]
Y = X.pop('isFraud')

# Eliminate columns shown to be irrelevant for analysis in the EDA
X = X.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# Binary-encoding of labelled data in 'type': TRANSFER -> 0, CASH_OUT -> 1.
# Only these two labels survive the row filter above, so the mapping is total
# and the column can be cast straight to int.
X['type'] = X['type'].map({'TRANSFER': 0, 'CASH_OUT': 1}).astype(int)

In [None]:
# Rows with a non-zero amount whose destination balance is zero both before
# and after the transaction get both destination balances overwritten with -1.
dest_balance_missing = (
    (X['oldBalanceDest'] == 0)
    & (X['newBalanceDest'] == 0)
    & (X['amount'] != 0)
)
X.loc[dest_balance_missing, ['oldBalanceDest', 'newBalanceDest']] = -1

In [None]:
# Rows with a non-zero amount whose originating balance is zero both before
# and after the transaction get both originating balances replaced with NaN.
orig_balance_missing = (
    (X['oldBalanceOrig'] == 0)
    & (X['newBalanceOrig'] == 0)
    & (X['amount'] != 0)
)
X.loc[orig_balance_missing, ['oldBalanceOrig', 'newBalanceOrig']] = np.nan

# Feature Engineering

In [None]:
# Balance discrepancy on each side of the transaction:
#   origin:      new balance + amount - old balance
#   destination: old balance + amount - new balance
X['errorBalanceOrig'] = X['newBalanceOrig'].add(X['amount']).sub(X['oldBalanceOrig'])
X['errorBalanceDest'] = X['oldBalanceDest'].add(X['amount']).sub(X['newBalanceDest'])

# Split the data into training and test sets in an 80:20 ratio

In [None]:
# Hold out 20% of the cleaned rows for evaluation; the fixed seed keeps the
# split reproducible.
trainX1, testX1, trainY1, testY1 = train_test_split(
    X, Y, test_size=0.2, random_state=2702
)

In [None]:
# Display the first rows of the training features as a quick sanity check.
trainX1.head()

In [None]:
# Long computation in this cell (~1.8 minutes)
# Up-weight the positive class by the negative/positive ratio. The ratio is
# taken from the training labels only, so the held-out test labels do not
# influence any model setting.
n_negative = (trainY1 == 0).sum()
n_positive = (trainY1 == 1).sum()
weights = n_negative / (1.0 * n_positive)
clf = XGBClassifier(max_depth=3, scale_pos_weight=weights, n_jobs=4)
# Column 1 of predict_proba is the probability of the positive class.
probabilities = clf.fit(trainX1, trainY1).predict_proba(testX1)
print('AUPRC = {}'.format(average_precision_score(testY1, probabilities[:, 1])))


In [None]:
# Start again from the raw frame (same row filter, label split off) so the
# preprocessing pipeline can be fitted on uncleaned features. The split uses
# the same test size and seed as the earlier one.
X = df.loc[df['type'].isin(['TRANSFER', 'CASH_OUT'])]
Y = X.pop('isFraud')
trainX, testX, trainY, testY = train_test_split(
    X, Y, test_size=0.2, random_state=2702
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

def drop_columns(X):
    """Return a copy of ``X`` without the account-name and flag columns.

    The columns removed are 'nameOrig', 'nameDest' and 'isFlaggedFraud';
    ``X`` itself is not modified.
    """
    unused = ['nameOrig', 'nameDest', 'isFlaggedFraud']
    return X.drop(columns=unused)

def encode_type(X):
    """Binary-encode the 'type' column: TRANSFER -> 0, CASH_OUT -> 1.

    The encoding is done on a copy, so the caller's DataFrame is left
    untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a 'type' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` whose 'type' column has been cast to ``int``. Values
        other than the two labels are not remapped and must be convertible
        by ``astype(int)``.
    """
    encoded = X.copy()
    encoded.loc[encoded['type'] == 'TRANSFER', 'type'] = 0
    encoded.loc[encoded['type'] == 'CASH_OUT', 'type'] = 1
    # The column may still have a non-integer dtype after the replacements;
    # cast it explicitly.
    encoded['type'] = encoded['type'].astype(int)
    return encoded

def set_old_new_balance(X):
    """Flag balances that are zero before and after a non-zero transaction.

    For rows where ``amount != 0``:

    * if both destination balances are 0, both are set to ``-1``;
    * if both originating balances are 0, both are set to ``NaN``.

    The work is done on a copy, so the caller's DataFrame is left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with 'amount', 'oldBalanceOrig', 'newBalanceOrig',
        'oldBalanceDest' and 'newBalanceDest' columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the flagged balances replaced.
    """
    flagged = X.copy()
    moved_money = flagged['amount'] != 0
    # Both masks are built before any write; they read disjoint column pairs,
    # so the result does not depend on the order of the two assignments.
    dest_missing = (
        (flagged['oldBalanceDest'] == 0)
        & (flagged['newBalanceDest'] == 0)
        & moved_money
    )
    orig_missing = (
        (flagged['oldBalanceOrig'] == 0)
        & (flagged['newBalanceOrig'] == 0)
        & moved_money
    )
    flagged.loc[dest_missing, ['oldBalanceDest', 'newBalanceDest']] = -1
    flagged.loc[orig_missing, ['oldBalanceOrig', 'newBalanceOrig']] = np.nan
    return flagged

def set_error_balance(X):
    """Add the balance-discrepancy features for both sides of a transaction.

    * ``errorBalanceOrig = newBalanceOrig + amount - oldBalanceOrig``
    * ``errorBalanceDest = oldBalanceDest + amount - newBalanceDest``

    A new frame is returned; the caller's DataFrame is left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with 'amount', 'oldBalanceOrig', 'newBalanceOrig',
        'oldBalanceDest' and 'newBalanceDest' columns.

    Returns
    -------
    pandas.DataFrame
        ``X`` plus the 'errorBalanceOrig' and 'errorBalanceDest' columns.
    """
    return X.assign(
        errorBalanceOrig=X['newBalanceOrig'] + X['amount'] - X['oldBalanceOrig'],
        errorBalanceDest=X['oldBalanceDest'] + X['amount'] - X['newBalanceDest'],
    )

# The cleaning steps build on one another: the error-balance features must be
# computed from the balances *after* they have been flagged, and every step
# needs the whole frame. A ColumnTransformer runs its transformers
# independently on column subsets and concatenates their outputs, which would
# emit the balance columns more than once and compute the error features from
# unflagged balances. The steps are therefore chained in a sequential Pipeline,
# mirroring the manual cleaning used for the classifier's training features.
preprocessor = Pipeline(steps=[
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('encode_type', FunctionTransformer(encode_type)),
    ('set_old_new_balance', FunctionTransformer(set_old_new_balance)),
    ('set_error_balance', FunctionTransformer(set_error_balance)),
    # Emit a plain NumPy array, as the previous preprocessor did, so that
    # positional indexing of the transformed data keeps working.
    ('to_array', FunctionTransformer(np.asarray)),
])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
# Fit and transform the data using the pipeline
pipeline.fit(trainX, trainY)
trainX_transformed = pipeline.transform(trainX)
testX_transformed = pipeline.transform(testX)



In [None]:
# Display the (rows, columns) shape of the manually cleaned training features.
trainX1.shape

In [None]:
# Display the shape of the first element of the pipeline output.
# NOTE(review): presumably the output is a 2-D array, making this the
# per-row feature count to compare against trainX1 — confirm.
trainX_transformed[0].shape

In [None]:
import pickle

# Save the classifier and pipeline to a file
# NOTE(review): the pipeline wraps functions defined in this notebook, and
# pickle stores functions by reference — presumably they must be importable
# under the same names wherever 'pipeline.pkl' is loaded; confirm.
for target_path, artifact in (('classifier.pkl', clf), ('pipeline.pkl', pipeline)):
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)
