In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import dill as pickle
from scipy.stats import randint, uniform

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier, DMatrix, Booster
from sklearn_pandas import DataFrameMapper

In [2]:
df = pd.read_csv("../data/PS_20174392719_1491204439457_log.csv")
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [3]:
features = ['type', 'nameOrig', 'nameDest', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
X_train, X_test, y_train, y_test = train_test_split(df, df["isFraud"], stratify=df["step"], test_size=0.33, random_state=42)

In [4]:
pipe = pickle.load(open("../models/xgb_model.pkl", 'rb'))
pipe

Pipeline(steps=[('dataframemapper',
                 DataFrameMapper(features=[('type',
                                            TransformerPipeline(steps=[('functiontransformer',
                                                                        FunctionTransformer(func=<function reshape at 0x107d0b550>)),
                                                                       ('labelbinarizer',
                                                                        LabelBinarizer())]),
                                            {}),
                                           ('amount',
                                            TransformerPipeline(steps=[('functiontransformer',
                                                                        FunctionTransformer(func=<function reshape at 0x107d0b550>)),
                                                                       ('standardsca...
                               colsample_bytree=1, gamma=0, gpu_id=-1,
         

In [5]:
ct=pipe.steps[0][1]
clf = pipe.steps[1][1]

In [6]:
y = X_train["isFraud"]
X = ct.fit_transform(X_train)

In [7]:
mat = DMatrix(X)
s = [clf.get_booster().predict(mat, ntree_limit=i) for i in range(1, 122)]

In [8]:
s2 = [s[0]] + [s[i] - s[i-1] for i in range(1, 121)] # undo the cumulative sum

In [9]:
scores = np.vstack(s2).T

In [10]:
scores.shape

(4262955, 121)

In [11]:
np.save(open("xgb_scores.mat", 'wb'), scores)

In [12]:
X_train["isFlaggedFraud"].values.shape

(4262955,)

In [13]:
meta_X = np.hstack([scores, X_train["isFlaggedFraud"].values.reshape(-1, 1)])

In [14]:
meta_X.shape

(4262955, 122)

In [None]:
#clf1 = XGBClassifier(learning_rate=0.10867386642568048, max_depth=9, n_estimators=122).fit(meta_X, y)



In [18]:
clf2 = XGBClassifier(learning_rate=0.10867386642568048, max_depth=9, n_estimators=122).fit(scores, y)



In [26]:
lr1 = LogisticRegressionCV(n_jobs=-1, max_iter=3000).fit(scores, y)

KeyboardInterrupt: 

In [None]:
lr2 = LogisticRegressionCV(n_jobs=-1,max_iter=3000).fit(meta_X, y)

In [17]:
lin = LinearRegression().fit(scores, X_train["amount"])

In [21]:
def get_stack(Xt):
    mat = DMatrix(Xt)
    s = [clf.get_booster().predict(mat, ntree_limit=i) for i in range(1, 122)]
    s2 = [s[0]] + [s[i] - s[i-1] for i in range(1, 121)] # undo the cumulative sum
    return np.vstack(s2).T

X2 = ct.fit_transform(X_test)
X3 = get_stack(X2)
meta_X_test = np.hstack([X3, X_test["isFlaggedFraud"].values.reshape(-1, 1)])

In [22]:
print(metrics.classification_report(y_test, clf2.predict(X3)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096966
           1       0.00      0.00      0.00      2699

    accuracy                           1.00   2099665
   macro avg       0.50      0.50      0.50   2099665
weighted avg       1.00      1.00      1.00   2099665



In [23]:
print(metrics.classification_report(y_test, lr1.predict(X3)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096966
           1       1.00      0.11      0.20      2699

    accuracy                           1.00   2099665
   macro avg       1.00      0.56      0.60   2099665
weighted avg       1.00      1.00      1.00   2099665



In [24]:
print(metrics.classification_report(y_test, lr2.predict(meta_X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096966
           1       1.00      0.11      0.20      2699

    accuracy                           1.00   2099665
   macro avg       1.00      0.56      0.60   2099665
weighted avg       1.00      1.00      1.00   2099665



In [28]:
print(metrics.mean_squared_error(X_test["amount"], lin.predict(X3)))

354497654612.2053
