# Name (as it appears on Canvas)

Hsaunyu Sean Lin

# Research Question / Hypothesis

IEEE - Fraud Detection:

use the data features to see if the transaction is fraudulent

# Load Data

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_column', 100)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import *
from sklearn.metrics import f1_score, accuracy_score
from sklearn import set_config

In [3]:
# Load data
path = "./data/"
df = pd.read_csv(path + "train.csv", header=0)
if 'Unnamed: 0' in df.columns:
    df.drop(['Unnamed: 0'], axis=1)

In [4]:
# Show the first line
df.head(1)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,...,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,


In [5]:
X = df[['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']] # card4 NAN
y = df[['isFraud']].values.ravel()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

# Feature Engineering

In [8]:
tran_num = FunctionTransformer(np.log1p, validate=False)
imp_num = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)
pipe_num = Pipeline([('scalar', tran_num), ('imputer', imp_num)])

In [9]:
cols_cat = (X.dtypes == object)

preprocessing = ColumnTransformer([('numerical data',  pipe_num, ~cols_cat)])

In [10]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [11]:
pipe = Pipeline([('Sca', StandardScaler()),
                 ('preprocessing', preprocessing),
                 ('clf', DummyEstimator())])

set_config(display='diagram')
pipe 

# Search the Best Model

In [12]:
search_space = [{'clf': [RandomForestClassifier(n_jobs=-1)], # Actual Estimator
                 'clf__n_estimators': np.arange(100, 1000, 150),
                 'clf__max_features': ['log2','sqrt'],
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth' : np.arange(15,25,1),
                 'clf__min_samples_leaf': np.arange(1,10,1),
                 'clf__class_weight': ['balanced','balanced_subsample',None]
                },
                
                {'clf': [ExtraTreesClassifier()],  # Actual Estimator
                 'clf__max_features': ['log2','sqrt'],
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth' : np.arange(15,25,1),
                 'clf__n_estimators': np.arange(100, 1000, 150),
                 'clf__min_samples_leaf': np.arange(1,10,1)
                },
                 ]

clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=3,
                                    cv=3, 
                                    n_jobs=-1,
                                    verbose=10,
                                   scoring='f1_weighted')

In [14]:
best_model = clf_algos_rand.fit(X_train, y_train)

print(best_model.best_estimator_.get_params()['clf'])
print(best_model.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  5.3min remaining: 18.5min
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:  5.4min remaining: 10.8min
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  5.7min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:  5.7min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  5.8min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 10.1min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 10.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 10.4min finished
  return func(X, **(kw_args if kw_args else {}))


ExtraTreesClassifier(max_depth=24, max_features='log2', min_samples_leaf=4,
                     n_estimators=850)
0.948740400055581


# Fit scikit-learn model

In [15]:
params = {'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': 24,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 850,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

pipe = Pipeline([('Sca', StandardScaler()),
                 ('preprocessing', preprocessing),
                 ('clf', ExtraTreesClassifier(**params))])

# pipe = Pipeline([('Sca',StandardScaler()),
#                  ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), # median
#                  ('clf', DummyEstimator())])

In [16]:
pipe.fit(X_train, y_train)

  return func(X, **(kw_args if kw_args else {}))


In [17]:
y_pred = pipe.predict(X_test)

  return func(X, **(kw_args if kw_args else {}))


# Evaluation Metric

In [18]:
acc = accuracy_score(y_test, y_pred)
print(f"accuracy score: {acc:.3f}")

accuracy score: 0.878


In [19]:
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"f1 score: {f1:.3f}")

f1 score: 0.911
