# Name (as it appears on Canvas)

Hsaunyu Sean Lin

# Research Question / Hypothesis

IEEE - Fraud Detection:

use the data features to see if the transaction is fraudulent

# Load Data

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_column', 100)

from sklearn.model_selection import train_test_split
# from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier # 2/19
from sklearn.tree import DecisionTreeClassifier # 2/19
from sklearn.linear_model import PassiveAggressiveClassifier # HW3

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import *
from sklearn.impute import *
from sklearn.compose import *
from sklearn.metrics import mean_squared_error, accuracy_score

# Feature Engineering

- No exploratory data analysis (EDA) code in notebook
- All feature transformations in a Pipeline. Target transformations can be done outside of a Pipeline.
- Appropriate feature engineering for data types and algorithms types
- Appropriately handled missing values
- Each step has a comment on why it was done.

In [None]:
categorical_columns = (X.dtypes == object)

trans_num = FunctionTransformer(np.log1p, validate=False)
trans_imp_num = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True) # impute the missing values
pipe_numerical = Pipeline([('scalar', trans_num), ('imputer', trans_imp_num)])

tran_ohe = OneHotEncoder(handle_unknown='ignore')
trans_imp_cat = SimpleImputer(strategy='most_frequent', add_indicator=True) # impute the missing values
pipe_categorical = Pipeline([('ohe', tran_ohe), ('imputer', trans_imp_cat)])


preprocessing = ColumnTransformer([('numerical features',  pipe_numerical, ~categorical_columns),
                                   ('categorical features', pipe_categorical,  categorical_columns)])


In [None]:
preprocessing = ColumnTransformer([('numerical features',  pipe_numerical, ~categorical_columns),
                                   ('categorical features', pipe_categorical,  categorical_columns)])

In [None]:
pipe = Pipeline([('preprocessing', preprocessing), ('model', SGDRegressor())])
pipe.fit(X_train, y_train)

# Fit scikit-learn model

## DecisionTreeClassifier

In [14]:
# Pipelines

# Load and split the data
from sklearn.datasets        import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, 
                                                    iris.target, 
                                                    test_size=0.2)

In [120]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline      import Pipeline
from sklearn.tree          import DecisionTreeClassifier

pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', DecisionTreeClassifier())])

In [121]:
# Visualize pipeline
# This is good idea for your Final Project

from sklearn import set_config

set_config(display='diagram')

pipe_dt 

In [18]:
from sklearn.model_selection import cross_val_score, KFold

kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [19]:
results = cross_val_score(pipe_dt, 
                          X_train,
                          y_train, 
                          cv=kfold
)

In [20]:
print(f"The mean training validation accuracy - {results.mean():.4f}")

The mean training validation accuracy - 0.8667


In [21]:
# Let's search
from sklearn.model_selection import RandomizedSearchCV

hyperparameters = dict(pca__n_components     = [1, 2, 3],
                       clf__max_depth        = range(1, 5),
                       clf__criterion        = ['gini', 'entropy'],
                       clf__min_samples_leaf = range(3, 15))

clf_rand_cv = RandomizedSearchCV(estimator=pipe_dt, 
                              param_distributions=hyperparameters, 
                              n_iter=25,
                              cv=5, 
                              n_jobs=-1,
                              verbose=False)


In [22]:
clf_rand_cv.fit(X_train, y_train)

In [23]:
clf_rand_cv.best_params_ 

{'pca__n_components': 3,
 'clf__min_samples_leaf': 4,
 'clf__max_depth': 3,
 'clf__criterion': 'gini'}

## HW5

In [2]:
# Imports
# Do NOT import anything else
import numpy as np
import pandas as pd

from sklearn.base            import BaseEstimator
from sklearn.decomposition   import PCA
from sklearn.ensemble        import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model    import LogisticRegression, RidgeClassifier
from sklearn.metrics         import f1_score # This assignment's metric
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.naive_bayes     import GaussianNB
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler
from sklearn.svm             import SVC
from sklearn.impute import SimpleImputer

In [3]:
# Helper class (you do not have to use it)
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

In [4]:
# Load data
path = "./data/"
df = pd.read_csv(path+"modified.csv", header=0)

In [113]:
df.shape

(5906, 395)

In [114]:
X = df[['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']] # card4 NAN
y = df[['isFraud']].values.ravel()

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=999, shuffle=True)

In [116]:
tran_num = FunctionTransformer(np.log1p, validate=False)
imp_num = SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True)
pipe_num = Pipeline([('scalar', tran_num), ('imputer', imp_num)])

tran_ohe = OneHotEncoder(handle_unknown='ignore')
imp_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent', add_indicator=True)
pipe_cat = Pipeline([('ohe', tran_ohe), ('imputer', imp_cat)])

cols_cat = (X.dtypes == object)

preprocessing = ColumnTransformer([('numerical features',  pipe_num, ~cols_cat),
                                   ('categorical features', pipe_cat,  cols_cat)])

# pipe = Pipeline([('preprocessing', preprocessing), ('model', SGDRegressor())])
# pipe.fit(X_train, y_train)

In [122]:
# TODO: Show evidence of automated cross validation search across algorithms and hyperparameters

# For the sake of time, this code will not be run. 
# It will be visually inspected for:
#  - Clarity
#  - Logic
#  - Correct use of scikit-learn idioms
#  - Breadth of search

pipe = Pipeline([('Sca', StandardScaler()),
                 ('preprocessing', preprocessing),
                 ('clf', DummyEstimator())])

search_space = [{'clf': [RandomForestClassifier(n_jobs=-1)], # Actual Estimator
                 'clf__n_estimators': np.arange(100, 1000, 150),
                 'clf__max_features': ['log2','sqrt'],
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth' : np.arange(15,25,1),
                 'clf__min_samples_leaf': np.arange(1,10,1),
                 'clf__class_weight': ['balanced','balanced_subsample',None]
                },
                
                {'clf': [ExtraTreesClassifier()],  # Actual Estimator
                 'clf__max_features': ['log2','sqrt'],
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth' : np.arange(15,25,1),
                 'clf__n_estimators': np.arange(100, 1000, 150),
                 'clf__min_samples_leaf': np.arange(1,10,1)
                },
                 ]

clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=10,
                                    cv=3, 
                                    n_jobs=-1,
                                    verbose=10,
                                   scoring='f1_weighted')

In [123]:
# Visualize pipeline
# This is good idea for your Final Project

from sklearn import set_config

set_config(display='diagram')

pipe 

In [118]:
for i in range(3):

    best_model = clf_algos_rand.fit(X_train, y_train)

    print(best_model.best_estimator_.get_params()['clf'])
    print(best_model.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:   13.2s remaining:    7.7s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:   15.2s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   19.3s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   19.8s finished
  return func(X, **(kw_args if kw_args else {}))


RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=19, max_features='log2', n_estimators=700,
                       n_jobs=-1)
0.9763639473613903
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:    7.2s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    9.4s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   10.6s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   11.7s finished
  return func(X, **(kw_args if kw_args else {}))


RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=17, max_features='log2', min_samples_leaf=3,
                       n_estimators=550, n_jobs=-1)
0.9754639684921975
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:   14.7s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:   16.5s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   17.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   20.5s finished
  return func(X, **(kw_args if kw_args else {}))


RandomForestClassifier(class_weight='balanced_subsample', max_depth=15,
                       max_features='log2', n_estimators=850, n_jobs=-1)
0.9764474891165792


In [112]:
# TODO: Define pipeline with your set of final hyperparameters for all steps.
params = {'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 18,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# ExtraTreesClassifier().get_params()

pipe = Pipeline([('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                 ('clf', RandomForestClassifier(**params))])

# pipe = Pipeline([('Sca',StandardScaler()),
#                  ('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), # median
#                  ('clf', DummyEstimator())])

In [10]:
# # Expect pipe object
# assert "pipe" in dir()
# assert type(pipe) == Pipeline

In [66]:
# This code is commented out because you do not have access to the test set

# Try final model 5 times and take highest test score. 
# WARNING - DO NOT DO THIS OUTSIDE OF CLASS. We are looking at the test set mulitple times for educational purposes only. 

# Set champion to worst possible baseline
f1_test_best = 0 

for run in range(5):
    # Traing final model on all the avaible training data
    pipe.fit(X_train, y_train)
    
    # Evaluate final model on test set
#     path = "./"
#     X_test   = pd.read_csv(path+"instructor/assignment_5_X_test.csv", header=0)
#     y_test   = pd.read_csv(path+"instructor/assignment_5_y_test.csv", header=0)
#     y_test   = y_test.values.ravel()
    y_pred   = pipe.predict(X_test)
    f1_test  = f1_score(y_test, y_pred, average='weighted')
    
    # Update champion score
    if f1_test > f1_test_best:
        f1_test_best = f1_test
    
print(f"{f1_test_best:,.5f}")

assert f1_test_best >= 0.61
print("Passed Level 1 test set performance 🙂")

assert f1_test_best >= 0.66
print("Passed Level 2 test set performance 🙂")

assert f1_test_best >= 0.71
print("Passed Level 3 test set performance 🙂")

0.98184
Passed Level 1 test set performance 🙂
Passed Level 2 test set performance 🙂
Passed Level 3 test set performance 🙂


# Evaluation Metric

In [12]:
y_pred = pipe.predict(X_test)

In [13]:
mse = mean_squared_error(y_test, y_pred)
print(f"mean squared error: {mse:}")

mean squared error: 245.5752742689934


In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy score: {accuracy:}")

ValueError: Classification metrics can't handle a mix of binary and continuous targets