In [None]:
# Enable IPython's autoreload extension in mode 2 so edited local modules are re-imported automatically.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import boto3
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import ScriptProcessor, SKLearnProcessor
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner, IntegerParameter
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import ProcessingStep, TrainingStep

In [None]:
def upload_folder(local_path, bucket, prefix, s3_client=None):
    """Upload every file under ``local_path`` to S3, keeping the folder layout.

    Args:
        local_path: Local directory; it is walked recursively.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix. Each object key is ``prefix + '/' + <path of the
            file relative to local_path>`` (always with forward slashes).
        s3_client: Optional object exposing ``upload_file(filename, bucket, key)``.
            When omitted, ``boto3.client('s3')`` is used.
    """
    if s3_client is None:
        s3_client = boto3.client('s3')
    for root, _dirs, files in os.walk(local_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Key on the path relative to local_path (not just the basename) so
            # same-named files in different sub-folders do not overwrite each other.
            rel_key = os.path.relpath(file_path, local_path).replace(os.sep, '/')
            s3_client.upload_file(file_path, bucket, prefix + '/' + rel_key)

In [None]:
# Local folder whose contents are uploaded to S3 as the raw data set.
RAW_DATA_PATH = '../../Data/ieee-fraud-detection'
# Local folder for processed data; not referenced by the other visible cells.
PROC_DATA_PATH = './proc_data'

In [None]:
# Region of the default boto3 session, the SageMaker session, and that session's default bucket.
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
default_bucket = sagemaker_session.default_bucket()
# Key prefix under which this notebook stores its objects in the bucket.
prefix = 'ieee-fraud-detection'
# NOTE(review): the execution role is hard-coded; the trailing comment names
# sagemaker.get_execution_role() as the alternative — confirm which one is wanted.
role = 'AmazonSageMaker-ExecutionRole-20210114T163887' # sagemaker.get_execution_role()

In [None]:
%%time
# Upload the local raw data folder to <default_bucket> under '<prefix>/raw_data'.
upload_folder(RAW_DATA_PATH, default_bucket, prefix + '/raw_data')

In [None]:
# S3 location of the raw data; same bucket/prefix the upload cell writes to.
input_data_uri = f's3://{default_bucket}/{prefix}/raw_data'

# Pipeline parameters and their default values.
processing_instance_count = ParameterInteger(name='ProcessingInstanceCount', default_value=1)
processing_instance_type = ParameterString(name='ProcessingInstanceType', default_value='ml.m5.2xlarge')
input_data = ParameterString(name='InputData', default_value=input_data_uri)
training_instance_type = ParameterString(name='TrainingInstanceType', default_value='ml.m5.2xlarge')

In [None]:
# scikit-learn processor; instance type and count come from the pipeline parameters.
sklearn_processor = SKLearnProcessor(
    base_job_name='ifd-sklearn-process',
    framework_version='0.23-1',
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    role=role,
)

In [None]:
# Pipeline step that runs scripts/preprocessing.py and exposes three outputs:
# 'train', 'valid' and 'test', each read from /opt/ml/processing/<name>.
step_process = ProcessingStep(
    name='ifd-split_and_preprocess',
    processor=sklearn_processor,
    code='scripts/preprocessing.py',
    inputs=[ProcessingInput(source=input_data, destination='/opt/ml/processing/input')],
    outputs=[
        ProcessingOutput(output_name=split, source='/opt/ml/processing/' + split)
        for split in ('train', 'valid', 'test')
    ],
)

In [None]:
# S3 prefix that receives the trained model artifacts.
model_output_uri = f's3://{default_bucket}/{prefix}/models'

# XGBoost 1.2-1 container image for the session's region.
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    version='1.2-1',
    py_version='py3',
    region=region,
    instance_type=training_instance_type,
)

# Single-instance, on-demand (non-spot) estimator built on that image.
clf = Estimator(
    image_uri=image_uri,
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

clf.set_hyperparameters(
    # model / objective
    booster='gbtree',
    verbosity=0,
    objective='binary:logistic',
    seed=42,
    # tree shape and regularisation
    max_depth=6,
    eta=0.3,
    gamma=0.0,
    min_child_weight=1.0,
    subsample=1.0,
    colsample_bytree=1.0,
    scale_pos_weight=1.0,
    # evaluation and stopping
    eval_metric='auc',
    num_round=1000,
    early_stopping_rounds=10,
    verbose_eval=False,
)

In [None]:
# Training step: the 'train' and 'validation' channels are fed from the
# 'train' and 'valid' outputs of the processing step, both as text/csv.
_process_outputs = step_process.properties.ProcessingOutputConfig.Outputs
step_train = TrainingStep(
    name='ifd-train',
    estimator=clf,
    inputs={
        channel: TrainingInput(
            s3_data=_process_outputs[output_name].S3Output.S3Uri,
            content_type='text/csv',
        )
        for channel, output_name in (('train', 'train'), ('validation', 'valid'))
    },
)

In [None]:
# Script processor that runs python3 inside the same image as the estimator;
# it is the processor of the evaluation step.
script_processor = ScriptProcessor(
    base_job_name='ifd-script-eval',
    image_uri=image_uri,
    command=['python3'],
    role=role,
    instance_type=processing_instance_type,
    instance_count=1,
)

In [None]:
# Property file pointing at evaluation.json inside the step's 'evaluation' output.
evaluation_report = PropertyFile(
    name='ifd-evaluation-report', output_name='evaluation', path='evaluation.json'
)

# Evaluation step: runs scripts/evaluation.py against the trained model and the 'test' split.
step_eval = ProcessingStep(
    name='ifd-eval',
    processor=script_processor,
    code='scripts/evaluation.py',
    inputs=[
        # model artifacts produced by the training step
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/models',
        ),
        # 'test' output of the preprocessing step
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,
            destination='/opt/ml/processing/test',
        ),
    ],
    outputs=[
        ProcessingOutput(output_name='evaluation', source='/opt/ml/processing/evaluation'),
    ],
    property_files=[evaluation_report],
)

In [None]:
# Train / valid CSV inputs taken from the output of one specific, hard-coded processing job.
_job_output = 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'
s3_input_train = TrainingInput(
    s3_data='s3://{}/{}/train/'.format(default_bucket, _job_output),
    content_type='csv',
)
s3_input_valid = TrainingInput(
    s3_data='s3://{}/{}/valid/'.format(default_bucket, _job_output),
    content_type='csv',
)


In [None]:
# Fit the estimator directly on the two channels built above.
clf.fit(inputs={'train': s3_input_train, 'validation': s3_input_valid})

In [None]:
# Search space handed to the hyperparameter tuner.
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(1, 10),
    eta=ContinuousParameter(0.01, 1.0),
    gamma=ContinuousParameter(0.0, 1.0),
    min_child_weight=ContinuousParameter(1e-06, 1000),
    colsample_bytree=ContinuousParameter(0.1, 1.0),
)

In [None]:
# Hyperparameter tuner that maximises 'validation:auc' over hyperparameter_ranges.
# The estimator is `clf`, the only estimator this notebook defines.
tuner = HyperparameterTuner(
    estimator=clf,
    objective_metric_name='validation:auc',
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type='Maximize',
    max_jobs=20,           # total training jobs
    max_parallel_jobs=3,   # jobs running at the same time
    base_tuning_job_name='ifd-xgboost-hpo',
    early_stopping_type='Auto',
)

In [None]:
from sagemaker.inputs import TrainingInput

In [None]:
# Same hard-coded processing-job output as the direct-fit cell, rebuilt here for the tuner.
_tuning_job_output = 'ifd-sklearn-process-2021-04-29-02-26-39-184/output'
s3_input_train = TrainingInput(
    s3_data='s3://{}/{}/train/'.format(default_bucket, _tuning_job_output),
    content_type='csv',
)
s3_input_valid = TrainingInput(
    s3_data='s3://{}/{}/valid/'.format(default_bucket, _tuning_job_output),
    content_type='csv',
)

# Start the tuning job on both channels.
tuner.fit(
    {'train': s3_input_train, 'validation': s3_input_valid},
    include_cls_metadata=False,
)

In [None]:
# Display the estimator wrapped by the tuner.
tuner.estimator

In [None]:
# Run the preprocessing script directly as a processing job, outside the pipeline.
# NOTE(review): the script path ('ieee-fraud-detection/preprocessing.py') and the
# 'validation' output name differ from the first ProcessingStep in this notebook
# ('scripts/preprocessing.py', 'valid') — confirm which pair is intended.
sklearn_processor.run(
    code='ieee-fraud-detection/preprocessing.py',
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/valid"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test")
    ]
)

In [None]:
# NOTE(review): this rebinds `step_process`, which an earlier cell already defines.
# This version has no 'test' output, while the evaluation step reads
# step_process...Outputs['test'] — confirm whether this cell is a leftover.
step_process = ProcessingStep(
    name='IFD-Split_and_Preprocess',
    processor=sklearn_processor,
    inputs=[
      ProcessingInput(source=input_data, destination='/opt/ml/processing/input'),  
    ],
    outputs=[
        ProcessingOutput(output_name='train', source='/opt/ml/processing/train'),
        ProcessingOutput(output_name='valid', source='/opt/ml/processing/valid')
    ],
    code='ieee-fraud-detection/preprocessing.py'
)

In [None]:
# NOTE(review): `s3` is not used by any other visible cell.
s3 = boto3.resource('s3')

# Upload the local dataset folder under s3://<default_bucket>/ieee-fraud-detection.
# NOTE(review): DATA_DIR is first assigned in a later cell, so on a fresh kernel
# this cell raises NameError when the notebook is run top to bottom.
base_uri = f's3://{default_bucket}/ieee-fraud-detection'
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=DATA_DIR, 
    desired_s3_uri=base_uri,
)
print(input_data_uri)

In [None]:
import multiprocessing
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from utils.ordinal_encoder import OrdinalEncoder

In [None]:
def str_to_int(x):
    """Return ``x`` untouched when it is null, otherwise ``str(int(x))``.

    Args:
        x: A scalar; null values (as judged by ``pd.isnull``) pass through.

    Returns:
        The original null value, or the integer part of ``x`` as a string.
    """
    if pd.isnull(x):
        return x
    return str(int(x))

In [None]:
# Local dataset folder, relative to the notebook's working directory.
DATA_DIR = '../../../../Data/ieee-fraud-detection'
# Seed constant — presumably meant for the split / model cells; verify it is actually referenced.
RANDOM_STATE = 42

In [None]:
# Read the two training tables from DATA_DIR, the dataset folder set in the
# preceding configuration cell (no `local_data_path` exists in this notebook).
train_identity = pd.read_csv(os.path.join(DATA_DIR, 'train_identity.csv'))
train_transaction = pd.read_csv(os.path.join(DATA_DIR, 'train_transaction.csv'))

# Left join on TransactionID: every transaction row is kept, with identity
# columns attached where a matching identity row exists.
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [None]:
# Columns treated as categorical: a fixed list plus card1-6, M1-9 and id_12-38.
cat_features = pd.Index(
    ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo']
    + [f'card{i}' for i in range(1, 7)]
    + [f'M{i}' for i in range(1, 10)]
    + [f'id_{i}' for i in range(12, 39)]
)

# Numeric features are all remaining columns except TransactionID,
# TransactionDT and the isFraud label.
# Index.union() is used instead of `|`: pandas deprecated the set-union
# meaning of that operator on Index objects and later removed it.
_non_feature_columns = pd.Index(['TransactionID', 'TransactionDT', 'isFraud'])
num_features = df_train.columns.difference(_non_feature_columns.union(cat_features))
all_features = cat_features.union(num_features)

In [None]:
def str_to_int(x):
    """Return ``x`` as-is when null, else its integer value rendered as a string.

    Same body as the ``str_to_int`` defined in an earlier cell of this notebook.
    """
    if pd.isnull(x):
        return x
    return str(int(x))


# Categorical columns stored with a numeric dtype: turn every non-null value
# into its integer string via str_to_int; nulls pass through unchanged.
_numeric_cat_frame = df_train[cat_features].select_dtypes('number')
int_cat_features = _numeric_cat_frame.columns
df_train[int_cat_features] = _numeric_cat_frame.applymap(str_to_int)

In [None]:
# Cast every categorical column to str.
df_train[cat_features] = df_train[cat_features].astype('str')

# 80/20 split stratified on the isFraud label. The seed is the shared
# RANDOM_STATE constant (42) from the configuration cell, not a second literal.
_target = df_train['isFraud']
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_train[all_features],
    _target,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=_target,
)

In [None]:
# n_jobs = int(0.75 * multiprocessing.cpu_count())
# Categorical branch: ordinal-encode (unknown categories -> NaN), then fill NaN with -1.
cat_pipeline = make_pipeline(
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
    SimpleImputer(strategy='constant', fill_value=-1),
)
# Numeric branch: median imputation only.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'))
transformer = make_column_transformer(
    (cat_pipeline, cat_features),
    (num_pipeline, num_features),
)

# Fit on the training split only, then apply the fitted transformer to the validation split.
X_train = transformer.fit_transform(df_X_train)
X_valid = transformer.transform(df_X_valid)

# Label in the first column, transformed features after it.
# NOTE: this rebinds df_train from the merged DataFrame to a NumPy array.
df_train = np.hstack((df_y_train.values[:, np.newaxis], X_train))
df_valid = np.hstack((df_y_valid.values[:, np.newaxis], X_valid))

In [None]:
# Label column first, then the transformed features.
# NOTE(review): the same two arrays are already built at the end of the preceding cell.
df_train = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
df_valid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)

In [None]:
# LightGBM datasets; the validation set names the training set as its reference.
lgb_dtrain = lgb.Dataset(data=X_train, label=df_y_train)
lgb_dvalid = lgb.Dataset(data=X_valid, label=df_y_valid, reference=lgb_dtrain)

In [None]:
# LightGBM training parameters: binary objective evaluated with AUC.
params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.1,
    'objective': 'binary',
    'is_unbalance': True,
    'min_split_gain': 0,
    'min_sum_hessian_in_leaf': 1e-03,
    'min_child_samples': 20,
    'subsample': 1,
    'subsample_freq': 0,
    'colsample_bytree': 1,
    'reg_alpha': 0,
    'reg_lambda': 0,
    'n_jobs': -1,
    'random_state': 42,
    # 'device': 'gpu',
}

In [None]:
# Up to 1000 boosting rounds, with early stopping set to 10 rounds.
num_boost_round = 1000
early_stopping_rounds = 10
# Passed to lgb.train as `evals_result`.
evals_result = {}

# NOTE(review): newer LightGBM releases moved early_stopping_rounds / evals_result /
# verbose_eval to callbacks — confirm the installed version still accepts them.
lgb_clf = lgb.train(
    params,
    lgb_dtrain,
    num_boost_round=num_boost_round,
    valid_sets=[lgb_dtrain, lgb_dvalid],
    valid_names=['train', 'valid'],
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_result,
    verbose_eval=0,
)

In [None]:
import boto3
import sagemaker


# NOTE(review): these four names are already assigned by an earlier session-setup cell;
# this cell recomputes them with the same expressions.
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = 'AmazonSageMaker-ExecutionRole-20210114T163887' # sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

In [None]:
# NOTE(review): `s3` is not used by any other visible cell.
s3 = boto3.resource('s3')

# Upload the DATA_DIR folder under s3://<default_bucket>/ieee-fraud-detection and print the returned URI.
# NOTE(review): an earlier cell performs the same upload and also assigns input_data_uri.
base_uri = f"s3://{default_bucket}/ieee-fraud-detection"
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=DATA_DIR, 
    desired_s3_uri=base_uri,
)
print(input_data_uri)

In [None]:
# Rebinds base_uri to the '/data' sub-prefix; this value is what the uploadDirectory call receives.
base_uri = f"s3://{default_bucket}/ieee-fraud-detection/data"

In [None]:
def uploadDirectory(path, bucketname, s3_client=None):
    """Upload every file under ``path`` to S3, keeping the sub-folder layout.

    Args:
        path: Local directory; it is walked recursively.
        bucketname: Either a bare bucket name, or an ``s3://bucket/key/prefix``
            URI. With a URI, the part after the bucket is prepended to every
            object key.
        s3_client: Optional object exposing ``upload_file(filename, bucket, key)``.
            When omitted, ``boto3.client('s3')`` is used.
    """
    if bucketname.startswith('s3://'):
        # S3 only accepts the bare bucket name, so split the URI into
        # bucket + key prefix instead of forwarding it verbatim.
        bucket, _, key_prefix = bucketname[len('s3://'):].partition('/')
        key_prefix = key_prefix.strip('/')
    else:
        bucket, key_prefix = bucketname, ''

    s3C = boto3.client('s3') if s3_client is None else s3_client
    for root, _dirs, files in os.walk(path):
        for file in files:
            local_file = os.path.join(root, file)
            # Relative path (forward slashes) keeps same-named files in
            # different sub-folders from overwriting each other.
            rel_key = os.path.relpath(local_file, path).replace(os.sep, '/')
            key = f'{key_prefix}/{rel_key}' if key_prefix else rel_key
            s3C.upload_file(local_file, bucket, key)

In [None]:
# NOTE(review): base_uri is a full s3:// URI — confirm uploadDirectory accepts a URI
# here rather than a bare bucket name.
uploadDirectory(DATA_DIR, base_uri)

In [None]:
# Module-level S3 client; no other visible cell reads this name (uploadDirectory creates its own).
s3C = boto3.client('s3')

In [None]:
# Scratch check: list everything under the local Data folder.
list(os.walk('../../../../Data'))

In [None]:
# Scratch check: display the configured dataset folder.
DATA_DIR