In [None]:
from utils import *
# Report library versions (helper from utils; its exact output is not visible here).
print_version()
# Load the 'default' configuration, which supplies the seeds read below.
cfg = load_config('default')
# random_seed is reused later as random_state when sampling the rows to drop.
random_seed = cfg['random_seed']
# Seed TensorFlow and NumPy before any data handling.
tf.random.set_seed(cfg['tf_seed'])
np.random.seed(cfg['np_seed'])
# Load features and labels, then scale the features.
# NOTE(review): scaling runs before the train/test split, so whatever statistics
# standard_scale uses presumably come from the full dataset — confirm this is intended.
data_x, data_y = load_credit_card_dataset()
scaled_x = standard_scale(data_x)

# Keyword arguments for the split come from the "supervised.test_split" config entry;
# stratify=data_y is passed so the split is stratified on the labels.
test_split_clm = load_config("supervised.test_split")
x_train, x_test, y_train, y_test = train_test_split(scaled_x, data_y, stratify=data_y, **test_split_clm)

In [None]:
# Class balance of the training labels: (count of label 0, count of label 1)
(y_train == 0).sum(), (y_train == 1).sum()

In [None]:
# Drop 90% of the positive (label 1) rows from the training set; label-0 rows are untouched.
# The sample is reproducible because random_state is fixed to random_seed.
# NOTE: this cell rebinds x_train / y_train, so running it twice removes a further 90% of
# the remaining positives. Re-run the split cell first when re-executing.
drop_idx = y_train[y_train==1].sample(frac=0.90, random_state=random_seed).index
# Reassign instead of inplace=True so the objects returned by train_test_split are not
# mutated in place.
x_train = x_train.drop(labels=drop_idx)
y_train = y_train.drop(labels=drop_idx)

In [None]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test
print([split.shape for split in (x_train, x_test, y_train, y_test)])

In [None]:
# Check how many fraudulent (label 1) cases remain after dropping 90% of them:
# (count of label 0, count of label 1)
(y_train == 0).sum(), (y_train == 1).sum()

In [None]:
# 9.2 Supervised model

# k-fold cross-validation
k_fold_prm = load_config('supervised.k_fold')
# StratifiedKFold is used, so each fold keeps a label distribution close to that of y_train.
k_fold = StratifiedKFold(**k_fold_prm)
# Out-of-fold predictions, one row per training sample. A float dtype keeps the scores
# numeric instead of storing them in an object column.
predictions_based_on_k_folds = pd.DataFrame(
    index=y_train.index, columns=['prediction'], dtype=float
)
# Training
training_scores = []  # training log loss per fold
validation_scores = []  # validation log loss per fold

# The split should not depend on the feature values, so a zero-filled placeholder is passed
# as X; only the labels drive the stratification. to_numpy() replaces the deprecated
# Series.ravel().
for train_idx, validation_idx in k_fold.split(np.zeros(len(x_train)), y_train.to_numpy()):
    x_train_fold, x_validation_fold = x_train.iloc[train_idx, :], x_train.iloc[validation_idx, :]  # 2-D
    y_train_fold, y_validation_fold = y_train.iloc[train_idx], y_train.iloc[validation_idx]  # 1-D

    lgb_train = lgb.Dataset(x_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(x_validation_fold, y_validation_fold, reference=lgb_train)
    lgb_params = load_config('supervised.lgb')
    # Up to 2000 boosting rounds, stopping early after 200 rounds without improvement on
    # the validation fold. valid_sets is passed as a list, its documented form.
    gbm = lgb.train(lgb_params, lgb_train,
                    num_boost_round=2000,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(200)]
                    )

    y_train_predict = gbm.predict(x_train_fold, num_iteration=gbm.best_iteration)
    training_score = log_loss(y_train_fold, y_train_predict)

    y_validation_predict = gbm.predict(x_validation_fold, num_iteration=gbm.best_iteration)
    # Store this fold's validation predictions at the matching row labels.
    predictions_based_on_k_folds.loc[x_validation_fold.index, 'prediction'] = y_validation_predict
    validation_score = log_loss(y_validation_fold, y_validation_predict)

    print(f"training log loss:  {training_score}")
    print(f"validation log loss: {validation_score}")
    training_scores.append(training_score)
    validation_scores.append(validation_score)

In [None]:
# Log loss of the out-of-fold predictions over the whole training set
log_loss_light_gbm_gradient_boosting = log_loss(
    y_train,
    predictions_based_on_k_folds['prediction'],
)
print(f'Light GBM Gradient Boosting Log loss: {log_loss_light_gbm_gradient_boosting}')

In [None]:
# Plot the out-of-fold results; only the second returned value is kept.
_, average_precision = plot_results(
    y_train,
    predictions_based_on_k_folds['prediction'],
    True,
)
print(f"average_precision: {average_precision}")

In [None]:
# Evaluate on the test set.
# `gbm` is whatever booster the k-fold loop assigned last, i.e. the final fold's model.
predictions = pd.Series(
    gbm.predict(x_test, num_iteration=gbm.best_iteration),
    index=x_test.index,
)
preds, average_precision = plot_results(y_test, predictions, True)

In [None]:
# Calculate precision at 75% recall from the `preds` returned by plot_results.
# "anomaly_score" is presumably the column precision_analysis reads — TODO confirm.
# NOTE(review): `preds` is both input and output, so re-running this cell feeds the
# already-processed frame back in — confirm that precision_analysis tolerates this.
preds, precision = precision_analysis(preds, "anomaly_score", 0.75)
print(f'Precision at 75% recall {round(precision,4)}')

In [None]:
# 9.3 Unsupervised model
# Oversample the positive (label 1) rows: the training set plus `over_sample_multiplier`
# extra copies of every positive row. Original index labels are kept, so the oversampled
# frames contain duplicate index labels.
over_sample_multiplier: int = load_config('unsupervised.over_sample_multiplier')
positive_mask = y_train == 1
# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
# It returns new objects, so x_train / y_train themselves are left untouched.
x_train_oversampled = pd.concat([x_train] + [x_train[positive_mask]] * over_sample_multiplier)
y_train_oversampled = pd.concat([y_train] + [y_train[positive_mask]] * over_sample_multiplier)
# View shape
x_train_oversampled.shape, y_train_oversampled.shape

In [None]:
# Autoencoder: one linear hidden layer of 40 units with L1 activity regularisation, a small
# dropout, and a linear output layer that reconstructs the input features.
# The input/output width is read from x_train rather than hard-coded, so the model keeps
# matching the data if the number of feature columns changes.
n_features = x_train.shape[1]
model = keras.Sequential([
    # Note: 10e-5 equals 1e-4. The layer name 'hidden_layer' is looked up again later
    # to extract the learned representation.
    Dense(units=40, activation='linear', activity_regularizer=regularizers.l1(10e-5), input_dim=n_features, name='hidden_layer'),
    Dropout(0.02),
    Dense(units=n_features, activation='linear')
])
model.summary()

In [None]:
# Compile the autoencoder, then train it to reproduce its own input.
# compile() and fit() keyword arguments both come from the configuration.
compile_prm = load_config('unsupervised.compile')
model.compile(**compile_prm)
fit_prm = load_config('unsupervised.fit')
# Despite its name, x_illegal holds the rows whose label is 0.
label_is_zero = y_train_oversampled == 0
x_illegal = x_train_oversampled[label_is_zero]
history = model.fit(x_illegal, x_illegal, **fit_prm)

In [None]:
# Display the autoencoder's training input (the label-0 rows of the oversampled training set).
x_illegal

In [None]:
# Reconstruct the training features and score each row with the anomaly_scores helper,
# which receives the original features and their reconstruction.
predictions_train = model.predict(x=x_train, verbose=1)
annomaly_scores_ae_train = anomaly_scores(x_train, predictions_train)
preds, average_precision = plot_results(
    y_train,
    annomaly_scores_ae_train,
    True,
)

In [None]:
# Display the anomaly scores computed for the training set.
annomaly_scores_ae_train

In [None]:
# Evaluate on the test set: reconstruct the test features, score them, and plot the results.
predictions = model.predict(x=x_test, verbose=1)
anomaly_scores_ae = anomaly_scores(x_test, predictions)
preds, average_precision = plot_results(
    y_test,
    anomaly_scores_ae,
    True,
)

In [None]:
# Calculate precision at 75% recall from the `preds` returned by plot_results.
# "anomaly_score" is presumably the column precision_analysis reads — TODO confirm.
# NOTE(review): `preds` is both input and output, so re-running this cell feeds the
# already-processed frame back in — confirm that precision_analysis tolerates this.
preds, precision = precision_analysis(preds, "anomaly_score", 0.75)
print(f'Precision at 75% recall {round(precision,4)}')

In [None]:
# 9.4 Semi-supervised model

# Sub-model that maps the autoencoder input (29 features) to the 40 activations of the
# layer named "hidden_layer".
hidden_layer = model.get_layer("hidden_layer")
intermediate_model = keras.Model(inputs=model.input, outputs=hidden_layer.output)
intermediate_output_train = intermediate_model.predict(x_train)
intermediate_output_test = intermediate_model.predict(x_test)

# Wrap the activations in DataFrames that share the row index of the corresponding split.
intermediate_output_train_df = pd.DataFrame(intermediate_output_train, index=x_train.index)
intermediate_output_test_df = pd.DataFrame(intermediate_output_test, index=x_test.index)

# The *_with_ae frames combine the 29 original features with the 40 autoencoder-derived
# representations, matched row by row on the index.
merge_on_index = dict(left_index=True, right_index=True)
x_train_with_ae = x_train.merge(intermediate_output_train_df, **merge_on_index)
x_test_with_ae = x_test.merge(intermediate_output_test_df, **merge_on_index)
y_train_with_ae = y_train.copy()
x_train_with_ae.shape, y_train_with_ae.shape, x_train_with_ae.columns

In [None]:
# Training on the original features plus the autoencoder representation.
# Reuses the `k_fold` splitter created in the supervised section.
training_scores = []  # training log loss per fold
validation_scores = []  # validation log loss per fold

# Out-of-fold predictions, one row per training sample. A float dtype keeps the scores
# numeric instead of storing them in an object column.
predictions_based_on_k_folds = pd.DataFrame(
    index=y_train_with_ae.index, columns=['prediction'], dtype=float
)
# Only the labels drive the stratified split, so a zero-filled placeholder is passed as X.
# to_numpy() replaces the deprecated Series.ravel().
for train_idx, validation_idx in k_fold.split(np.zeros(len(x_train_with_ae)), y_train_with_ae.to_numpy()):
    x_train_fold, x_validation_fold = x_train_with_ae.iloc[train_idx, :], x_train_with_ae.iloc[validation_idx, :]  # 2-D
    y_train_fold, y_validation_fold = y_train_with_ae.iloc[train_idx], y_train_with_ae.iloc[validation_idx]  # 1-D

    lgb_train = lgb.Dataset(x_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(x_validation_fold, y_validation_fold, reference=lgb_train)
    lgb_params = load_config('supervised.lgb')
    # Up to 2000 boosting rounds, stopping early after 200 rounds without improvement on
    # the validation fold. valid_sets is passed as a list, its documented form.
    gbm = lgb.train(lgb_params, lgb_train,
                    num_boost_round=2000,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(200)]
                    )

    y_train_predict = gbm.predict(x_train_fold, num_iteration=gbm.best_iteration)
    training_score = log_loss(y_train_fold, y_train_predict)

    y_validation_predict = gbm.predict(x_validation_fold, num_iteration=gbm.best_iteration)
    # Store this fold's validation predictions at the matching row labels.
    predictions_based_on_k_folds.loc[x_validation_fold.index, 'prediction'] = y_validation_predict
    validation_score = log_loss(y_validation_fold, y_validation_predict)

    print(f"training log loss:  {training_score}")
    print(f"validation log loss: {validation_score}")
    training_scores.append(training_score)
    validation_scores.append(validation_score)

In [None]:
# Report the log loss of the out-of-fold predictions, rounded to four decimals
log_loss_light_gbm_gradient_boosting = log_loss(
    y_train_with_ae,
    predictions_based_on_k_folds['prediction'],
)
print(f'LightGBM Gradient Boosting Log Loss: {round(log_loss_light_gbm_gradient_boosting, 4)}')

In [None]:
# Plot the out-of-fold results for the semi-supervised model
preds, average_precision = plot_results(
    y_train_with_ae,
    predictions_based_on_k_folds['prediction'],
    True,
)

In [None]:
# Evaluate results on the test set.
# `gbm` is whatever booster the k-fold loop assigned last, i.e. the final fold's model.
predictions = pd.Series(
    gbm.predict(x_test_with_ae, num_iteration=gbm.best_iteration),
    index=x_test_with_ae.index,
)
preds, average_precision = plot_results(y_test, predictions, True)

In [None]:
# Calculate precision at 75% recall from the `preds` returned by plot_results and print it
# rounded to four decimals.
# "anomaly_score" is presumably the column precision_analysis reads — TODO confirm.
# NOTE(review): `preds` is both input and output, so re-running this cell feeds the
# already-processed frame back in — confirm that precision_analysis tolerates this.
preds, precision = precision_analysis(preds, "anomaly_score", 0.75)
print(f'{round(precision,4)}')

In [None]:
# Analyze the most important features of the last trained booster:
# one row per feature column, normalised so the importances sum to 1, largest first.
featuresImportance = pd.DataFrame(
    data=list(gbm.feature_importance()),
    index=x_train_with_ae.columns,
    columns=['featImportance'],
)
featuresImportance = featuresImportance / featuresImportance.sum()
featuresImportance = featuresImportance.sort_values(by='featImportance', ascending=False)
featuresImportance

In [None]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test
print([split.shape for split in (x_train, x_test, y_train, y_test)])
