In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ! pip install sweetviz

# import sweetviz as sv
# import IPython

In [None]:
!pip install imbalanced-learn

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from collections import Counter
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay

In [None]:
import gc
import os
import sys

from keras import applications
from keras import backend as K
from keras import layers
from keras import models
from keras import optimizers
import tensorflow as tf

In [None]:
# Switch to the TensorFlow 1.x compatibility API.
# NOTE: this rebinds `tf` and `K`, shadowing `import tensorflow as tf` and
# `from keras import backend as K` from the previous cell.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # disable TF2 behaviours for the code that follows
from tensorflow.compat.v1.keras import backend as K
# NOTE(review): set_session / load_model are not referenced in the part of the
# notebook reviewed here — confirm they are needed further down.
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model

# Loading the Data

In [None]:
# Load the cleaned D&O WW data set from the Kaggle input directory.
DATA_PATH = '../input/project/dando_ww_cleaned_data.txt'
df_dandoww = pd.read_csv(DATA_PATH)
df_dandoww.head()

# Dropping features

In [None]:
# Reference/ID, description and date columns that are removed before modelling.
reference_cols = [
    'LifetimePolicyReference', 'PolicyReference', 'PriorPolicyRef',
    'EclipsePolicyID', 'PriorEclipsePolicyID', 'GroupClass',
    'StatsMinorClassDescription', 'InceptionDate', 'ExpiryDate',
    'RenewalDate', 'SubClassCode',
]
df_dandoww = df_dandoww.drop(columns=reference_cols)
df_dandoww.head()

In [None]:
# Remove the Exposure and LinePct columns.
df_dandoww = df_dandoww.drop(columns=['Exposure', 'LinePct'])
df_dandoww.head()

In [None]:
# Exclude YOA 2020 and 2021, since claims won't be mature enough,
# then renumber the remaining rows from 0.
immature_yoa = df_dandoww.YOA.isin([2020.0, 2021.0])
df_dandoww = df_dandoww[~immature_yoa].reset_index(drop=True)

In [None]:
# Discard rows placed on a 'Binder' or 'Other' basis and renumber the rest from 0.
excluded_basis = df_dandoww.PlacingBasis.isin(['Binder', 'Other'])
df_dandoww = df_dandoww[~excluded_basis].reset_index(drop=True)

In [None]:
# Replace every space inside string values with an underscore (regex replace).
df_dandoww = df_dandoww.replace(' ', '_', regex=True)
df_dandoww.head()

# Classification

In [None]:
# Columns removed for the classification task; ClaimCount is kept because the
# target is derived from it further down.
unused_for_classification = [
    'Inc', 'Inc_Cat', 'ClaimFrequency', 'CLR_Cat',
    'CLR_ExCat', 'Full_Inc_ExCat', 'Inc_ExCat',
    'ILR_Cat', 'ILR_ExCat', 'GNWP',
]
df_dandoww = df_dandoww.drop(columns=unused_for_classification)
df_dandoww.head()

In [None]:
# Making 100% share basis: divide each column below by the row's EffectiveLine.
share_cols = [
    'GGTP', 'GGWP',
    'GrossGrossModelPrice', 'GrossNetModelPrice',
    'GrossGrossTechnicalPrice', 'GrossNetTechnicalPrice',
]
for share_col in share_cols:
    df_dandoww[share_col] = df_dandoww[share_col] / df_dandoww['EffectiveLine']

In [None]:
# Rename the columns: prefix each column with 'Full_' in a single rename call.
full_share_names = {
    old_name: f'Full_{old_name}'
    for old_name in (
        'GGTP', 'GGWP',
        'GrossGrossModelPrice', 'GrossNetModelPrice',
        'GrossGrossTechnicalPrice', 'GrossNetTechnicalPrice',
    )
}
df_dandoww = df_dandoww.rename(columns=full_share_names)

In [None]:
# EffectiveLine is dropped here, after the division by it earlier in the notebook.
df_dandoww = df_dandoww.drop(columns='EffectiveLine')

In [None]:
# X is an independent copy of df_dandoww; the target column and the
# primary/excess splits below are built from X.
X = df_dandoww.copy()
X.head()

In [None]:
# Remove the three YOA columns from both the source frame and the working copy X.
yoa_cols = ['YOA', 'YOA_cat', 'YOA_recent']
df_dandoww = df_dandoww.drop(columns=yoa_cols)
X = X.drop(columns=yoa_cols)

In [None]:
# Binary target: 1.0 when ClaimCount is non-zero, 0.0 otherwise (a missing
# ClaimCount counts as "no claim"). This is an explicit indicator instead of the
# ClaimCount / ClaimCount trick, which relied on 0/0 evaluating to NaN.
X['Class'] = X['ClaimCount'].fillna(0).ne(0).astype(float)
X.head()

In [None]:
# ClaimCount has been turned into the Class target, so drop the raw count from both frames.
df_dandoww = df_dandoww.drop(columns='ClaimCount')
X = X.drop(columns='ClaimCount')

In [None]:
# Primary business: every row whose Excess flag is not 'Excess', without the
# Excess and XS_100Pct_USD columns, renumbered from 0.
X_p = (
    X[X.Excess != 'Excess']
    .drop(columns=['Excess', 'XS_100Pct_USD'])
    .reset_index(drop=True)
)
X_p.head()

In [None]:
# Excess business: every row whose Excess flag is not 'Primary', without the
# Excess column, renumbered from 0.
X_e = (
    X[X.Excess != 'Primary']
    .drop(columns='Excess')
    .reset_index(drop=True)
)
X_e.head()

In [None]:
# Data scaling
# Min-max scale only the non-object columns whose maximum is greater than 1;
# columns whose maximum is at most 1 (including the Class target) are left alone.
to_scale = [col for col in X.columns[X.dtypes != 'object'] if X[col].max() > 1]
mms = MinMaxScaler()
# Keep X's index on the scaled frame so the assignment below aligns row-for-row
# even if X is ever not indexed 0..n-1 (a default RangeIndex would misalign
# silently and introduce NaNs).
scaled = pd.DataFrame(mms.fit_transform(X[to_scale]), columns=to_scale, index=X.index)

X_scaled = X.copy()
# Replace original columns with scaled ones
X_scaled[to_scale] = scaled

X_scaled.head()

In [None]:
# Divide the scaled frame into primary and excess business.
# Primary keeps rows that are not flagged 'Excess'; excess keeps rows that are
# not flagged 'Primary'. The Excess column is removed from both.
X_p_scaled = (
    X_scaled[X_scaled.Excess != 'Excess']
    .drop(columns='Excess')
    .reset_index(drop=True)
)

X_e_scaled = (
    X_scaled[X_scaled.Excess != 'Primary']
    .drop(columns='Excess')
    .reset_index(drop=True)
)

### One-hot encoding

In [None]:
# Categorical columns to one-hot encode. The full data set still carries the
# Excess flag, so it is encoded too (kept in the same position as before so the
# resulting column order is unchanged).
split_categoricals = ['PlacingBasis', 'SubClass', 'StatsMinorClassCode',
                      'Territory', 'LeaderStatus', 'BrokerUltimateName',
                      'PLR_band', 'PLR_band_ex_adj']
all_categoricals = ['PlacingBasis', 'SubClass', 'StatsMinorClassCode',
                    'Territory', 'LeaderStatus', 'BrokerUltimateName',
                    'Excess', 'PLR_band', 'PLR_band_ex_adj']

X_encoded = pd.get_dummies(X_scaled, columns=all_categoricals)
X_encoded_p = pd.get_dummies(X_p_scaled, columns=split_categoricals)
X_encoded_e = pd.get_dummies(X_e_scaled, columns=split_categoricals)

In [None]:
# Targets for the full, primary and excess data sets (independent copies of Class).
y, y_p, y_e = (
    frame['Class'].copy()
    for frame in (X_scaled, X_p_scaled, X_e_scaled)
)

In [None]:
# The target must not remain among the features: remove Class from each encoded frame.
for encoded_frame in (X_encoded, X_encoded_p, X_encoded_e):
    encoded_frame.drop(columns='Class', inplace=True)

### EDA

In [None]:
# Summary statistics for the full (unscaled) data set X.
X.describe()

In [None]:
# Bar chart of how many rows fall in each class for the full data set.
class_counts = Counter(y)
fig, ax = plt.subplots()
ax.bar(list(class_counts.keys()), list(class_counts.values()))
ax.set_xticks(np.arange(2))
ax.set_title('Class of D&O WW')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# report = sv.analyze(X_scaled, 'Class')
# report.show_html(filepath='D&O WW.html',
#                  open_browser=True,
#                  layout='vertical',
#                  scale=None)

In [None]:
# X[["Excess", "Class"]].groupby(['Excess'], as_index=False).mean().sort_values(
#     by='Class', ascending=False)

In [None]:
# X[["Territory", "Class"]].groupby(['Territory'], as_index=False).mean().sort_values(
#     by='Class', ascending=False)

In [None]:
# X_e_scaled[["BrokerUltimateName", "Class"]].groupby(['BrokerUltimateName'], as_index=False).mean().sort_values(
#     by='Class', ascending=False)

#### Primary Business

In [None]:
# Summary statistics for the primary-business subset.
X_p.describe()

In [None]:
# Skewness of the numeric primary-business columns. X_p still holds object
# (categorical) columns, on which DataFrame.skew() raises in pandas >= 2.0,
# so restrict the frame to numeric columns first.
X_p.select_dtypes(include='number').skew()

In [None]:
# Bar chart of how many rows fall in each class for primary business.
class_counts_p = Counter(y_p)
fig, ax = plt.subplots()
ax.bar(list(class_counts_p.keys()), list(class_counts_p.values()))
ax.set_xticks(np.arange(2))
ax.set_title('Class of Primary Business')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of Full_GrossGrossModelPrice on the raw scale (density curve only)
# and after a log1p transform (histogram + density), drawn as two figures.
ggmp_raw = X_p.Full_GrossGrossModelPrice
ggmp_panels = (
    (ggmp_raw, "Gross Gross Model Price", {'hist': False}),
    (np.log1p(ggmp_raw), "Log Gross Gross Model Price", {}),
)
for values, panel_title, distplot_kwargs in ggmp_panels:
    plt.figure(figsize=(7,5))
    sns.distplot(values, **distplot_kwargs)
    plt.title(panel_title)
    plt.xlim(0, values.max()+1)
    plt.tight_layout()
    plt.show()

In [None]:
# Looking at the skewness of Full_GrossNetModelPrice
# (titles now name the Gross Net column that is actually plotted; the previous
# titles were copied from the Gross Gross Model Price cell).
plt.figure(figsize=(7,5))
sns.distplot(X_p.Full_GrossNetModelPrice, hist=False)
plt.title("Gross Net Model Price is highly skewed")
plt.xlim(0, X_p.Full_GrossNetModelPrice.max()+1)
plt.tight_layout()
plt.show()

# Log transform
plt.figure(figsize=(7,5))
sns.distplot(np.log1p(X_p.Full_GrossNetModelPrice))
plt.title("Log Gross Net Model Price looks to be more normal")
plt.xlim(0, np.log1p(X_p.Full_GrossNetModelPrice).max()+1)
plt.tight_layout()
plt.show()

In [None]:
# Annotated correlation heatmap of the numeric primary-business columns.
# Object (categorical) columns are filtered out explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(X_p.select_dtypes(include='number').corr(),
                      vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
# report_p = sv.analyze(X_p_scaled, 'Class')
# report_p.show_html(filepath='D&O WW Primary.html',
#                  open_browser=True,
#                  layout='vertical',
#                  scale=None)

#### Excess Business

In [None]:
# Summary statistics for the excess-business subset.
X_e.describe()

In [None]:
# Skewness of the numeric excess-business columns. X_e still holds object
# (categorical) columns, on which DataFrame.skew() raises in pandas >= 2.0,
# so restrict the frame to numeric columns first.
X_e.select_dtypes(include='number').skew()

In [None]:
# Bar chart of how many rows fall in each class for excess business.
class_counts_e = Counter(y_e)
fig, ax = plt.subplots()
ax.bar(list(class_counts_e.keys()), list(class_counts_e.values()))
ax.set_xticks(np.arange(2))
ax.set_title('Class of Excess Business')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Looking at the skewness of Full_GGTP
# (titles now name the plotted column; they previously said
# "Gross Gross Model Price", copied from another cell).
plt.figure(figsize=(7,5))
sns.distplot(X_e.Full_GGTP, hist=False)
plt.title("Full GGTP")
plt.xlim(0, X_e.Full_GGTP.max()+1)
plt.tight_layout()
plt.show()

# Log transform
plt.figure(figsize=(7,5))
sns.distplot(np.log1p(X_e.Full_GGTP))
plt.title("Log Full GGTP")
plt.xlim(0, np.log1p(X_e.Full_GGTP).max()+1)
plt.tight_layout()
plt.show()

In [None]:
# Looking at the skewness of Full_GrossGrossTechnicalPrice
plt.figure(figsize=(7,5))
sns.distplot(X_e.Full_GrossGrossTechnicalPrice, hist=False)
plt.title("Full Gross Gross Technical Price")
plt.xlim(0, X_e.Full_GrossGrossTechnicalPrice.max()+1)
plt.tight_layout()
plt.show()

# Log transform (title fixed: this is the Technical Price, not the Model Price)
plt.figure(figsize=(7,5))
sns.distplot(np.log1p(X_e.Full_GrossGrossTechnicalPrice))
plt.title("Log Full Gross Gross Technical Price")
plt.xlim(0, np.log1p(X_e.Full_GrossGrossTechnicalPrice).max()+1)
plt.tight_layout()
plt.show()

In [None]:
# Annotated correlation heatmap of the numeric excess-business columns.
# Object (categorical) columns are filtered out explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(X_e.select_dtypes(include='number').corr(),
                      vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
# report_e = sv.analyze(X_e_scaled, 'Class')
# report_e.show_html(filepath='D&O WW Excess.html',
#                  open_browser=True,
#                  layout='vertical',
#                  scale=None)

### Log-transforming the skewed features

In [None]:
# Apply log1p in place to the selected columns of each subset.
# Primary business
primary_log_cols = ['Full_GrossGrossModelPrice', 'Full_GrossNetModelPrice']
for log_col in primary_log_cols:
    X_p[log_col] = np.log1p(X_p[log_col])

# Excess business
excess_log_cols = [
    'Full_GrossGrossModelPrice', 'Full_GrossNetModelPrice',
    'Full_GrossGrossTechnicalPrice', 'Full_GrossNetTechnicalPrice',
    'Ded_100Pct_USD', 'Full_GGTP', 'RARC',
]
for log_col in excess_log_cols:
    X_e[log_col] = np.log1p(X_e[log_col])

In [None]:
# Dropping highly correlated features (primary business)
# Correlate numeric feature columns only: select_dtypes avoids the error that
# DataFrame.corr() raises on object columns in pandas >= 2.0, and leaving out
# the 'Class' target guarantees the label itself can never be dropped here.
correlation_matrix = (
    X_p.select_dtypes(include='number')
       .drop(columns='Class', errors='ignore')
       .corr()
)

# For each pair with |r| > 0.9, flag the column that appears later in the frame,
# so the earlier column of every correlated pair is the one that is kept.
correlated_features = set()
for i, colname in enumerate(correlation_matrix.columns):
    if (correlation_matrix.iloc[i, :i].abs() > 0.9).any():
        correlated_features.add(colname)

print(correlated_features)
X_p.drop(columns=list(correlated_features), inplace=True)
X_p.head()

In [None]:
# Show again the set of primary-business columns flagged as highly correlated
# (computed from X_p in the previous cell).
print(correlated_features)

In [None]:
# Dropping highly correlated features (excess business)
# Correlate numeric feature columns only: select_dtypes avoids the error that
# DataFrame.corr() raises on object columns in pandas >= 2.0, and leaving out
# the 'Class' target guarantees the label itself can never be dropped here.
correlation_matrix = (
    X_e.select_dtypes(include='number')
       .drop(columns='Class', errors='ignore')
       .corr()
)

# For each pair with |r| > 0.9, flag the column that appears later in the frame,
# so the earlier column of every correlated pair is the one that is kept.
correlated_features = set()
for i, colname in enumerate(correlation_matrix.columns):
    if (correlation_matrix.iloc[i, :i].abs() > 0.9).any():
        correlated_features.add(colname)

print(correlated_features)
X_e.drop(columns=list(correlated_features), inplace=True)
X_e.head()

In [None]:
# Data scaling (primary business, after log transform and correlation filter)
# Min-max scale only the non-object columns whose maximum is greater than 1.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows influence the scaling — confirm this is acceptable.
to_scale = [col for col in X_p.columns[X_p.dtypes != 'object'] if X_p[col].max() > 1]
mms = MinMaxScaler()
# Keep X_p's index on the scaled frame so the assignment below aligns
# row-for-row instead of relying on X_p being indexed 0..n-1.
scaled = pd.DataFrame(mms.fit_transform(X_p[to_scale]), columns=to_scale, index=X_p.index)

X_p_scaled = X_p.copy()
# Replace original columns with scaled ones
X_p_scaled[to_scale] = scaled

X_p_scaled.head()

In [None]:
# Data scaling (excess business, after log transform and correlation filter)
# Min-max scale only the non-object columns whose maximum is greater than 1.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows influence the scaling — confirm this is acceptable.
to_scale = [col for col in X_e.columns[X_e.dtypes != 'object'] if X_e[col].max() > 1]
mms = MinMaxScaler()
# Keep X_e's index on the scaled frame so the assignment below aligns
# row-for-row instead of relying on X_e being indexed 0..n-1.
scaled = pd.DataFrame(mms.fit_transform(X_e[to_scale]), columns=to_scale, index=X_e.index)

X_e_scaled = X_e.copy()
# Replace original columns with scaled ones
X_e_scaled[to_scale] = scaled

X_e_scaled.head()

In [None]:
# One-hot encode the categorical columns of the re-scaled subsets and remove
# the Class target so only features remain.
onehot_cols = ['PlacingBasis', 'SubClass', 'StatsMinorClassCode',
               'Territory', 'LeaderStatus', 'BrokerUltimateName',
               'PLR_band', 'PLR_band_ex_adj']
X_encoded_p = pd.get_dummies(X_p_scaled, columns=onehot_cols).drop(columns='Class')
X_encoded_e = pd.get_dummies(X_e_scaled, columns=onehot_cols).drop(columns='Class')

In [None]:
# Number of feature columns in the encoded primary-business frame.
len(X_encoded_p.columns)

### XGBoost

#### Primary business

In [None]:
# 80/20 train/test split of the primary-business features, stratified on the
# target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X_encoded_p, y_p, test_size=0.2, 
                                                    stratify = y_p, random_state=42)
# dtrain = xgb.DMatrix(X_train, y_train, feature_names=X_encoded.columns)
# dtest = xgb.DMatrix(X_test, y_test, feature_names=X_encoded.columns)

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [4, 5, 6]
# }

# # AUC since data is imbalance
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train, y_train, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test, y_test)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier for primary business; training stops once aucpr on the
# eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the same test split the
# metrics below are reported on, so those metrics may be optimistic.
xgb_params = dict(
    objective='binary:logistic',
    gamma=0.1,
    learning_rate=0.05,
    max_depth=4,
    reg_lambda=10,
    scale_pos_weight=6,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)
clf_xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best eval-set score and the boosting round at which it was reached.
print(f'Best score: {clf_xgb.best_score}')
print(f'Best iteration: {clf_xgb.best_iteration}')

In [None]:
# Feature importances of the fitted primary-business model.
importance = clf_xgb.feature_importances_

# Positions of the ten largest importances, in ascending order of importance.
top10_idx = np.argsort(importance)[-10:]
imp = tuple(zip(top10_idx, importance[top10_idx]))

print('Top 10 most important features')
for col_idx, col_score in imp:
    print('Feature:', X_train.columns[col_idx], '\n Score: %.5f' % col_score)

# plot feature importance (one bar per feature, in column order)
plt.figure(figsize=(8,6))
plt.bar(range(len(importance)), importance)
plt.title('Feature Importance')
plt.xlabel('Features')
plt.ylabel('Score')
plt.show()

In [None]:
# Dropping unimportant features: columns with exactly zero importance in the
# model above are removed from the encoded primary-business frame.
zero_importance = importance == 0
drop_cols = X_train.columns[zero_importance]
X_encoded_p = X_encoded_p.drop(columns=drop_cols)

In [None]:
# make predictions for test data
# Hard class labels feed accuracy and the classification report; predict()
# already returns labels, so no rounding is needed.
y_pred = clf_xgb.predict(X_test)
predictions = y_pred
# ROC AUC is a ranking metric: score it on the positive-class probability.
# Passing thresholded labels (as before) reduces the curve to a single
# operating point and misreports the AUC.
y_score = clf_xgb.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
roc_auc = roc_auc_score(y_test, y_score)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the primary-business test split.
y_prob = clf_xgb.predict_proba(X_test)
probabilities = y_prob[:,1]

# Points of the ROC and precision-recall curves.
fpr_p, tpr_p, _ = roc_curve(y_test, probabilities)
prec_p, recall_p, _ = precision_recall_curve(y_test, probabilities)

# Area under each curve.
roc_auc_p = auc(fpr_p, tpr_p)
pr_auc_p = auc(recall_p, prec_p)
print(f'ROC AUC: {roc_auc_p}')
print(f'Precision-Recall AUC: {pr_auc_p}')

# Draw both curves side by side.
roc_display_p = RocCurveDisplay(fpr=fpr_p, tpr=tpr_p)
pr_display_p = PrecisionRecallDisplay(precision=prec_p, recall=recall_p)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
for curve_display, axis in ((roc_display_p, ax1), (pr_display_p, ax2)):
    curve_display.plot(ax=axis)
plt.show()

In [None]:
# Confusion matrix of clf_xgb on the primary-business test split, shown as
# integer counts with readable class labels.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test, y_test, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit a one-tree model (n_estimators=1) so the first tree can be inspected and
# drawn. It uses the same gamma / learning_rate / max_depth / reg_lambda /
# scale_pos_weight as the early-stopped primary model, without row/column
# subsampling. NOTE: this rebinds clf_xgb to the one-tree model.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.1,
                            learning_rate=0.05,
                            max_depth=4,
                            reg_lambda=10,
                            scale_pos_weight=6,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_train, y_train)

# Print the per-feature scores of the single tree for every importance type.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz node styling. The split-node fill was '#78cbe' (5 hex digits), which
# is not a valid Graphviz colour; a full #rrggbb value is used instead.
# NOTE(review): the intended last digit is unknown — adjust the shade if needed.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe4'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


#### Excess business

In [None]:
# 80/20 train/test split of the excess-business features, stratified on the
# target and seeded for repeatability. This rebinds X_train / X_test / y_train /
# y_test, which previously held the primary-business split.
X_train, X_test, y_train, y_test = train_test_split(X_encoded_e, y_e, test_size=0.2, 
                                                    stratify = y_e, random_state=42)
# dtrain = xgb.DMatrix(X_train, y_train, feature_names=X_encoded.columns)
# dtest = xgb.DMatrix(X_test, y_test, feature_names=X_encoded.columns)

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # AUC since data is imbalance
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train, y_train, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test, y_test)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier for excess business; training stops once aucpr on the
# eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the same test split the
# metrics below are reported on, so those metrics may be optimistic.
xgb_params = dict(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=1,
    scale_pos_weight=3,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)
clf_xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best eval-set score and the boosting round at which it was reached.
print(f'Best score: {clf_xgb.best_score}')
print(f'Best iteration: {clf_xgb.best_iteration}')

In [None]:
# Feature importances of the fitted excess-business model.
importance = clf_xgb.feature_importances_

# Positions of the ten largest importances, in ascending order of importance.
top10_idx = np.argsort(importance)[-10:]
imp = tuple(zip(top10_idx, importance[top10_idx]))

print('Top 10 most important features')
for col_idx, col_score in imp:
    print('Feature:', X_train.columns[col_idx], '\n Score: %.5f' % col_score)

# plot feature importance (one bar per feature, in column order)
plt.figure(figsize=(8,6))
plt.bar(range(len(importance)), importance)
plt.title('Feature Importance')
plt.xlabel('Features')
plt.ylabel('Score')
plt.show()

In [None]:
# Dropping unimportant features: columns with exactly zero importance in the
# model above are removed from the encoded excess-business frame.
zero_importance = importance == 0
drop_cols = X_train.columns[zero_importance]
X_encoded_e = X_encoded_e.drop(columns=drop_cols)

In [None]:
# make predictions for test data
# Hard class labels feed accuracy and the classification report; predict()
# already returns labels, so no rounding is needed.
y_pred = clf_xgb.predict(X_test)
predictions = y_pred
# ROC AUC is a ranking metric: score it on the positive-class probability.
# Passing thresholded labels (as before) reduces the curve to a single
# operating point and misreports the AUC.
y_score = clf_xgb.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
roc_auc = roc_auc_score(y_test, y_score)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the excess-business test split.
y_prob = clf_xgb.predict_proba(X_test)
probabilities = y_prob[:,1]

# Points of the ROC and precision-recall curves.
fpr_e, tpr_e, _ = roc_curve(y_test, probabilities)
prec_e, recall_e, _ = precision_recall_curve(y_test, probabilities)

# Area under each curve.
roc_auc_e = auc(fpr_e, tpr_e)
pr_auc_e = auc(recall_e, prec_e)
print(f'ROC AUC: {roc_auc_e}')
print(f'Precision-Recall AUC: {pr_auc_e}')

# Draw both curves side by side.
roc_display_e = RocCurveDisplay(fpr=fpr_e, tpr=tpr_e)
pr_display_e = PrecisionRecallDisplay(precision=prec_e, recall=recall_e)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
for curve_display, axis in ((roc_display_e, ax1), (pr_display_e, ax2)):
    curve_display.plot(ax=axis)
plt.show()

In [None]:
# Confusion matrix of clf_xgb on the excess-business test split, shown as
# integer counts with readable class labels.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test, y_test, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit a one-tree model (n_estimators=1) so the first tree can be inspected and
# drawn. It uses the same gamma / learning_rate / max_depth / reg_lambda /
# scale_pos_weight as the early-stopped excess model, without row/column
# subsampling. NOTE: this rebinds clf_xgb to the one-tree model.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=1,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=1,
                            scale_pos_weight=3,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_train, y_train)

# Print the per-feature scores of the single tree for every importance type.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz node styling. The split-node fill was '#78cbe' (5 hex digits), which
# is not a valid Graphviz colour; a full #rrggbb value is used instead.
# NOTE(review): the intended last digit is unknown — adjust the shade if needed.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe4'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


## Splitting the Dataset into Training and Testing

In [None]:
# Separately named 80/20 stratified splits for primary (_p) and excess (_e)
# business, used by the resampling experiments below. By this point
# X_encoded_p / X_encoded_e no longer contain the zero-importance columns
# dropped after the baseline XGBoost fits.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_encoded_p, y_p, test_size=0.2, 
                                                    stratify = y_p, random_state=42)

X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_encoded_e, y_e, test_size=0.2, 
                                                    stratify = y_e, random_state=42)

In [None]:
# List the feature columns that remain in the encoded primary-business frame.
print(X_encoded_p.columns)

## Random Over Sampler

In [None]:
# Random over-sampling of the minority class, seeded for repeatability.
# Only the training splits are resampled; the test splits are left as they are.
ros = RandomOverSampler(sampling_strategy = 'minority', random_state=42)

# Primary business
X_train_res_p, y_train_res_p = ros.fit_resample(X_train_p, y_train_p)

# Excess business
X_train_res_e, y_train_res_e = ros.fit_resample(X_train_e, y_train_e)

### XGBoost (ROS)

#### Primary business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # AUC since data is imbalance
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train_res_p, y_train_res_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost on the randomly over-sampled primary training data; training stops
# once aucpr on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the same test split the
# metrics below are reported on, so those metrics may be optimistic.
xgb_params = dict(
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=1,
    scale_pos_weight=3,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)
clf_xgb.fit(
    X_train_res_p, y_train_res_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best eval-set score and the boosting round at which it was reached.
print(f'Best score: {clf_xgb.best_score}')
print(f'Best iteration: {clf_xgb.best_iteration}')

In [None]:
# make predictions for test data
# Hard class labels feed accuracy and the classification report; predict()
# already returns labels, so no rounding is needed.
y_pred = clf_xgb.predict(X_test_p)
predictions = y_pred
# ROC AUC is a ranking metric: score it on the positive-class probability.
# Passing thresholded labels (as before) reduces the curve to a single
# operating point and misreports the AUC.
y_score = clf_xgb.predict_proba(X_test_p)[:, 1]

accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)
roc_auc = roc_auc_score(y_test_p, y_score)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the primary-business test split (ROS model).
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:,1]

# Points of the ROC and precision-recall curves.
fpr_ros_p, tpr_ros_p, _ = roc_curve(y_test_p, probabilities)
prec_ros_p, recall_ros_p, _ = precision_recall_curve(y_test_p, probabilities)

# Area under each curve.
roc_auc_ros_p = auc(fpr_ros_p, tpr_ros_p)
pr_auc_ros_p = auc(recall_ros_p, prec_ros_p)
print(f'ROC AUC: {roc_auc_ros_p}')
print(f'Precision-Recall AUC: {pr_auc_ros_p}')

# Draw both curves side by side.
roc_display_ros_p = RocCurveDisplay(fpr=fpr_ros_p, tpr=tpr_ros_p)
pr_display_ros_p = PrecisionRecallDisplay(precision=prec_ros_p, recall=recall_ros_p)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
for curve_display, axis in ((roc_display_ros_p, ax1), (pr_display_ros_p, ax2)):
    curve_display.plot(ax=axis)
plt.show()

In [None]:
# Confusion matrix of the ROS-trained model on the primary-business test split,
# shown as integer counts with readable class labels.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test_p, y_test_p, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit a one-tree model (n_estimators=1) on the over-sampled primary training
# data so the first tree can be inspected and drawn. It uses the same gamma /
# learning_rate / max_depth / reg_lambda / scale_pos_weight as the early-stopped
# ROS model, without row/column subsampling.
# NOTE: this rebinds clf_xgb to the one-tree model.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=1,
                            scale_pos_weight=3,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_train_res_p, y_train_res_p)

# Print the per-feature scores of the single tree for every importance type.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz node styling. The split-node fill was '#78cbe' (5 hex digits), which
# is not a valid Graphviz colour; a full #rrggbb value is used instead.
# NOTE(review): the intended last digit is unknown — adjust the shade if needed.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe4'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


#### Excess business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # AUC since data is imbalance
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train_res_e, y_train_res_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost on the randomly over-sampled excess training data; training stops
# once aucpr on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the same test split the
# metrics below are reported on, so those metrics may be optimistic.
xgb_params = dict(
    objective='binary:logistic',
    gamma=0.1,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=1,
    scale_pos_weight=3,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)
clf_xgb.fit(
    X_train_res_e, y_train_res_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best eval-set score and the boosting round at which it was reached.
print(f'Best score: {clf_xgb.best_score}')
print(f'Best iteration: {clf_xgb.best_iteration}')

In [None]:
# make predictions for test data
# Hard class labels feed accuracy and the classification report; predict()
# already returns labels, so no rounding is needed.
y_pred = clf_xgb.predict(X_test_e)
predictions = y_pred
# ROC AUC is a ranking metric: score it on the positive-class probability.
# Passing thresholded labels (as before) reduces the curve to a single
# operating point and misreports the AUC.
y_score = clf_xgb.predict_proba(X_test_e)[:, 1]

accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)
roc_auc = roc_auc_score(y_test_e, y_score)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the excess-business test split (ROS model).
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:,1]

# Points of the ROC and precision-recall curves.
fpr_ros_e, tpr_ros_e, _ = roc_curve(y_test_e, probabilities)
prec_ros_e, recall_ros_e, _ = precision_recall_curve(y_test_e, probabilities)

# Area under each curve.
roc_auc_ros_e = auc(fpr_ros_e, tpr_ros_e)
pr_auc_ros_e = auc(recall_ros_e, prec_ros_e)
print(f'ROC AUC: {roc_auc_ros_e}')
print(f'Precision-Recall AUC: {pr_auc_ros_e}')

# Draw both curves side by side.
roc_display_ros_e = RocCurveDisplay(fpr=fpr_ros_e, tpr=tpr_ros_e)
pr_display_ros_e = PrecisionRecallDisplay(precision=prec_ros_e, recall=recall_ros_e)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
for curve_display, axis in ((roc_display_ros_e, ax1), (pr_display_ros_e, ax2)):
    curve_display.plot(ax=axis)
plt.show()

In [None]:
# Confusion matrix of the ROS-trained model on the excess-business test split,
# shown as integer counts with readable class labels.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test_e, y_test_e, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit a single-tree model (n_estimators=1) on the resampled excess-business
# data so that the first tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model fitted above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.1,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=1,
                            scale_pos_weight=3,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_train_res_e, y_train_res_e)
bst = clf_xgb.get_booster()
# Print every importance measure the booster exposes for the tree's features.
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling. The split-node colour was '#78cbe' (five hex digits), which
# Graphviz does not accept; '#78cbe1' is the intended six-digit light blue.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe1'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


## SMOTE

In [None]:
# One seeded SMOTE sampler, reused for both lines of business; with
# sampling_strategy='minority' only the minority class is resampled.
sm = SMOTE(random_state=42, sampling_strategy='minority')

# Primary business
X_train_sm_p, y_train_sm_p = sm.fit_resample(X=X_train_p, y=y_train_p)

# Excess business
X_train_sm_e, y_train_sm_e = sm.fit_resample(X=X_train_e, y=y_train_e)

### XGBoost (SMOTE)

#### Primary business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the classes are imbalanced.
# # Each tree sees a random 90% of the rows and 50% of the columns, which
# # speeds up training and helps prevent overfitting.
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train_sm_p, y_train_sm_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# Primary-business classifier fitted on the SMOTE-resampled training set.
# NOTE(review): eval_set is the test split, so early stopping is tuned on the
# same rows that the following cells report metrics for.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    learning_rate=0.1,
    max_depth=6,
    gamma=0.1,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.9,
    colsample_bytree=0.5,
)
# Stop once 'aucpr' on the evaluation set has not improved for 10 rounds.
clf_xgb.fit(
    X_train_sm_p,
    y_train_sm_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best evaluation-set score ('aucpr') and the boosting round that reached it,
# as recorded by the early-stopped fit in the previous cell.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# Evaluate the primary-business model on the held-out test set.
y_pred = clf_xgb.predict(X_test_p)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)
# ROC AUC is a ranking metric: it must be computed from the positive-class
# probabilities. Feeding it thresholded 0/1 labels collapses the curve to a
# single operating point and understates the model's ranking quality.
roc_auc = roc_auc_score(y_test_p, clf_xgb.predict_proba(X_test_p)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the held-out primary-business rows.
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:, 1]

# ROC curve: points, area under them, then the display object.
fpr_sm_p, tpr_sm_p, _ = roc_curve(y_test_p, probabilities)
roc_auc_sm_p = auc(fpr_sm_p, tpr_sm_p)
roc_display_sm_p = RocCurveDisplay(fpr=fpr_sm_p, tpr=tpr_sm_p)

# Precision-recall curve, with the area taken over recall (x) against precision (y).
prec_sm_p, recall_sm_p, _ = precision_recall_curve(y_test_p, probabilities)
pr_display_sm_p = PrecisionRecallDisplay(precision=prec_sm_p, recall=recall_sm_p)
pr_auc_sm_p = auc(recall_sm_p, prec_sm_p)

print('ROC AUC:', roc_auc_sm_p)
print('Precision-Recall AUC:', pr_auc_sm_p)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_sm_p.plot(ax=ax1)
pr_display_sm_p.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix on the held-out primary-business rows, shown as integer counts.
plot_confusion_matrix(
    clf_xgb,
    X_test_p,
    y_test_p,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Refit a single-tree model (n_estimators=1) on the SMOTE-resampled
# primary-business data so that the first tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model fitted above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.1,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_train_sm_p, y_train_sm_p)
bst = clf_xgb.get_booster()
# Print every importance measure the booster exposes for the tree's features.
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling. The split-node colour was '#78cbe' (five hex digits), which
# Graphviz does not accept; '#78cbe1' is the intended six-digit light blue.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe1'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


#### Excess business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the classes are imbalanced.
# # Each tree sees a random 90% of the rows and 50% of the columns, which
# # speeds up training and helps prevent overfitting.
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_train_sm_e, y_train_sm_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# Excess-business classifier fitted on the SMOTE-resampled training set.
# NOTE(review): eval_set is the test split, so early stopping is tuned on the
# same rows that the following cells report metrics for.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    learning_rate=0.05,
    max_depth=5,
    gamma=0.1,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.9,
    colsample_bytree=0.5,
)
# Stop once 'aucpr' on the evaluation set has not improved for 10 rounds.
clf_xgb.fit(
    X_train_sm_e,
    y_train_sm_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Best evaluation-set score ('aucpr') and the boosting round that reached it,
# as recorded by the early-stopped fit in the previous cell.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# Evaluate the excess-business model on the held-out test set.
y_pred = clf_xgb.predict(X_test_e)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)
# ROC AUC is a ranking metric: it must be computed from the positive-class
# probabilities. Feeding it thresholded 0/1 labels collapses the curve to a
# single operating point and understates the model's ranking quality.
roc_auc = roc_auc_score(y_test_e, clf_xgb.predict_proba(X_test_e)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities for the held-out excess-business rows.
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:, 1]

# ROC curve: points, area under them, then the display object.
fpr_sm_e, tpr_sm_e, _ = roc_curve(y_test_e, probabilities)
roc_auc_sm_e = auc(fpr_sm_e, tpr_sm_e)
roc_display_sm_e = RocCurveDisplay(fpr=fpr_sm_e, tpr=tpr_sm_e)

# Precision-recall curve, with the area taken over recall (x) against precision (y).
prec_sm_e, recall_sm_e, _ = precision_recall_curve(y_test_e, probabilities)
pr_display_sm_e = PrecisionRecallDisplay(precision=prec_sm_e, recall=recall_sm_e)
pr_auc_sm_e = auc(recall_sm_e, prec_sm_e)

print('ROC AUC:', roc_auc_sm_e)
print('Precision-Recall AUC:', pr_auc_sm_e)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_sm_e.plot(ax=ax1)
pr_display_sm_e.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix on the held-out excess-business rows, shown as integer counts.
plot_confusion_matrix(
    clf_xgb,
    X_test_e,
    y_test_e,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Refit a single-tree model (n_estimators=1) on the SMOTE-resampled
# excess-business data so that the first tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model fitted above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.1,
                            learning_rate=0.05,
                            max_depth=5,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            seed=42,
                            n_estimators=1)
# This cell belongs to the excess-business section, so fit on the excess
# (_e) training data; it previously fitted on the primary (_p) data.
clf_xgb.fit(X_train_sm_e, y_train_sm_e)
bst = clf_xgb.get_booster()
# Print every importance measure the booster exposes for the tree's features.
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling. The split-node colour was '#78cbe' (five hex digits), which
# Graphviz does not accept; '#78cbe1' is the intended six-digit light blue.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe1'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


## GAN

In [None]:
def generator_network(x, data_dim, base_n_count):
    """Stack the generator's dense layers on top of the input tensor ``x``.

    Three ReLU layers of width ``base_n_count``, ``2 * base_n_count`` and
    ``4 * base_n_count`` are followed by a linear layer of width ``data_dim``.
    Returns the output tensor of that last layer.
    """
    hidden = x
    for width in (base_n_count, base_n_count*2, base_n_count*4):
        hidden = layers.Dense(width, activation='relu')(hidden)
    # No activation on the output layer: generated features are unbounded.
    return layers.Dense(data_dim)(hidden)

def discriminator_network(x, data_dim, base_n_count):
    """Stack the discriminator's dense layers on top of the input tensor ``x``.

    Three ReLU layers narrow from ``4 * base_n_count`` to ``base_n_count``
    units and feed a single sigmoid unit. ``data_dim`` is accepted but not
    used. Returns the output tensor of the sigmoid layer.
    """
    hidden = x
    # Dropout between the hidden layers was tried and left disabled.
    for width in (base_n_count*4, base_n_count*2, base_n_count):
        hidden = layers.Dense(width, activation='relu')(hidden)
    return layers.Dense(1, activation='sigmoid')(hidden)

In [None]:
def generator_network_w_label(x, labels, data_dim, label_dim, base_n_count):
    """Build the label-conditioned generator on top of ``x`` and ``labels``.

    ``labels`` is concatenated to the input ``x``, passed through three ReLU
    layers of width ``base_n_count``, ``2 * base_n_count`` and
    ``4 * base_n_count`` and a linear layer of width ``data_dim``, and then
    concatenated again to that output. ``label_dim`` is accepted but not used.
    Returns the final concatenated tensor.
    """
    hidden = layers.concatenate([x, labels])
    for multiplier in (1, 2, 4):
        hidden = layers.Dense(base_n_count*multiplier, activation='relu')(hidden)
    generated = layers.Dense(data_dim)(hidden)
    # Re-attach the labels so the output carries the conditioning columns.
    return layers.concatenate([generated, labels])

In [None]:
def get_data_batch(train, batch_size, seed=42):
    """Return batch number ``seed`` of ``train`` as a ``(batch_size, n_columns)`` array.

    ``batch_size * seed`` is treated as a running offset into the data: the
    quotient by ``len(train)`` picks the shuffle (so every pass over the data
    uses one fixed order) and the remainder is where the batch starts within
    that order. Consecutive seeds therefore walk through the rows evenly
    instead of sampling them at random.

    Side effect: reseeds numpy's global random generator.
    """
    n_rows = len(train)
    epoch, offset = divmod(batch_size * seed, n_rows)

    np.random.seed(epoch)
    # A full draw without replacement is a shuffle of the index (redone on every call).
    shuffled = list(np.random.choice(list(train.index), replace=False, size=n_rows))
    # Two back-to-back copies let a batch that starts near the end wrap around.
    wrapped = shuffled * 2
    batch_index = wrapped[offset:offset + batch_size]

    return train.loc[batch_index].values.reshape(batch_size, -1)

In [None]:
def CheckAccuracy( x, g_z, data_cols, label_cols=[], seed=42, with_class=False, data_dim=2 ):
    """Score how easily an XGBoost classifier separates real rows from generated rows.

    The first half of ``x`` (real, label 0) and the first half of ``g_z``
    (generated, label 1) train a booster; the remaining halves form the test
    set. For equally sized inputs, an accuracy near 0.5 means the classifier
    cannot tell the two sources apart.

    Args:
        x: 2-D array of real samples, one row per sample.
        g_z: 2-D array of generated samples with the same columns as ``x``.
        data_cols: Feature names, one per column of ``x``.
        label_cols, seed, with_class, data_dim: Not used by this function;
            kept so that existing calls keep working.

    Returns:
        Accuracy of the booster on the held-out halves.
    """
    n_real_train = len(x) // 2
    n_gen_train = len(g_z) // 2

    # First half of each set trains the classifier, the rest tests it.
    train_features = np.vstack([x[:n_real_train], g_z[:n_gen_train]])
    train_labels = np.hstack([np.zeros(n_real_train), np.ones(n_gen_train)])
    test_features = np.vstack([x[n_real_train:], g_z[n_gen_train:]])
    # Build the test labels from the actual held-out sizes. Reusing the training
    # labels only worked for even-length inputs; an odd length leaves one more
    # test row than training rows and made accuracy_score fail.
    y_true = np.hstack([np.zeros(len(x) - n_real_train),
                        np.ones(len(g_z) - n_gen_train)])

    dtrain = xgb.DMatrix(train_features, train_labels, feature_names=data_cols)
    dtest = xgb.DMatrix(test_features, feature_names=data_cols)

    xgb_params = {
        # 'tree_method': 'hist', # for faster evaluation
        'max_depth': 6,
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'aucpr',
        'scale_pos_weight' : 1 # allows for balanced or unbalanced classes
        }
    xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100) # 100 boosting rounds

    # binary:logistic predicts probabilities; threshold them into 0/1 labels.
    y_pred = (xgb_test.predict(dtest) > 0.5).astype(int)
    return accuracy_score(y_true, y_pred)

In [None]:
def CheckAccuracyConditional( x, g_z, data_cols, label_cols=[], seed=42, with_class=False, data_dim=2 ):
    """Score real-vs-generated separability for samples that include label columns.

    Works like ``CheckAccuracy`` but names the DMatrix features with the data
    columns followed by the label columns. The first half of ``x`` (real,
    label 0) and of ``g_z`` (generated, label 1) train a booster; the
    remaining halves form the test set.

    Args:
        x: 2-D array of real samples, one row per sample.
        g_z: 2-D array of generated samples with the same columns as ``x``.
        data_cols: Names of the data columns.
        label_cols: Names of the label columns that follow the data columns.
        seed, with_class, data_dim: Not used by this function; kept so that
            existing calls keep working.

    Returns:
        Accuracy of the booster on the held-out halves.
    """
    # Derive the feature names from the arguments. The function previously read
    # a global ``data_cols_w_class`` that it never received, so it only worked
    # when that name happened to exist in the kernel.
    feature_names = list(data_cols)
    feature_names += [col for col in label_cols if col not in feature_names]

    n_real_train = len(x) // 2
    n_gen_train = len(g_z) // 2

    # First half of each set trains the classifier, the rest tests it.
    train_features = np.vstack([x[:n_real_train], g_z[:n_gen_train]])
    train_labels = np.hstack([np.zeros(n_real_train), np.ones(n_gen_train)])
    test_features = np.vstack([x[n_real_train:], g_z[n_gen_train:]])
    # Build the test labels from the actual held-out sizes. Reusing the training
    # labels only worked for even-length inputs; an odd length leaves one more
    # test row than training rows and made accuracy_score fail.
    y_true = np.hstack([np.zeros(len(x) - n_real_train),
                        np.ones(len(g_z) - n_gen_train)])

    dtrain = xgb.DMatrix(train_features, train_labels, feature_names=feature_names)
    dtest = xgb.DMatrix(test_features, feature_names=feature_names)

    xgb_params = {
        # 'tree_method': 'hist', # for faster evaluation
        'max_depth': 6,
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'aucpr',
        'scale_pos_weight' : 1 # allows for balanced or unbalanced classes
        }
    xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100) # 100 boosting rounds

    # binary:logistic predicts probabilities; threshold them into 0/1 labels.
    y_pred = (xgb_test.predict(dtest) > 0.5).astype(int)
    return accuracy_score(y_true, y_pred)

#### Primary Business

In [None]:
# Primary-business training features with the target attached as a 'Class'
# column; assign() works on a copy, so X_train_p itself is left unchanged.
X_encoded_w_classes_p = X_train_p.assign(Class=y_train_p)
X_encoded_w_classes_p.head()

In [None]:
# Finding rows with class '1'
train_p = X_encoded_w_classes_p.loc[ X_encoded_w_classes_p.Class == 1 ].copy()
claim_w_classes_p = train_p.copy()
claim_w_classes_p['Class'] = y_train_p
# claim_w_classes_p.head()
train_p = claim_w_classes_p.copy().reset_index(drop=True)
# train_p.head()

label_cols = ['Class']
data_cols = [ i for i in train_p.columns if i not in label_cols ]
train_no_label_p = train_p.copy()
train_no_label_p.drop('Class', axis=1, inplace=True)
# train_no_label_p = train_p[ data_cols ]
train_no_label_p.head()

In [None]:
# rand_dim = len(train_no_label_p.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 32 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 5e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# with_class=False
# data_cols = train_no_label_p.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = 0

# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'GAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# generated_image_tensor = generator_network(generator_input_tensor, data_dim, base_n_count)

# generated_or_real_image_tensor = layers.Input(shape=(data_dim,))
    
# discriminator_output = discriminator_network(generated_or_real_image_tensor, data_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor], outputs=[generated_image_tensor], name='generator')
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model(generator_input_tensor))
# combined_model = models.Model(inputs=[generator_input_tensor], outputs=[combined_output], name='combined')

In [None]:
# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

In [None]:
# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# generator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss='binary_crossentropy')

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1)
    
#     # train the discriminator
#     for j in range(k_d):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         x = get_data_batch(train_no_label_p, batch_size, seed=i+j)
            
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
# #             x = np.vstack([x,g_z]) # code to train the discriminator on real and generated data at the same time, but you have to run network again for separate losses
# #             classes = np.hstack([np.zeros(batch_size),np.ones(batch_size)])
# #             d_l_r = discriminator_model.train_on_batch(x, classes)
            
#         d_l_r = discriminator_model.train_on_batch(x, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         d_l_g = discriminator_model.train_on_batch(g_z, np.random.uniform(low=0.0, high=0.001, size=batch_size)) # 0.0, 0.3 # GANs need noise to prevent loss going to zero
#             # d_l_r = discriminator_model.train_on_batch(x, np.ones(batch_size)) # without noise
#             # d_l_g = discriminator_model.train_on_batch(g_z, np.zeros(batch_size)) # without noise
#     disc_loss_real.append(d_l_r)
#     disc_loss_generated.append(d_l_g)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             # loss = combined_model.train_on_batch([z, labels], np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch([z, labels], np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         else:
#             # loss = combined_model.train_on_batch(z, np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch(z, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_p['Class']==1) - 1 # test using all of the actual fraud data - 1
#         x = get_data_batch(train_no_label_p, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=42, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
    
#     # Saving weights and plotting images
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries      
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )    

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_p) - len(X_train_p)
# new_z = np.random.normal(size=(samples,rand_dim))
# new_g_z = generator_model.predict(new_z)

In [None]:
# gan_samples_p = pd.DataFrame(new_g_z, columns=data_cols)
# gan_samples_p.to_csv('dandoww_gan_primary.csv',index=False)

In [None]:
# Load the saved primary-business GAN samples from the input directory
# (the commented-out cell above is the one that writes this CSV).
gan_samples_p = pd.read_csv('../input/project/dandoww_gan_primary.csv')
gan_samples_p.head()

In [None]:
# Stack the GAN samples under the real training rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_gan_p = pd.concat([X_train_p, gan_samples_p], ignore_index=True)
X_gan_p.describe()

In [None]:
# Summary statistics of the original training features, for comparison with
# the GAN-augmented frame described in the previous cell.
X_train_p.describe()

In [None]:
# Every GAN sample is a positive example, so extend the target with one 1.0
# per generated row.
ones = np.ones((len(gan_samples_p),))
one = pd.Series(ones)
# pd.concat replaces Series.append, which was removed in pandas 2.0.
y_gan_p = pd.concat([y_train_p, one], ignore_index=True)
# y_gan_p

#### Excess Business

In [None]:
# Excess-business training features with the target attached as a 'Class'
# column; assign() works on a copy, so X_train_e itself is left unchanged.
X_encoded_w_classes_e = X_train_e.assign(Class=y_train_e)
X_encoded_w_classes_e.head()

In [None]:
# Finding rows with class '1'
train_e = X_encoded_w_classes_e.loc[ X_encoded_w_classes_e.Class == 1 ].copy()
claim_w_classes_e = train_e.copy()
claim_w_classes_e['Class'] = y_train_e
# claim_w_classes_e.head()
train_e = claim_w_classes_e.copy().reset_index(drop=True)
# train_e.head()

label_cols = ['Class']
data_cols = [ i for i in train_e.columns if i not in label_cols ]
train_no_label_e = train_e.copy()
train_no_label_e.drop('Class', axis=1, inplace=True)
# train_no_label_e = train_e[ data_cols ]
train_no_label_e.head()

In [None]:
# rand_dim = len(train_no_label_e.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 128 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 5e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# with_class=False
# data_cols = train_no_label_e.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = 0

# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'GAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# generated_image_tensor = generator_network(generator_input_tensor, data_dim, base_n_count)

# generated_or_real_image_tensor = layers.Input(shape=(data_dim,))
    
# discriminator_output = discriminator_network(generated_or_real_image_tensor, data_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor], outputs=[generated_image_tensor], name='generator')
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model(generator_input_tensor))
# combined_model = models.Model(inputs=[generator_input_tensor], outputs=[combined_output], name='combined')

In [None]:
# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

In [None]:
# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# generator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss='binary_crossentropy')

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1)
    
#     # train the discriminator
#     for j in range(k_d):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         x = get_data_batch(train_no_label_e, batch_size, seed=i+j)
            
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
# #             x = np.vstack([x,g_z]) # code to train the discriminator on real and generated data at the same time, but you have to run network again for separate losses
# #             classes = np.hstack([np.zeros(batch_size),np.ones(batch_size)])
# #             d_l_r = discriminator_model.train_on_batch(x, classes)
            
#         d_l_r = discriminator_model.train_on_batch(x, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         d_l_g = discriminator_model.train_on_batch(g_z, np.random.uniform(low=0.0, high=0.001, size=batch_size)) # 0.0, 0.3 # GANs need noise to prevent loss going to zero
#             # d_l_r = discriminator_model.train_on_batch(x, np.ones(batch_size)) # without noise
#             # d_l_g = discriminator_model.train_on_batch(g_z, np.zeros(batch_size)) # without noise
#     disc_loss_real.append(d_l_r)
#     disc_loss_generated.append(d_l_g)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             # loss = combined_model.train_on_batch([z, labels], np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch([z, labels], np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         else:
#             # loss = combined_model.train_on_batch(z, np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch(z, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_e['Class']==1)  # test using all of the actual fraud data 
#         x = get_data_batch(train_no_label_e, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=42, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
    
#     # Saving weights and plotting images
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries      
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )    

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_e) - len(X_train_e)
# new_z = np.random.normal(size=(samples,rand_dim))
# new_g_z = generator_model.predict(new_z)

In [None]:
# gan_samples_e = pd.DataFrame(new_g_z, columns=data_cols)
# gan_samples_e.to_csv('dandoww_gan_excess.csv',index=False)

In [None]:
# Load the saved excess-business GAN samples from the input directory
# (the commented-out cell above is the one that writes this CSV).
gan_samples_e = pd.read_csv('../input/project/dandoww_gan_excess.csv')
gan_samples_e.head()

In [None]:
# Stack the GAN samples under the real training rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
X_gan_e = pd.concat([X_train_e, gan_samples_e], ignore_index=True)
X_gan_e.describe()

In [None]:
# Summary statistics of the original training features, for comparison with
# the GAN-augmented frame described in the previous cell.
X_train_e.describe()

In [None]:
# Every GAN sample is a positive example, so extend the target with one 1.0
# per generated row.
ones = np.ones((len(gan_samples_e),))
one = pd.Series(ones)
# pd.concat replaces Series.append, which was removed in pandas 2.0.
y_gan_e = pd.concat([y_train_e, one], ignore_index=True)
# y_gan

### XGBoost (GAN)

#### Primary Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the classes are imbalanced.
# # Each tree sees a random 90% of the rows and 50% of the columns, which
# # speeds up training and helps prevent overfitting.
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_gan_p, y_gan_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier trained on the GAN-augmented primary-business data.
# NOTE(review): early stopping monitors the same test split that is scored in the
# cells below, so the reported metrics are likely optimistic — consider a separate
# validation split.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.01,
    max_depth=4,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb.fit(
    X_gan_p,
    y_gan_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report what early stopping recorded for the fit above: the best value of the
# monitored eval metric and the boosting iteration at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# make predictions for test data
y_pred = clf_xgb.predict(X_test_p)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)
# ROC AUC is a ranking metric, so it is scored on the positive-class probability.
# Feeding it thresholded 0/1 labels (as before) collapses the curve to a single
# operating point and under-reports the model's ranking quality.
roc_auc = roc_auc_score(y_test_p, clf_xgb.predict_proba(X_test_p)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities (column 1) drive both threshold-free curves.
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:, 1]

# Curve coordinates; the returned thresholds are discarded.
fpr_gan_p, tpr_gan_p, _ = roc_curve(y_test_p, probabilities)
prec_gan_p, recall_gan_p, _ = precision_recall_curve(y_test_p, probabilities)

# Area under each curve.
roc_auc_gan_p = auc(fpr_gan_p, tpr_gan_p)
pr_auc_gan_p = auc(recall_gan_p, prec_gan_p)
print('ROC AUC:', roc_auc_gan_p)
print('Precision-Recall AUC:', pr_auc_gan_p)

# ROC curve on the left axis, precision-recall curve on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_gan_p = RocCurveDisplay(fpr=fpr_gan_p, tpr=tpr_gan_p)
pr_display_gan_p = PrecisionRecallDisplay(precision=prec_gan_p, recall=recall_gan_p)
roc_display_gan_p.plot(ax=ax1)
pr_display_gan_p.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the current clf_xgb on the primary-business test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this cell
# needs an older scikit-learn (as does the notebook's import cell).
plot_confusion_matrix(
    estimator=clf_xgb,
    X=X_test_p,
    y_true=y_test_p,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) on the GAN-augmented primary data so that
# its single tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model evaluated above.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.01,
    max_depth=4,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    n_estimators=1,
)
clf_xgb.fit(X_gan_p, y_gan_p)

# Print the booster's feature scores under five importance types.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling for split (condition) nodes and leaf nodes.
# NOTE(review): '#78cbe' has only five hex digits — confirm the intended colour.
node_params = dict(shape='box', style='filled, rounded', fillcolor='#78cbe')
leaf_params = dict(shape='box', style='filled', fillcolor='#e48038')

xgb.to_graphviz(
    clf_xgb,
    num_trees=0,
    size="10,10",
    condition_node_params=node_params,
    leaf_node_params=leaf_params,
)


#### Excess Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the data is imbalanced
# # Use a random 90% subset of the rows and, for each tree, a random 50% of the columns to
# # improve speed and reduce overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_gan_e, y_gan_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier trained on the GAN-augmented excess-business data.
# NOTE(review): early stopping monitors the same test split that is scored in the
# cells below, so the reported metrics are likely optimistic — consider a separate
# validation split.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=5,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb.fit(
    X_gan_e,
    y_gan_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report what early stopping recorded for the fit above: the best value of the
# monitored eval metric and the boosting iteration at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# make predictions for test data
y_pred = clf_xgb.predict(X_test_e)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)
# ROC AUC is a ranking metric, so it is scored on the positive-class probability.
# Feeding it thresholded 0/1 labels (as before) collapses the curve to a single
# operating point and under-reports the model's ranking quality.
roc_auc = roc_auc_score(y_test_e, clf_xgb.predict_proba(X_test_e)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities (column 1) drive both threshold-free curves.
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:, 1]

# Curve coordinates; the returned thresholds are discarded.
fpr_gan_e, tpr_gan_e, _ = roc_curve(y_test_e, probabilities)
prec_gan_e, recall_gan_e, _ = precision_recall_curve(y_test_e, probabilities)

# Area under each curve.
roc_auc_gan_e = auc(fpr_gan_e, tpr_gan_e)
pr_auc_gan_e = auc(recall_gan_e, prec_gan_e)
print('ROC AUC:', roc_auc_gan_e)
print('Precision-Recall AUC:', pr_auc_gan_e)

# ROC curve on the left axis, precision-recall curve on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_gan_e = RocCurveDisplay(fpr=fpr_gan_e, tpr=tpr_gan_e)
pr_display_gan_e = PrecisionRecallDisplay(precision=prec_gan_e, recall=recall_gan_e)
roc_display_gan_e.plot(ax=ax1)
pr_display_gan_e.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the current clf_xgb on the excess-business test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this cell
# needs an older scikit-learn (as does the notebook's import cell).
plot_confusion_matrix(
    estimator=clf_xgb,
    X=X_test_e,
    y_true=y_test_e,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) on the GAN-augmented excess data so that
# its single tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model evaluated above.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=5,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    n_estimators=1,
)
clf_xgb.fit(X_gan_e, y_gan_e)

# Print the booster's feature scores under five importance types.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling for split (condition) nodes and leaf nodes.
# NOTE(review): '#78cbe' has only five hex digits — confirm the intended colour.
node_params = dict(shape='box', style='filled, rounded', fillcolor='#78cbe')
leaf_params = dict(shape='box', style='filled', fillcolor='#e48038')

xgb.to_graphviz(
    clf_xgb,
    num_trees=0,
    size="10,10",
    condition_node_params=node_params,
    leaf_node_params=leaf_params,
)


## cGAN

#### Primary Business

In [None]:
# rand_dim = len(train_no_label_p.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 32 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 5e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# with_class = True
# data_cols = train_no_label_p.columns
# data_cols_w_class = train_p.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)

# label_dim = 0
# label_dim = len(label_cols)
# print('label_dim: ', label_dim)
# print('label_cols: ', label_cols)
    

# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'CGAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# labels_tensor = layers.Input(shape=(label_dim,)) # updated for class
# generated_image_tensor = generator_network_w_label(generator_input_tensor, labels_tensor, data_dim, label_dim, base_n_count) # updated for class

# generated_or_real_image_tensor = layers.Input(shape=(data_dim + label_dim,)) # updated for class
    
# discriminator_output = discriminator_network(generated_or_real_image_tensor, data_dim + label_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[generated_image_tensor], name='generator') # updated for class
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model([generator_input_tensor, labels_tensor])) # updated for class
# combined_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[combined_output], name='combined') # updated for class

In [None]:
# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# generator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss='binary_crossentropy')

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1)
    
#     # train the discriminator
#     for j in range(k_d):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         x = get_data_batch(train_p, batch_size, seed=i+j)
            
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
# #             print(g_z)
#         else:
#             g_z = generator_model.predict(z)
# #             x = np.vstack([x,g_z]) # code to train the discriminator on real and generated data at the same time, but you have to run network again for separate losses
# #             classes = np.hstack([np.zeros(batch_size),np.ones(batch_size)])
# #             d_l_r = discriminator_model.train_on_batch(x, classes)
            
#         d_l_r = discriminator_model.train_on_batch(x, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         d_l_g = discriminator_model.train_on_batch(g_z, np.random.uniform(low=0.0, high=0.001, size=batch_size)) # 0.0, 0.3 # GANs need noise to prevent loss going to zero
# #             # d_l_r = discriminator_model.train_on_batch(x, np.ones(batch_size)) # without noise
# #             # d_l_g = discriminator_model.train_on_batch(g_z, np.zeros(batch_size)) # without noise
#     disc_loss_real.append(d_l_r)
#     disc_loss_generated.append(d_l_g)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             # loss = combined_model.train_on_batch([z, labels], np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch([z, labels], np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         else:
#             # loss = combined_model.train_on_batch(z, np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch(z, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_p['Class']==1) - 1 # test using all of the actual fraud data - 1
#         x = get_data_batch(train_p, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracyConditional( x, g_z, data_cols, label_cols, seed=42, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
    
#     # Saving weights and plotting images
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries      
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )    

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_p) - len(X_train_p)
# new_z = np.random.normal(size=(samples,rand_dim))
# labels_z = np.ones((samples,1))
# new_g_z = generator_model.predict([new_z, labels_z])

In [None]:
# cgan_samples_p = pd.DataFrame(new_g_z, columns=train_p.columns)
# # Saving the dataset
# cgan_samples_p.to_csv('dandoww_cgan_primary.csv',index=False)

In [None]:
# Loading the dataset
# (previously exported cGAN samples for the primary business; the generation and
# export cells above are commented out)
cgan_samples_p = pd.read_csv('../input/project/dandoww_cgan_primary.csv')
cgan_samples_p.head()

In [None]:
# Remove the 'Class' column stored in the CSV so only feature columns remain.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell idempotent:
# re-running it no longer raises KeyError once 'Class' has already been dropped.
cgan_samples_p = cgan_samples_p.drop(columns='Class', errors='ignore')
cgan_samples_p.head()

In [None]:
# Stack the synthetic cGAN rows underneath the real primary-business training features.
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the resulting frame is the same.
X_cgan_p = pd.concat([X_train_p, cgan_samples_p], ignore_index=True)
X_cgan_p.describe()

In [None]:
# Summary statistics of the real (non-augmented) primary-business training features.
X_train_p.describe()

In [None]:
# Every cGAN sample is labelled 1, so build one positive label per synthetic row
# and stack those labels underneath the real training labels.
ones = np.ones((len(cgan_samples_p),))
one = pd.Series(ones)
# pd.concat replaces Series.append (deprecated in pandas 1.4, removed in 2.0).
# assumes y_train_p is a pandas Series — TODO confirm
y_cgan_p = pd.concat([y_train_p, one], ignore_index=True)
# y_cgan_p

#### Excess Business

In [None]:
# rand_dim = len(train_no_label_e.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 128 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 5e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# with_class = True
# data_cols = train_no_label_e.columns
# data_cols_w_class = train_e.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)

# label_dim = 0
# label_dim = len(label_cols)
# print('label_dim: ', label_dim)
# print('label_cols: ', label_cols)
    

# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'CGAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# labels_tensor = layers.Input(shape=(label_dim,)) # updated for class
# generated_image_tensor = generator_network_w_label(generator_input_tensor, labels_tensor, data_dim, label_dim, base_n_count) # updated for class

# generated_or_real_image_tensor = layers.Input(shape=(data_dim + label_dim,)) # updated for class
    
# discriminator_output = discriminator_network(generated_or_real_image_tensor, data_dim + label_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[generated_image_tensor], name='generator') # updated for class
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model([generator_input_tensor, labels_tensor])) # updated for class
# combined_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[combined_output], name='combined') # updated for class

In [None]:
# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# generator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.compile(optimizer=adam, loss='binary_crossentropy')
# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss='binary_crossentropy')

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1)
    
#     # train the discriminator
#     for j in range(k_d):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         x = get_data_batch(train_e, batch_size, seed=i+j)
            
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
# #             print(g_z)
#         else:
#             g_z = generator_model.predict(z)
# #             x = np.vstack([x,g_z]) # code to train the discriminator on real and generated data at the same time, but you have to run network again for separate losses
# #             classes = np.hstack([np.zeros(batch_size),np.ones(batch_size)])
# #             d_l_r = discriminator_model.train_on_batch(x, classes)
            
#         d_l_r = discriminator_model.train_on_batch(x, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         d_l_g = discriminator_model.train_on_batch(g_z, np.random.uniform(low=0.0, high=0.001, size=batch_size)) # 0.0, 0.3 # GANs need noise to prevent loss going to zero
# #             # d_l_r = discriminator_model.train_on_batch(x, np.ones(batch_size)) # without noise
# #             # d_l_g = discriminator_model.train_on_batch(g_z, np.zeros(batch_size)) # without noise
#     disc_loss_real.append(d_l_r)
#     disc_loss_generated.append(d_l_g)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             # loss = combined_model.train_on_batch([z, labels], np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch([z, labels], np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#         else:
#             # loss = combined_model.train_on_batch(z, np.ones(batch_size)) # without noise
#             loss = combined_model.train_on_batch(z, np.random.uniform(low=0.999, high=1.0, size=batch_size)) # 0.7, 1.2 # GANs need noise to prevent loss going to zero
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_e['Class']==1) # test using all of the actual fraud data
#         x = get_data_batch(train_e, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracyConditional( x, g_z, data_cols, label_cols, seed=42, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
    
#     # Saving weights and plotting images
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries      
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )    

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_e) - len(X_train_e)
# new_z = np.random.normal(size=(samples,rand_dim))
# labels_z = np.ones((samples,1))
# new_g_z = generator_model.predict([new_z, labels_z])

In [None]:
# cgan_samples_e = pd.DataFrame(new_g_z, columns=train_e.columns)
# # Saving the dataset
# cgan_samples_e.to_csv('dandoww_cgan_excess.csv',index=False)

In [None]:
# Loading the dataset
# (previously exported cGAN samples for the excess business; the generation and
# export cells above are commented out)
cgan_samples_e = pd.read_csv('../input/project/dandoww_cgan_excess.csv')
cgan_samples_e.head()

In [None]:
# Remove the 'Class' column stored in the CSV so only feature columns remain.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell idempotent:
# re-running it no longer raises KeyError once 'Class' has already been dropped.
cgan_samples_e = cgan_samples_e.drop(columns='Class', errors='ignore')
cgan_samples_e.head()

In [None]:
# Stack the synthetic cGAN rows underneath the real excess-business training features.
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the resulting frame is the same.
X_cgan_e = pd.concat([X_train_e, cgan_samples_e], ignore_index=True)
X_cgan_e.describe()

In [None]:
# Summary statistics of the real (non-augmented) excess-business training features.
X_train_e.describe()

In [None]:
# Every cGAN sample is labelled 1, so build one positive label per synthetic row
# and stack those labels underneath the real training labels.
ones = np.ones((len(cgan_samples_e),))
one = pd.Series(ones)
# pd.concat replaces Series.append (deprecated in pandas 1.4, removed in 2.0).
# assumes y_train_e is a pandas Series — TODO confirm
y_cgan_e = pd.concat([y_train_e, one], ignore_index=True)
# y_cgan_e

### XGBoost (cGAN)

#### Primary Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the data is imbalanced
# # Use a random 90% subset of the rows and, for each tree, a random 50% of the columns to
# # improve speed and reduce overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_cgan_p, y_cgan_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier trained on the cGAN-augmented primary-business data.
# NOTE(review): early stopping monitors the same test split that is scored in the
# cells below, so the reported metrics are likely optimistic — consider a separate
# validation split.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.01,
    max_depth=4,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb.fit(
    X_cgan_p,
    y_cgan_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report what early stopping recorded for the fit above: the best value of the
# monitored eval metric and the boosting iteration at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# make predictions for test data
y_pred = clf_xgb.predict(X_test_p)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)
# ROC AUC is a ranking metric, so it is scored on the positive-class probability.
# Feeding it thresholded 0/1 labels (as before) collapses the curve to a single
# operating point and under-reports the model's ranking quality.
roc_auc = roc_auc_score(y_test_p, clf_xgb.predict_proba(X_test_p)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities (column 1) drive both threshold-free curves.
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:, 1]

# Curve coordinates; the returned thresholds are discarded.
fpr_cgan_p, tpr_cgan_p, _ = roc_curve(y_test_p, probabilities)
prec_cgan_p, recall_cgan_p, _ = precision_recall_curve(y_test_p, probabilities)

# Area under each curve.
roc_auc_cgan_p = auc(fpr_cgan_p, tpr_cgan_p)
pr_auc_cgan_p = auc(recall_cgan_p, prec_cgan_p)
print('ROC AUC:', roc_auc_cgan_p)
print('Precision-Recall AUC:', pr_auc_cgan_p)

# ROC curve on the left axis, precision-recall curve on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_cgan_p = RocCurveDisplay(fpr=fpr_cgan_p, tpr=tpr_cgan_p)
pr_display_cgan_p = PrecisionRecallDisplay(precision=prec_cgan_p, recall=recall_cgan_p)
roc_display_cgan_p.plot(ax=ax1)
pr_display_cgan_p.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the current clf_xgb on the primary-business test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this cell
# needs an older scikit-learn (as does the notebook's import cell).
plot_confusion_matrix(
    estimator=clf_xgb,
    X=X_test_p,
    y_true=y_test_p,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) on the cGAN-augmented primary data so that
# its single tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model evaluated above.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.01,
    max_depth=4,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    n_estimators=1,
)
clf_xgb.fit(X_cgan_p, y_cgan_p)

# Print the booster's feature scores under five importance types.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling for split (condition) nodes and leaf nodes.
# NOTE(review): '#78cbe' has only five hex digits — confirm the intended colour.
node_params = dict(shape='box', style='filled, rounded', fillcolor='#78cbe')
leaf_params = dict(shape='box', style='filled', fillcolor='#e48038')

xgb.to_graphviz(
    clf_xgb,
    num_trees=0,
    size="10,10",
    condition_node_params=node_params,
    leaf_node_params=leaf_params,
)


#### Excess Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC because the data is imbalanced
# # Use a random 90% subset of the rows and, for each tree, a random 50% of the columns to
# # improve speed and reduce overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_cgan_e, y_cgan_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier trained on the cGAN-augmented excess-business data.
# NOTE(review): early stopping monitors the same test split that is scored in the
# cells below, so the reported metrics are likely optimistic — consider a separate
# validation split.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=10,
    scale_pos_weight=2,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb.fit(
    X_cgan_e,
    y_cgan_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report what early stopping recorded for the fit above: the best value of the
# monitored eval metric and the boosting iteration at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# make predictions for test data
y_pred = clf_xgb.predict(X_test_e)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)
# ROC AUC is a ranking metric, so it is scored on the positive-class probability.
# Feeding it thresholded 0/1 labels (as before) collapses the curve to a single
# operating point and under-reports the model's ranking quality.
roc_auc = roc_auc_score(y_test_e, clf_xgb.predict_proba(X_test_e)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities (column 1) drive both threshold-free curves.
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:, 1]

# Curve coordinates; the returned thresholds are discarded.
fpr_cgan_e, tpr_cgan_e, _ = roc_curve(y_test_e, probabilities)
prec_cgan_e, recall_cgan_e, _ = precision_recall_curve(y_test_e, probabilities)

# Area under each curve.
roc_auc_cgan_e = auc(fpr_cgan_e, tpr_cgan_e)
pr_auc_cgan_e = auc(recall_cgan_e, prec_cgan_e)
print('ROC AUC:', roc_auc_cgan_e)
print('Precision-Recall AUC:', pr_auc_cgan_e)

# ROC curve on the left axis, precision-recall curve on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_cgan_e = RocCurveDisplay(fpr=fpr_cgan_e, tpr=tpr_cgan_e)
pr_display_cgan_e = PrecisionRecallDisplay(precision=prec_cgan_e, recall=recall_cgan_e)
roc_display_cgan_e.plot(ax=ax1)
pr_display_cgan_e.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the current clf_xgb on the excess-business test split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this cell
# needs an older scikit-learn (as does the notebook's import cell).
plot_confusion_matrix(
    estimator=clf_xgb,
    X=X_test_e,
    y_true=y_test_e,
    display_labels=['Non-claim', 'Claim'],
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) on the cGAN-augmented excess data so that
# its single tree can be inspected and drawn.
# Note: this rebinds clf_xgb, replacing the early-stopped model evaluated above.
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=10,
    scale_pos_weight=2,
    seed=42,
    n_estimators=1,
)
clf_xgb.fit(X_cgan_e, y_cgan_e)

# Print the booster's feature scores under five importance types.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling for split (condition) nodes and leaf nodes.
# NOTE(review): '#78cbe' has only five hex digits — confirm the intended colour.
node_params = dict(shape='box', style='filled, rounded', fillcolor='#78cbe')
leaf_params = dict(shape='box', style='filled', fillcolor='#e48038')

xgb.to_graphviz(
    clf_xgb,
    num_trees=0,
    size="10,10",
    condition_node_params=node_params,
    leaf_node_params=leaf_params,
)


## WGAN

In [None]:
def critic_network(x, data_dim, base_n_count):
    """Build the critic head: three ReLU Dense layers followed by a linear scalar output.

    Args:
        x: input tensor the layers are applied to.
        data_dim: not used in the body; present in the signature only.
        base_n_count: base layer width; the hidden layers have 4x, 2x and 1x
            this many units, in that order.

    Returns:
        The output tensor of the final ``Dense(1)`` layer (no activation).
    """
    hidden = x
    for width_multiplier in (4, 2, 1):
        hidden = layers.Dense(base_n_count * width_multiplier, activation='relu')(hidden)
    # The last layer has no activation, so the score is unbounded rather than a
    # sigmoid probability.
    return layers.Dense(1)(hidden)

In [None]:
def em_loss(y_coefficients, y_pred):
    """Earth-mover (Wasserstein) loss: mean of the coefficient-weighted critic output.

    Kept as a standalone function so it can be handed to an optimizer as the
    loss of the WGAN models.

    Args:
        y_coefficients: per-sample weights multiplied element-wise with ``y_pred``.
        y_pred: critic network outputs.

    Returns:
        Scalar tensor: the mean of ``y_coefficients * y_pred``.
    """
    weighted_scores = tf.multiply(y_coefficients, y_pred)
    return tf.reduce_mean(weighted_scores)

In [None]:
def train_discriminator_step_p(step, seed=42):
    """Run one discriminator update on a batch of primary-business data.

    On the first step (``step == 0``) all TensorFlow global variables are
    initialised in the notebook-level session before the update runs.

    Relies on notebook globals: ``sess``, ``with_class``, ``batch_size``,
    ``rand_dim``, ``label_dim``, ``train_p`` / ``train_no_label_p``,
    ``get_data_batch``, the feed targets ``_z``, ``_x``, ``_labels``, ``epsilon``
    and the graph nodes ``_disc_loss_generated``, ``_disc_loss_real``,
    ``disc_optimizer``.

    Args:
        step: index of the training step; 0 triggers variable initialisation.
        seed: seed forwarded to ``get_data_batch``.

    Returns:
        Tuple ``(d_l_g, d_l_r)``: the values fetched for ``_disc_loss_generated``
        and ``_disc_loss_real``.
    """
    if step == 0:
        sess.run(tf.global_variables_initializer())

    # Draw order (noise, real batch, epsilon) is the same as before.
    feed_dict = {_z: np.random.normal(size=(batch_size, rand_dim))}
    if with_class:
        # Sample the real batch once and slice the labels from it. The previous
        # version sampled twice, which doubled the work and kept features and
        # labels aligned only if get_data_batch is fully seed-deterministic.
        batch = get_data_batch(train_p, batch_size, seed=seed)
        feed_dict[_x] = batch
        feed_dict[_labels] = batch[:, -label_dim:]
    else:
        feed_dict[_x] = get_data_batch(train_no_label_p, batch_size, seed=seed)
    feed_dict[epsilon] = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))

    d_l_g, d_l_r, _ = sess.run(
        [_disc_loss_generated, _disc_loss_real, disc_optimizer],
        feed_dict=feed_dict,
    )
    return d_l_g, d_l_r

In [None]:
def train_discriminator_step_e(step, seed=42):
    """Run one critic (discriminator) update on the excess-business data.

    Relies on notebook-level globals set up by the GAN construction cells:
    ``sess``, ``with_class``, the placeholders ``_z``, ``_x``, ``_labels`` and
    ``epsilon``, the ops ``_disc_loss_generated``, ``_disc_loss_real`` and
    ``disc_optimizer``, plus ``batch_size``, ``rand_dim``, ``label_dim``,
    ``train_e`` / ``train_no_label_e`` and ``get_data_batch``.

    Args:
        step: Training step index.  When it is ``0`` all TensorFlow global
            variables are (re-)initialised before the update is run.
        seed: Seed forwarded to ``get_data_batch`` when drawing the real batch.

    Returns:
        Tuple ``(d_l_g, d_l_r)``: the evaluated critic loss on generated data
        and on real data for this batch.
    """
    # Initialise every TF variable, but only on the very first step.
    if step == 0:
        sess.run(tf.global_variables_initializer())

    # Conditional GAN trains on the labelled frame, plain WGAN on features only.
    real_frame = train_e if with_class else train_no_label_e

    # Entries are built in a fixed order so the NumPy RNG is consumed
    # in the same sequence on every call.
    feed = {
        _z: np.random.normal(size=(batch_size, rand_dim)),
        _x: get_data_batch(real_frame, batch_size, seed=seed),
    }
    if with_class:
        # Same seed as the real batch; the trailing columns are the labels.
        feed[_labels] = get_data_batch(train_e, batch_size, seed=seed)[:, -label_dim:]
    # Interpolation coefficients for the gradient penalty.
    feed[epsilon] = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 1))

    d_l_g, d_l_r, _ = sess.run(
        [_disc_loss_generated, _disc_loss_real, disc_optimizer],
        feed_dict=feed,
    )
    return d_l_g, d_l_r

#### Primary Business

In [None]:
# Primary training features with the target attached as a 'Class' column
# (X_train_p itself is left untouched).
X_encoded_w_classes_p = X_train_p.assign(Class=y_train_p)
X_encoded_w_classes_p.head()

In [None]:
# Finding rows with class '1'
# Keep only the claim rows (Class == 1); these are the samples the GAN learns from.
train_p = X_encoded_w_classes_p[X_encoded_w_classes_p['Class'] == 1].copy()
claim_w_classes_p = train_p.copy()
claim_w_classes_p['Class'] = y_train_p  # re-attach labels (aligned on the index)
train_p = claim_w_classes_p.reset_index(drop=True)

# Split the column names into label and feature columns.
label_cols = ['Class']
data_cols = [col for col in train_p.columns if col not in label_cols]

# Feature-only view of the claim rows, used by the unconditional WGAN.
train_no_label_p = train_p.drop(columns='Class')
train_no_label_p.head()

In [None]:
# rand_dim = len(train_no_label_p.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 32 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 1e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# data_cols = train_no_label_p.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = 0
# with_class = False
# label_cols = []
    
# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'WGAN'
# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# generated_image_tensor = generator_network(generator_input_tensor, data_dim, base_n_count)

# generated_or_real_image_tensor = layers.Input(shape=(data_dim,))
# discriminator_output = critic_network(generated_or_real_image_tensor, data_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor], outputs=[generated_image_tensor], name='generator')
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                        outputs=[discriminator_output],
#                                        name='discriminator')

# combined_output = discriminator_model(generator_model(generator_input_tensor))
# combined_model = models.Model(inputs=[generator_input_tensor], outputs=[combined_output], name='combined')

In [None]:
# _z = tf.placeholder(tf.float32, shape=(batch_size, rand_dim))
    
# _labels = None    

# _x = tf.placeholder(tf.float32, shape=(batch_size, data_dim))
# _g_z = generator_model(_z)

# epsilon = tf.placeholder(tf.float32, shape=(batch_size, 1))
    
# x_hat = epsilon * _x + (1.0 - epsilon) * _g_z
# gradients = tf.gradients(discriminator_model(x_hat), [x_hat])
# _gradient_penalty = 10.0 * tf.square(tf.norm(gradients[0], ord=2) - 1.0)

# # calculate discriminator's loss
# _disc_loss_generated = em_loss(tf.ones(batch_size), discriminator_model(_g_z))
# _disc_loss_real = em_loss(tf.ones(batch_size), discriminator_model(_x))
# _disc_loss = _disc_loss_generated - _disc_loss_real + _gradient_penalty

# # update f by taking an SGD step on mini-batch loss LD(f)
# disc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, beta2=0.9).minimize(_disc_loss, var_list=discriminator_model.trainable_weights)

# sess = K.get_session()

# # compile models

# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss=[em_loss])

# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# print('pre-training the critic...')
# K.set_learning_phase(1) # 1 = train
# for i in range(critic_pre_train_steps):
#     if i%20==0:
#         print('Step: {} of {} critic pre-training.'.format(i, critic_pre_train_steps))
#         loss = train_discriminator_step_p(step=0,seed=i)

# print('Last batch of critic pre-training disc_loss: {}.'.format(loss))

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1) # 1 = train
    
#     # train the discriminator
#     for j in range(k_d):
#         d_l_g, d_l_r = train_discriminator_step_p(step=i+1,seed=i+j)
#     disc_loss_generated.append(d_l_g)
#     disc_loss_real.append(d_l_r)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             labels = get_data_batch(train_p, batch_size, seed=i+j)[:,-label_dim:] # updated for class
#             loss = combined_model.train_on_batch([z, labels], [-np.ones(batch_size)]) # updated for class
#         else:
#             loss = combined_model.train_on_batch(z, [-np.ones(batch_size)])
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_p['Class']==1) - 1 # test using all of the actual claim data - 1
#         x = get_data_batch(train_no_label_p, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
        
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         # K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries   
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_p) - len(X_train_p)
# new_z = np.random.normal(size=(samples,rand_dim))
# new_g_z = generator_model.predict(new_z)

In [None]:
# wgan_samples_p = pd.DataFrame(new_g_z, columns=data_cols)
# # Saving the dataset
# wgan_samples_p.to_csv('dandoww_wgan_primary.csv',index=False)

In [None]:
# Load the pre-generated WGAN samples for the primary business from CSV.
# The commented-out cells above show how this file was produced
# (generator_model.predict -> to_csv('dandoww_wgan_primary.csv')).
wgan_samples_p = pd.read_csv('../input/project/dandoww_wgan_primary.csv')
wgan_samples_p.head()

In [None]:
# Augment the primary training features with the WGAN-generated rows.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x; the result (fresh 0..n-1 index) is the same.
X_wgan_p = pd.concat([X_train_p, wgan_samples_p], ignore_index=True)
X_wgan_p.describe()

In [None]:
# Summary statistics of the original (un-augmented) primary training features,
# for comparison with X_wgan_p.describe() in the previous cell.
X_train_p.describe()

In [None]:
# Every generated row is a synthetic claim, so its label is 1.
ones = np.ones((len(wgan_samples_p),))
one = pd.Series(ones)
# pd.concat replaces Series.append, which is deprecated and no longer
# available in pandas 2.x; row order matches X_wgan_p (real rows, then generated).
y_wgan_p = pd.concat([y_train_p, one], ignore_index=True)
# y_wgan_p

#### Excess Business

In [None]:
# Excess training features with the target attached as a 'Class' column
# (X_train_e itself is left untouched).
X_encoded_w_classes_e = X_train_e.assign(Class=y_train_e)
X_encoded_w_classes_e.head()

In [None]:
# Finding rows with class '1'
# Keep only the claim rows (Class == 1); these are the samples the GAN learns from.
train_e = X_encoded_w_classes_e[X_encoded_w_classes_e['Class'] == 1].copy()
claim_w_classes_e = train_e.copy()
claim_w_classes_e['Class'] = y_train_e  # re-attach labels (aligned on the index)
train_e = claim_w_classes_e.reset_index(drop=True)

# Split the column names into label and feature columns.
label_cols = ['Class']
data_cols = [col for col in train_e.columns if col not in label_cols]

# Feature-only view of the claim rows, used by the unconditional WGAN.
train_no_label_e = train_e.drop(columns='Class')
train_no_label_e.head()

In [None]:
# rand_dim = len(train_no_label_e.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 128 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 1e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# data_cols = train_no_label_e.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = 0
# with_class = False
# label_cols = []
    
# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'WGAN'
# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# generated_image_tensor = generator_network(generator_input_tensor, data_dim, base_n_count)

# generated_or_real_image_tensor = layers.Input(shape=(data_dim,))
# discriminator_output = critic_network(generated_or_real_image_tensor, data_dim, base_n_count)

# generator_model = models.Model(inputs=[generator_input_tensor], outputs=[generated_image_tensor], name='generator')
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                        outputs=[discriminator_output],
#                                        name='discriminator')

# combined_output = discriminator_model(generator_model(generator_input_tensor))
# combined_model = models.Model(inputs=[generator_input_tensor], outputs=[combined_output], name='combined')

In [None]:
# _z = tf.placeholder(tf.float32, shape=(batch_size, rand_dim))
    
# _labels = None    

# _x = tf.placeholder(tf.float32, shape=(batch_size, data_dim))
# _g_z = generator_model(_z)

# epsilon = tf.placeholder(tf.float32, shape=(batch_size, 1))
    
# x_hat = epsilon * _x + (1.0 - epsilon) * _g_z
# gradients = tf.gradients(discriminator_model(x_hat), [x_hat])
# _gradient_penalty = 10.0 * tf.square(tf.norm(gradients[0], ord=2) - 1.0)

# # calculate discriminator's loss
# _disc_loss_generated = em_loss(tf.ones(batch_size), discriminator_model(_g_z))
# _disc_loss_real = em_loss(tf.ones(batch_size), discriminator_model(_x))
# _disc_loss = _disc_loss_generated - _disc_loss_real + _gradient_penalty

# # update f by taking an SGD step on mini-batch loss LD(f)
# disc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, beta2=0.9).minimize(_disc_loss, var_list=discriminator_model.trainable_weights)

# sess = K.get_session()

# # compile models

# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss=[em_loss])

# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# print('pre-training the critic...')
# K.set_learning_phase(1) # 1 = train
# for i in range(critic_pre_train_steps):
#     if i%20==0:
#         print('Step: {} of {} critic pre-training.'.format(i, critic_pre_train_steps))
#         loss = train_discriminator_step_e(step=0,seed=i)

# print('Last batch of critic pre-training disc_loss: {}.'.format(loss))

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1) # 1 = train
    
#     # train the discriminator
#     for j in range(k_d):
#         d_l_g, d_l_r = train_discriminator_step_e(step=i+1,seed=i+j)
#     disc_loss_generated.append(d_l_g)
#     disc_loss_real.append(d_l_r)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             labels = get_data_batch(train_e, batch_size, seed=i+j)[:,-label_dim:] # updated for class
#             loss = combined_model.train_on_batch([z, labels], [-np.ones(batch_size)]) # updated for class
#         else:
#             loss = combined_model.train_on_batch(z, [-np.ones(batch_size)])
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_e['Class']==1) # test using all of the actual claim data
#         x = get_data_batch(train_no_label_e, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracy( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
        
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         # K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries   
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_e) - len(X_train_e)
# new_z = np.random.normal(size=(samples,rand_dim))
# new_g_z = generator_model.predict(new_z)

In [None]:
# wgan_samples_e = pd.DataFrame(new_g_z, columns=data_cols)
# # Saving the dataset
# wgan_samples_e.to_csv('dandoww_wgan_excess.csv',index=False)

In [None]:
# Load the pre-generated WGAN samples for the excess business from CSV.
# The commented-out cells above show how this file was produced
# (generator_model.predict -> to_csv('dandoww_wgan_excess.csv')).
wgan_samples_e = pd.read_csv('../input/project/dandoww_wgan_excess.csv')
wgan_samples_e.head()

In [None]:
# Augment the excess training features with the WGAN-generated rows.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x; the result (fresh 0..n-1 index) is the same.
X_wgan_e = pd.concat([X_train_e, wgan_samples_e], ignore_index=True)
X_wgan_e.describe()

In [None]:
# Summary statistics of the original (un-augmented) excess training features,
# for comparison with X_wgan_e.describe() in the previous cell.
X_train_e.describe()

In [None]:
# Every generated row is a synthetic claim, so its label is 1.
ones = np.ones((len(wgan_samples_e),))
one = pd.Series(ones)
# pd.concat replaces Series.append, which is deprecated and no longer
# available in pandas 2.x; row order matches X_wgan_e (real rows, then generated).
y_wgan_e = pd.concat([y_train_e, one], ignore_index=True)
# y_wgan_e

### XGBoost (WGAN)

#### Primary Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC since the data is imbalanced.
# # Use a random 90% subset of the rows and, for each tree, only 50% of the columns to
# # improve speed and reduce overfitting.
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_wgan_p, y_wgan_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier for the primary business, trained on the WGAN-augmented
# data with fixed hyper-parameters (the grid search cell above is disabled).
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    max_depth=4,
    learning_rate=0.01,
    gamma=0,
    reg_lambda=1,
    scale_pos_weight=3,
    subsample=0.9,
    colsample_bytree=0.5,
)
# Stop adding trees once PR-AUC on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set here is the test split that is also used for the
# final metrics below, so those metrics may be optimistic; consider a
# separate validation split.
clf_xgb.fit(
    X_wgan_p,
    y_wgan_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Early-stopping summary for the primary WGAN model: best value of the
# eval metric ('aucpr') and the boosting round at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# Evaluate the primary WGAN model on the test split.
# Hard class predictions drive accuracy and the classification report.
y_pred = clf_xgb.predict(X_test_p)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)
# ROC AUC is a ranking metric, so it must be computed from the predicted
# positive-class probabilities. Feeding it thresholded 0/1 labels collapses
# the curve to a single operating point and under-reports the AUC.
roc_auc = roc_auc_score(y_test_p, clf_xgb.predict_proba(X_test_p)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities on the primary test split.
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:, 1]

# ROC curve and the area under it.
fpr_wgan_p, tpr_wgan_p, _ = roc_curve(y_test_p, probabilities)
roc_auc_wgan_p = auc(fpr_wgan_p, tpr_wgan_p)
roc_display_wgan_p = RocCurveDisplay(fpr=fpr_wgan_p, tpr=tpr_wgan_p)

# Precision-recall curve and the area under it (recall on the x axis).
prec_wgan_p, recall_wgan_p, _ = precision_recall_curve(y_test_p, probabilities)
pr_auc_wgan_p = auc(recall_wgan_p, prec_wgan_p)
pr_display_wgan_p = PrecisionRecallDisplay(precision=prec_wgan_p, recall=recall_wgan_p)

print('ROC AUC:', roc_auc_wgan_p)
print('Precision-Recall AUC:', pr_auc_wgan_p)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_wgan_p.plot(ax=ax1)
pr_display_wgan_p.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the primary WGAN model on the test split, shown as
# integer counts with human-readable class names.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator; confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test_p, y_test_p, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit with a single tree (n_estimators=1) purely so that the first tree
# can be inspected and drawn; this overwrites the clf_xgb trained above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0,
                            learning_rate=0.01,
                            max_depth=4,
                            reg_lambda=1,
                            scale_pos_weight=3,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_wgan_p, y_wgan_p)

# Print every feature-importance flavour the booster exposes.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling. Colours must be full '#RRGGBB' hex strings; the previous
# 5-digit value '#78cbe' is not a valid Graphviz colour.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe1'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


#### Excess Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC since the data is imbalanced.
# # Use a random 90% subset of the rows and, for each tree, only 50% of the columns to
# # improve speed and reduce overfitting.
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_wgan_e, y_wgan_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# XGBoost classifier for the excess business, trained on the WGAN-augmented
# data with fixed hyper-parameters (the grid search cell above is disabled).
clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    max_depth=6,
    learning_rate=0.1,
    gamma=0.1,
    reg_lambda=5,
    scale_pos_weight=2,
    subsample=0.9,
    colsample_bytree=0.5,
)
# Stop adding trees once PR-AUC on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set here is the test split that is also used for the
# final metrics below, so those metrics may be optimistic; consider a
# separate validation split.
clf_xgb.fit(
    X_wgan_e,
    y_wgan_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Early-stopping summary for the excess WGAN model: best value of the
# eval metric ('aucpr') and the boosting round at which it was reached.
print('Best score:', clf_xgb.best_score)
print('Best iteration:', clf_xgb.best_iteration)

In [None]:
# Evaluate the excess WGAN model on the test split.
# Hard class predictions drive accuracy and the classification report.
y_pred = clf_xgb.predict(X_test_e)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)
# ROC AUC is a ranking metric, so it must be computed from the predicted
# positive-class probabilities. Feeding it thresholded 0/1 labels collapses
# the curve to a single operating point and under-reports the AUC.
roc_auc = roc_auc_score(y_test_e, clf_xgb.predict_proba(X_test_e)[:, 1])
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class probabilities on the excess test split.
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:, 1]

# ROC curve and the area under it.
fpr_wgan_e, tpr_wgan_e, _ = roc_curve(y_test_e, probabilities)
roc_auc_wgan_e = auc(fpr_wgan_e, tpr_wgan_e)
roc_display_wgan_e = RocCurveDisplay(fpr=fpr_wgan_e, tpr=tpr_wgan_e)

# Precision-recall curve and the area under it (recall on the x axis).
prec_wgan_e, recall_wgan_e, _ = precision_recall_curve(y_test_e, probabilities)
pr_auc_wgan_e = auc(recall_wgan_e, prec_wgan_e)
pr_display_wgan_e = PrecisionRecallDisplay(precision=prec_wgan_e, recall=recall_wgan_e)

print('ROC AUC:', roc_auc_wgan_e)
print('Precision-Recall AUC:', pr_auc_wgan_e)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
roc_display_wgan_e.plot(ax=ax1)
pr_display_wgan_e.plot(ax=ax2)
plt.show()

In [None]:
# Confusion matrix of the excess WGAN model on the test split, shown as
# integer counts with human-readable class names.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator; confirm the
# installed version before upgrading.
plot_confusion_matrix(clf_xgb, X_test_e, y_test_e, values_format='d', 
                      display_labels=['Non-claim', 'Claim'])

In [None]:
# Refit with a single tree (n_estimators=1) purely so that the first tree
# can be inspected and drawn; this overwrites the clf_xgb trained above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=0.1,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=5,
                            scale_pos_weight=2,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_wgan_e, y_wgan_e)

# Print every feature-importance flavour the booster exposes.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling. Colours must be full '#RRGGBB' hex strings; the previous
# 5-digit value '#78cbe' is not a valid Graphviz colour.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe1'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


## WCGAN

#### Primary Business

In [None]:
# Primary training features with the target attached as a 'Class' column
# (X_train_p itself is left untouched).
X_encoded_w_classes_p = X_train_p.assign(Class=y_train_p)
X_encoded_w_classes_p.head()

In [None]:
# Finding rows with class '1'
# Keep only the claim rows (Class == 1); these are the samples the GAN learns from.
train_p = X_encoded_w_classes_p[X_encoded_w_classes_p['Class'] == 1].copy()
claim_w_classes_p = train_p.copy()
claim_w_classes_p['Class'] = y_train_p  # re-attach labels (aligned on the index)
train_p = claim_w_classes_p.reset_index(drop=True)

# Split the column names into label and feature columns.
label_cols = ['Class']
data_cols = [col for col in train_p.columns if col not in label_cols]

# Feature-only view of the claim rows (labels dropped).
train_no_label_p = train_p.drop(columns='Class')
train_no_label_p.head()

In [None]:
# rand_dim = len(train_no_label_p.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 32 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 1e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# data_cols = train_no_label_p.columns
# data_cols_w_class = train_p.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = len(label_cols)
# with_class = True
# print('label_dim: ', label_dim)
# print('label_cols: ', label_cols)
    
# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'WCGAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# labels_tensor = layers.Input(shape=(label_dim,)) # updated for class
# generated_image_tensor = generator_network_w_label(generator_input_tensor, labels_tensor, data_dim, label_dim, base_n_count) # updated for class

# generated_or_real_image_tensor = layers.Input(shape=(data_dim + label_dim,)) # updated for class

# discriminator_output = critic_network(generated_or_real_image_tensor, data_dim + label_dim, base_n_count) # updated for class

# generator_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[generated_image_tensor], name='generator') # updated for class
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model([generator_input_tensor, labels_tensor])) # updated for class
# combined_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[combined_output], name='combined') # updated for class

In [None]:
# _z = tf.placeholder(tf.float32, shape=(batch_size, rand_dim))
    
# _labels = None    

# _x = tf.placeholder(tf.float32, shape=(batch_size, data_dim + label_dim)) 
# _labels = tf.placeholder(tf.float32, shape=(batch_size, label_dim)) # updated for class
# _g_z = generator_model(inputs=[_z, _labels]) # updated for class

# epsilon = tf.placeholder(tf.float32, shape=(batch_size, 1))
    
# x_hat = epsilon * _x + (1.0 - epsilon) * _g_z
# gradients = tf.gradients(discriminator_model(x_hat), [x_hat])
# _gradient_penalty = 10.0 * tf.square(tf.norm(gradients[0], ord=2) - 1.0)

# # calculate discriminator's loss
# _disc_loss_generated = em_loss(tf.ones(batch_size), discriminator_model(_g_z))
# _disc_loss_real = em_loss(tf.ones(batch_size), discriminator_model(_x))
# _disc_loss = _disc_loss_generated - _disc_loss_real + _gradient_penalty

# # update f by taking an SGD step on mini-batch loss LD(f)
# disc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, beta2=0.9).minimize(_disc_loss, var_list=discriminator_model.trainable_weights)

# sess = K.get_session()

# # compile models

# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss=[em_loss])

# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# print('pre-training the critic...')
# K.set_learning_phase(1) # 1 = train
# for i in range(critic_pre_train_steps):
#     if i%20==0:
#         print('Step: {} of {} critic pre-training.'.format(i, critic_pre_train_steps))
#         loss = train_discriminator_step_p(step=0,seed=i)

# print('Last batch of critic pre-training disc_loss: {}.'.format(loss))

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1) # 1 = train
    
#     # train the discriminator
#     for j in range(k_d):
#         d_l_g, d_l_r = train_discriminator_step_p(step=i+1,seed=i+j)
#     disc_loss_generated.append(d_l_g)
#     disc_loss_real.append(d_l_r)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             labels = get_data_batch(train_p, batch_size, seed=i+j)[:,-label_dim:] # updated for class
#             loss = combined_model.train_on_batch([z, labels], [-np.ones(batch_size)]) # updated for class
#         else:
#             loss = combined_model.train_on_batch(z, [-np.ones(batch_size)])
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_p['Class']==1) - 1 # test using all of the actual claim data - 1
#         x = get_data_batch(train_p, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracyConditional( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
        
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         # K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries   
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_p) - len(X_train_p)
# new_z = np.random.normal(size=(samples,rand_dim))
# labels_z = np.ones((samples,1))
# new_g_z = generator_model.predict([new_z, labels_z])

In [None]:
# wcgan_samples_p = pd.DataFrame(new_g_z, columns=data_cols_w_class)
# # Saving the dataset
# wcgan_samples_p.to_csv('dandoww_wcgan_primary.csv',index=False)

In [None]:
# Load the pre-generated WCGAN samples for the primary business from CSV.
# The commented-out cells above show how this file was produced
# (generator_model.predict -> to_csv('dandoww_wcgan_primary.csv')); unlike the
# WGAN file it was written with the 'Class' column included.
wcgan_samples_p = pd.read_csv('../input/project/dandoww_wcgan_primary.csv')
wcgan_samples_p.head()

In [None]:
# Remove the generated 'Class' column so the samples line up with X_train_p.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
wcgan_samples_p = wcgan_samples_p.drop(columns='Class', errors='ignore')
wcgan_samples_p.head()

In [None]:
# Augment the primary training features with the WCGAN-generated rows.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x; the result (fresh 0..n-1 index) is the same.
X_wcgan_p = pd.concat([X_train_p, wcgan_samples_p], ignore_index=True)
X_wcgan_p.describe()

In [None]:
# Summary statistics of the original primary training features, shown next to
# the X_wcgan_p.describe() output of the previous cell.
X_train_p.describe()

In [None]:
# Every generated row gets label 1, so build one label per generated sample and
# append them to the real training labels.
# pd.concat replaces Series.append, which is deprecated since pandas 1.4 and
# removed in pandas 2.0.
ones = np.ones((len(wcgan_samples_p),))
one = pd.Series(ones)
y_wcgan_p = pd.concat([y_train_p, one], ignore_index=True)

#### Excess Business

In [None]:
# Excess-business training features with the target attached as a 'Class' column
# (assign() works on a copy, so X_train_e itself is left untouched).
X_encoded_w_classes_e = X_train_e.assign(Class=y_train_e)
X_encoded_w_classes_e.head()

In [None]:
# Keep only the rows whose Class is 1.
is_claim_e = X_encoded_w_classes_e['Class'] == 1
claim_w_classes_e = X_encoded_w_classes_e.loc[is_claim_e].copy()
# Index-aligned re-assignment of the label column.
# NOTE(review): looks redundant because the rows were just filtered on Class == 1 - confirm.
claim_w_classes_e['Class'] = y_train_e

# Same rows with a fresh 0..n-1 index.
train_e = claim_w_classes_e.reset_index(drop=True)

# Column bookkeeping: label column(s) versus feature columns.
label_cols = ['Class']
data_cols = [col for col in train_e.columns if col not in label_cols]

# Feature-only view of the Class == 1 rows.
train_no_label_e = train_e.drop(columns='Class')
train_no_label_e.head()

In [None]:
# rand_dim = len(train_no_label_e.columns) # 32 # needs to be ~data_dim
# base_n_count = 128 # 128

# nb_steps = 10000 + 1 # 50000 # Add one for logging of the last interval
# batch_size = 128 # 64

# k_d = 1  # number of critic network updates per adversarial training step
# k_g = 1  # number of generator network updates per adversarial training step
# critic_pre_train_steps = 100 # 100  # number of steps to pre-train the critic before starting adversarial training
# log_interval = 100 # 100  # interval (in steps) at which to log loss summaries and save plots of image samples to disc
# learning_rate = 1e-4 # 5e-5
# # data_dir = 'cache/'
# # generator_model_path, discriminator_model_path, loss_pickle_path = None, None, None
# # show = True 

In [None]:
# data_cols = train_no_label_e.columns
# data_cols_w_class = train_e.columns
# data_dim = len(data_cols)
# print('data_dim: ', data_dim)
# print('data_cols: ', data_cols)
    
# label_dim = len(label_cols)
# with_class = True
# print('label_dim: ', label_dim)
# print('label_cols: ', label_cols)
    
# K.set_learning_phase(1) # 1 = train

# cache_prefix = 'WCGAN'

# generator_input_tensor = layers.Input(shape=(rand_dim, ))
# labels_tensor = layers.Input(shape=(label_dim,)) # updated for class
# generated_image_tensor = generator_network_w_label(generator_input_tensor, labels_tensor, data_dim, label_dim, base_n_count) # updated for class

# generated_or_real_image_tensor = layers.Input(shape=(data_dim + label_dim,)) # updated for class

# discriminator_output = critic_network(generated_or_real_image_tensor, data_dim + label_dim, base_n_count) # updated for class

# generator_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[generated_image_tensor], name='generator') # updated for class
# discriminator_model = models.Model(inputs=[generated_or_real_image_tensor],
#                                    outputs=[discriminator_output],
#                                    name='discriminator')

# combined_output = discriminator_model(generator_model([generator_input_tensor, labels_tensor])) # updated for class
# combined_model = models.Model(inputs=[generator_input_tensor, labels_tensor], outputs=[combined_output], name='combined') # updated for class

In [None]:
# _z = tf.placeholder(tf.float32, shape=(batch_size, rand_dim))
    
# _labels = None    

# _x = tf.placeholder(tf.float32, shape=(batch_size, data_dim + label_dim)) 
# _labels = tf.placeholder(tf.float32, shape=(batch_size, label_dim)) # updated for class
# _g_z = generator_model(inputs=[_z, _labels]) # updated for class

# epsilon = tf.placeholder(tf.float32, shape=(batch_size, 1))
    
# x_hat = epsilon * _x + (1.0 - epsilon) * _g_z
# gradients = tf.gradients(discriminator_model(x_hat), [x_hat])
# _gradient_penalty = 10.0 * tf.square(tf.norm(gradients[0], ord=2) - 1.0)

# # calculate discriminator's loss
# _disc_loss_generated = em_loss(tf.ones(batch_size), discriminator_model(_g_z))
# _disc_loss_real = em_loss(tf.ones(batch_size), discriminator_model(_x))
# _disc_loss = _disc_loss_generated - _disc_loss_real + _gradient_penalty

# # update f by taking an SGD step on mini-batch loss LD(f)
# disc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, beta2=0.9).minimize(_disc_loss, var_list=discriminator_model.trainable_weights)

# sess = K.get_session()

# # compile models

# adam = optimizers.Adam(lr=learning_rate, beta_1=0.5, beta_2=0.9)

# discriminator_model.trainable = False
# combined_model.compile(optimizer=adam, loss=[em_loss])

# combined_loss, disc_loss_generated, disc_loss_real, xgb_losses = [], [], [], []

# # print(generator_model.summary())
# # print(discriminator_model.summary())
# # print(combined_model.summary())

In [None]:
# print('pre-training the critic...')
# K.set_learning_phase(1) # 1 = train
# for i in range(critic_pre_train_steps):
#     if i%20==0:
#         print('Step: {} of {} critic pre-training.'.format(i, critic_pre_train_steps))
#         loss = train_discriminator_step_e(step=0,seed=i)

# print('Last batch of critic pre-training disc_loss: {}.'.format(loss))

In [None]:
# for i in range(0, nb_steps):
#     K.set_learning_phase(1) # 1 = train
    
#     # train the discriminator
#     for j in range(k_d):
#         d_l_g, d_l_r = train_discriminator_step_e(step=i+1,seed=i+j)
#     disc_loss_generated.append(d_l_g)
#     disc_loss_real.append(d_l_r)
    
#     # train the generator
#     for j in range(k_g):
#         np.random.seed(i+j)
#         z = np.random.normal(size=(batch_size, rand_dim))
#         if with_class:
#             labels = get_data_batch(train_e, batch_size, seed=i+j)[:,-label_dim:] # updated for class
#             loss = combined_model.train_on_batch([z, labels], [-np.ones(batch_size)]) # updated for class
#         else:
#             loss = combined_model.train_on_batch(z, [-np.ones(batch_size)])
#     combined_loss.append(loss)
    
#     # Determine xgb loss each step, after training generator and discriminator
#     if not i % 10: # 2x faster than testing each step...
#         K.set_learning_phase(0) # 0 = test
#         test_size = np.sum(train_e['Class']==1) # test using all of the actual claim data
#         x = get_data_batch(train_e, test_size, seed=i)
#         z = np.random.normal(size=(test_size, rand_dim))
#         if with_class:
#             labels = x[:,-label_dim:]
#             g_z = generator_model.predict([z, labels])
#         else:
#             g_z = generator_model.predict(z)
#         xgb_loss = CheckAccuracyConditional( x, g_z, data_cols, label_cols, seed=0, with_class=with_class, data_dim=data_dim )
#         xgb_losses = np.append(xgb_losses, xgb_loss)
        
#     if not i % log_interval:
#         print('Step: {} of {}.'.format(i, nb_steps))
#         # K.set_learning_phase(0) # 0 = test
                        
#         # loss summaries   
#         print( 'Losses: G, D Gen, D Real, Xgb: {:.4f}, {:.4f}, {:.4f}, {:.4f}'.format(combined_loss[-1], disc_loss_generated[-1], disc_loss_real[-1], xgb_losses[-1]) )
#         print( 'D Real - D Gen: {:.4f}'.format(disc_loss_real[-1]-disc_loss_generated[-1]) )

In [None]:
# fig = plt.figure(figsize=(20,5))
# ax1 = fig.add_subplot(1, 3, 1)
# ax2 = fig.add_subplot(1, 3, 2)
# ax3 = fig.add_subplot(1, 3, 3)

# ax1.plot(combined_loss)
# ax1.set_title('Generator Loss')
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss')

# ax2.plot(disc_loss_generated)
# ax2.set_title('Discriminator Loss of Generated Data')
# ax2.set_xlabel('Step')
# ax2.set_ylabel('Loss')

# ax3.plot(disc_loss_real)
# ax3.set_title('Discriminator Loss of Real Data')
# ax3.set_xlabel('Step')
# ax3.set_ylabel('Loss')

# plt.show()

In [None]:
# # Predicting the generated data
# samples = len(X_train_res_e) - len(X_train_e)
# new_z = np.random.normal(size=(samples,rand_dim))
# labels_z = np.ones((samples,1))
# new_g_z = generator_model.predict([new_z, labels_z])

In [None]:
# wcgan_samples_e = pd.DataFrame(new_g_z, columns=data_cols_w_class)
# # Saving the dataset
# wcgan_samples_e.to_csv('dandoww_wcgan_excess.csv',index=False)

In [None]:
# Read back the WCGAN-generated samples for excess business from the attached
# input dataset and preview the first rows.
wcgan_excess_csv = '../input/project/dandoww_wcgan_excess.csv'
wcgan_samples_e = pd.read_csv(wcgan_excess_csv)
wcgan_samples_e.head()

In [None]:
# Remove the label column so only feature columns remain.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'Class' is gone.
wcgan_samples_e = wcgan_samples_e.drop(columns='Class', errors='ignore')
wcgan_samples_e.head()

In [None]:
# Stack the generated samples underneath the real training features.
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4 and
# removed in pandas 2.0; ignore_index=True renumbers the rows 0..n-1 as before.
X_wcgan_e = pd.concat([X_train_e, wcgan_samples_e], ignore_index=True)
X_wcgan_e.describe()

In [None]:
# Summary statistics of the original excess training features, shown next to
# the X_wcgan_e.describe() output of the previous cell.
X_train_e.describe()

In [None]:
# Every generated row gets label 1, so build one label per generated sample and
# append them to the real training labels.
# pd.concat replaces Series.append, which is deprecated since pandas 1.4 and
# removed in pandas 2.0.
ones = np.ones((len(wcgan_samples_e),))
one = pd.Series(ones)
y_wcgan_e = pd.concat([y_train_e, one], ignore_index=True)

### XGBoost (WCGAN)

#### Primary Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC since the data is imbalanced
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_wcgan_p, y_wcgan_p, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_p, y_test_p)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# Hyper-parameters for the primary-business model trained on the
# WCGAN-augmented data (X_wcgan_p / y_wcgan_p).
xgb_params_wcgan_p = dict(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.05,
    max_depth=6,
    reg_lambda=10,
    scale_pos_weight=3,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params_wcgan_p)

# Boosting stops once 'aucpr' on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the test split that is
# also used for the metrics in the following cells - confirm this is intended.
clf_xgb.fit(
    X_wcgan_p,
    y_wcgan_p,
    eval_set=[(X_test_p, y_test_p)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report the early-stopping result recorded on the fitted classifier.
best_score_wcgan_p = clf_xgb.best_score
best_iteration_wcgan_p = clf_xgb.best_iteration
print('Best score:', best_score_wcgan_p)
print('Best iteration:', best_iteration_wcgan_p)

In [None]:
# Evaluate the primary-business model on the held-out test data.
y_pred = clf_xgb.predict(X_test_p)
predictions = [round(value) for value in y_pred]

# Accuracy and the per-class report are label-based metrics.
accuracy = accuracy_score(y_test_p, predictions)
report = classification_report(y_test_p, predictions)

# ROC AUC is a ranking metric and must be computed from scores, not from
# thresholded 0/1 labels (which collapse the curve to a single operating point).
# Column 1 of predict_proba is the positive-class probability.
roc_auc = roc_auc_score(y_test_p, clf_xgb.predict_proba(X_test_p)[:, 1])

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class scores (column 1 of predict_proba) on the primary test set.
y_prob = clf_xgb.predict_proba(X_test_p)
probabilities = y_prob[:, 1]

# ROC curve and the area under it.
fpr_wcgan_p, tpr_wcgan_p, _ = roc_curve(y_test_p, probabilities)
roc_auc_wcgan_p = auc(fpr_wcgan_p, tpr_wcgan_p)
print('ROC AUC:', roc_auc_wcgan_p)

# Precision-recall curve and the area under it (recall on the x axis).
prec_wcgan_p, recall_wcgan_p, _ = precision_recall_curve(y_test_p, probabilities)
pr_auc_wcgan_p = auc(recall_wcgan_p, prec_wcgan_p)
print('Precision-Recall AUC:', pr_auc_wcgan_p)

# Draw both curves side by side: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

roc_display_wcgan_p = RocCurveDisplay(fpr=fpr_wcgan_p, tpr=tpr_wcgan_p)
roc_display_wcgan_p.plot(ax=ax1)

pr_display_wcgan_p = PrecisionRecallDisplay(precision=prec_wcgan_p, recall=recall_wcgan_p)
pr_display_wcgan_p.plot(ax=ax2)

plt.show()

In [None]:
# Confusion matrix of the primary-business model on the test set, with integer
# cell counts and readable class names.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
class_names_p = ['Non-claim', 'Claim']
plot_confusion_matrix(
    clf_xgb,
    X_test_p,
    y_test_p,
    display_labels=class_names_p,
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) so that a single tree can be inspected
# and drawn. Note that this rebinds clf_xgb, replacing the early-stopped model
# fitted in the cells above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=1,
                            learning_rate=0.05,
                            max_depth=6,
                            reg_lambda=10,
                            scale_pos_weight=3,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_wcgan_p, y_wcgan_p)

# Print the per-feature scores under each importance definition.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz hex colours must be full '#rrggbb' values. The previous '#78cbe' had
# only five digits, which Graphviz does not recognise as a colour; it is
# completed here to a light blue.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe4'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

# Last expression of the cell: the rendered graph of tree 0.
xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


#### Excess Business

In [None]:
# param_grid ={
#     'max_depth': [4, 5, 6],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'gamma': [0, 0.1, 1.0],
#     'reg_lambda': [1.0, 5.0, 10.0],
#     'scale_pos_weight' : [1, 2, 3]
# }

# # Score with ROC AUC since the data is imbalanced
# # Choose only 90% random subset of the data and for each tree, choose only 50% of the columns to
# # improve the speed and prevent overfitting
# optimal_params = GridSearchCV(xgb.XGBClassifier(objective='binary:logistic', seed=42, 
#                                                 subsample=0.9, colsample_bytree=0.5),
#                              param_grid=param_grid,
#                               scoring='roc_auc',
#                              verbose=2,
# #                               n_jobs=10,
#                              cv=3)

# optimal_params.fit(X_wcgan_e, y_wcgan_e, verbose=False, early_stopping_rounds=10, 
#             eval_metric='aucpr', eval_set=[(X_test_e, y_test_e)])

In [None]:
# print(optimal_params.best_params_)

In [None]:
# Hyper-parameters for the excess-business model trained on the
# WCGAN-augmented data (X_wcgan_e / y_wcgan_e).
xgb_params_wcgan_e = dict(
    objective='binary:logistic',
    gamma=1,
    learning_rate=0.1,
    max_depth=6,
    reg_lambda=10,
    scale_pos_weight=1,
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**xgb_params_wcgan_e)

# Boosting stops once 'aucpr' on the eval set has not improved for 10 rounds.
# NOTE(review): the eval set used for early stopping is the test split that is
# also used for the metrics in the following cells - confirm this is intended.
clf_xgb.fit(
    X_wcgan_e,
    y_wcgan_e,
    eval_set=[(X_test_e, y_test_e)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Report the early-stopping result recorded on the fitted classifier.
best_score_wcgan_e = clf_xgb.best_score
best_iteration_wcgan_e = clf_xgb.best_iteration
print('Best score:', best_score_wcgan_e)
print('Best iteration:', best_iteration_wcgan_e)

In [None]:
# Evaluate the excess-business model on the held-out test data.
y_pred = clf_xgb.predict(X_test_e)
predictions = [round(value) for value in y_pred]

# Accuracy and the per-class report are label-based metrics.
accuracy = accuracy_score(y_test_e, predictions)
report = classification_report(y_test_e, predictions)

# ROC AUC is a ranking metric and must be computed from scores, not from
# thresholded 0/1 labels (which collapse the curve to a single operating point).
# Column 1 of predict_proba is the positive-class probability.
roc_auc = roc_auc_score(y_test_e, clf_xgb.predict_proba(X_test_e)[:, 1])

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("ROC AUC:", roc_auc)
print(report)

In [None]:
# Positive-class scores (column 1 of predict_proba) on the excess test set.
y_prob = clf_xgb.predict_proba(X_test_e)
probabilities = y_prob[:, 1]

# ROC curve and the area under it.
fpr_wcgan_e, tpr_wcgan_e, _ = roc_curve(y_test_e, probabilities)
roc_auc_wcgan_e = auc(fpr_wcgan_e, tpr_wcgan_e)
print('ROC AUC:', roc_auc_wcgan_e)

# Precision-recall curve and the area under it (recall on the x axis).
prec_wcgan_e, recall_wcgan_e, _ = precision_recall_curve(y_test_e, probabilities)
pr_auc_wcgan_e = auc(recall_wcgan_e, prec_wcgan_e)
print('Precision-Recall AUC:', pr_auc_wcgan_e)

# Draw both curves side by side: ROC on the left, precision-recall on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

roc_display_wcgan_e = RocCurveDisplay(fpr=fpr_wcgan_e, tpr=tpr_wcgan_e)
roc_display_wcgan_e.plot(ax=ax1)

pr_display_wcgan_e = PrecisionRecallDisplay(precision=prec_wcgan_e, recall=recall_wcgan_e)
pr_display_wcgan_e.plot(ax=ax2)

plt.show()

In [None]:
# Confusion matrix of the excess-business model on the test set, with integer
# cell counts and readable class names.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
class_names_e = ['Non-claim', 'Claim']
plot_confusion_matrix(
    clf_xgb,
    X_test_e,
    y_test_e,
    display_labels=class_names_e,
    values_format='d',
)

In [None]:
# Fit a one-tree model (n_estimators=1) so that a single tree can be inspected
# and drawn. Note that this rebinds clf_xgb, replacing the early-stopped model
# fitted in the cells above.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            gamma=1,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_lambda=10,
                            scale_pos_weight=1,
                            seed=42,
                            n_estimators=1)
clf_xgb.fit(X_wcgan_e, y_wcgan_e)

# Print the per-feature scores under each importance definition.
bst = clf_xgb.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz hex colours must be full '#rrggbb' values. The previous '#78cbe' had
# only five digits, which Graphviz does not recognise as a colour; it is
# completed here to a light blue.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe4'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

# Last expression of the cell: the rendered graph of tree 0.
xgb.to_graphviz(clf_xgb, num_trees=0, size="10,10",
                condition_node_params=node_params,
                leaf_node_params=leaf_params)


## Comparison Plots

In [None]:
# Chance-level reference for the ROC plots: scoring every sample with the same
# constant yields the tpr = fpr diagonal.
# NOTE(review): y_test is defined outside this section (the models above use
# y_test_p / y_test_e) - confirm it is still in scope on a fresh run.
random_probs = [0] * len(y_test)
fpr, tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# ROC comparison of all resampling strategies for primary business.
plt.figure(figsize=(8, 6))

# Dashed diagonal: chance-level baseline.
plt.plot(fpr, tpr, color='b', linestyle='--', label='AUC: 0.5')

# (x values, y values, line colour, legend template, AUC) per strategy.
roc_series_p = [
    (fpr_p, tpr_p, 'r', 'No oversampling, AUC: %.4f', roc_auc_p),
    (fpr_ros_p, tpr_ros_p, 'g', 'ROS, AUC: %.4f', roc_auc_ros_p),
    (fpr_sm_p, tpr_sm_p, 'k', 'SMOTE, AUC: %.4f', roc_auc_sm_p),
    (fpr_gan_p, tpr_gan_p, 'orange', 'GAN, AUC: %.4f', roc_auc_gan_p),
    (fpr_cgan_p, tpr_cgan_p, 'c', 'cGAN, AUC: %.4f', roc_auc_cgan_p),
    (fpr_wgan_p, tpr_wgan_p, 'm', 'WGAN, AUC: %.4f', roc_auc_wgan_p),
    (fpr_wcgan_p, tpr_wcgan_p, 'y', 'WCGAN, AUC: %.4f', roc_auc_wcgan_p),
]
for curve_x, curve_y, curve_color, label_fmt, curve_auc in roc_series_p:
    plt.plot(curve_x, curve_y, color=curve_color, label=label_fmt % curve_auc)

plt.title('ROC-AUC of Primary Business')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

plt.show()

In [None]:
# ROC comparison of all resampling strategies for excess business.
plt.figure(figsize=(8, 6))

# Dashed diagonal: chance-level baseline.
plt.plot(fpr, tpr, color='b', linestyle='--', label='AUC: 0.5')

# (x values, y values, line colour, legend template, AUC) per strategy.
roc_series_e = [
    (fpr_e, tpr_e, 'r', 'No oversampling, AUC: %.4f', roc_auc_e),
    (fpr_ros_e, tpr_ros_e, 'g', 'ROS, AUC: %.4f', roc_auc_ros_e),
    (fpr_sm_e, tpr_sm_e, 'k', 'SMOTE, AUC: %.4f', roc_auc_sm_e),
    (fpr_gan_e, tpr_gan_e, 'orange', 'GAN, AUC: %.4f', roc_auc_gan_e),
    (fpr_cgan_e, tpr_cgan_e, 'c', 'cGAN, AUC: %.4f', roc_auc_cgan_e),
    (fpr_wgan_e, tpr_wgan_e, 'm', 'WGAN, AUC: %.4f', roc_auc_wgan_e),
    (fpr_wcgan_e, tpr_wcgan_e, 'y', 'WCGAN, AUC: %.4f', roc_auc_wcgan_e),
]
for curve_x, curve_y, curve_color, label_fmt, curve_auc in roc_series_e:
    plt.plot(curve_x, curve_y, color=curve_color, label=label_fmt % curve_auc)

plt.title('ROC-AUC of Excess Business')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')

plt.show()

In [None]:
# Precision-recall comparison of all resampling strategies for primary business.
plt.figure(figsize=(8, 6))

# (recall, precision, line colour, legend template, PR AUC) per strategy.
pr_series_p = [
    (recall_p, prec_p, 'r', 'No oversampling, AUC: %.4f', pr_auc_p),
    (recall_ros_p, prec_ros_p, 'g', 'ROS, AUC: %.4f', pr_auc_ros_p),
    (recall_sm_p, prec_sm_p, 'k', 'SMOTE, AUC: %.4f', pr_auc_sm_p),
    (recall_gan_p, prec_gan_p, 'b', 'GAN, AUC: %.4f', pr_auc_gan_p),
    (recall_cgan_p, prec_cgan_p, 'c', 'cGAN, AUC: %.4f', pr_auc_cgan_p),
    (recall_wgan_p, prec_wgan_p, 'm', 'WGAN, AUC: %.4f', pr_auc_wgan_p),
    (recall_wcgan_p, prec_wcgan_p, 'y', 'WCGAN, AUC: %.4f', pr_auc_wcgan_p),
]
for curve_x, curve_y, curve_color, label_fmt, curve_auc in pr_series_p:
    plt.plot(curve_x, curve_y, color=curve_color, label=label_fmt % curve_auc)

plt.title('Precision-Recall Curve of Primary Business')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')

plt.show()

In [None]:
# Precision-recall comparison of all resampling strategies for excess business.
plt.figure(figsize=(8, 6))

# (recall, precision, line colour, legend template, PR AUC) per strategy.
pr_series_e = [
    (recall_e, prec_e, 'r', 'No oversampling, AUC: %.4f', pr_auc_e),
    (recall_ros_e, prec_ros_e, 'g', 'ROS, AUC: %.4f', pr_auc_ros_e),
    (recall_sm_e, prec_sm_e, 'k', 'SMOTE, AUC: %.4f', pr_auc_sm_e),
    (recall_gan_e, prec_gan_e, 'b', 'GAN, AUC: %.4f', pr_auc_gan_e),
    (recall_cgan_e, prec_cgan_e, 'c', 'cGAN, AUC: %.4f', pr_auc_cgan_e),
    (recall_wgan_e, prec_wgan_e, 'm', 'WGAN, AUC: %.4f', pr_auc_wgan_e),
    (recall_wcgan_e, prec_wcgan_e, 'y', 'WCGAN, AUC: %.4f', pr_auc_wcgan_e),
]
for curve_x, curve_y, curve_color, label_fmt, curve_auc in pr_series_e:
    plt.plot(curve_x, curve_y, color=curve_color, label=label_fmt % curve_auc)

plt.title('Precision-Recall Curve of Excess Business')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower right')

plt.show()

In [None]:
from sklearn.decomposition import PCA

# Two-component PCA; the cells below call fit_transform on each sample set to
# obtain a 2-D projection for scatter plotting.
pca = PCA(n_components=2)

In [None]:
# Project the GAN-generated primary samples onto their first two principal
# components and show them as a scatter plot.
projected = pca.fit_transform(gan_samples_p)
plt.scatter(projected[:, 0], projected[:, 1])
plt.title('PCA projection of GAN samples (primary business)')
plt.xlabel('component 1')
plt.ylabel('component 2')
# The bare `plt.plot` here was an attribute reference, not a call: it rendered
# nothing and only echoed the function repr. plt.show() displays the figure.
plt.show()

In [None]:
# Project the WGAN-generated primary samples onto their first two principal
# components and show them as a scatter plot.
projected = pca.fit_transform(wgan_samples_p)
plt.scatter(projected[:, 0], projected[:, 1])
plt.title('PCA projection of WGAN samples (primary business)')
plt.xlabel('component 1')
plt.ylabel('component 2')
# The bare `plt.plot` here was an attribute reference, not a call: it rendered
# nothing and only echoed the function repr. plt.show() displays the figure.
plt.show()