In [1]:
import pandas as pd
from tabulate import tabulate

# Create a DataFrame with the initial specified data
data_1 = {
    'Method': ['RUS_1'], 'n_neg': [803682], 'n_pos': [8118],
    'n_neg:n_pos': ['99:1'], 'Decision Threshold': [0.01],
    'AUC': [0.8204], 'TPR': [0.7444], 'TNR': [0.7444], 'G-Mean' : [0.7444]
}

# Additional metrics to be added
data_2 = {
    'Method': ['RUS_2'], 'n_neg': [32472], 'n_pos': [8118],  
    'n_neg:n_pos': ['4:1'], 'Decision Threshold': [0.268],  
    'AUC': [0.8315],  'TPR': [0.6336], 'TNR': [0.8433],'G-Mean' : [0.7309]
}

data_3 = {
    'Method': ['RUS_3'], 'n_neg': [12177], 'n_pos': [8118],  
    'n_neg:n_pos': ['3:2'], 'Decision Threshold': [0.4200],  
    'AUC': [0.8354], 'TPR': [0.7611], 'TNR': [0.7486], 'G-Mean' : [0.7548]
}
data_4 = {
    'Method': ['RUS_4'], 'n_neg': [8118], 'n_pos': [8118],  
    'n_neg:n_pos': ['1:1'], 'Decision Threshold': [0.4970],  
    'AUC': [0.8378], 'TPR': [0.7573], 'TNR': [0.7598], 'G-Mean' : [0.7585]
}
data_5 = {
    'Method': ['RUS_5'], 'n_neg': [5412], 'n_pos': [8118],  
    'n_neg:n_pos': ['2:3'], 'Decision Threshold': [0.5730],  
    'AUC': [0.8289], 'TPR': [0.7727], 'TNR': [0.7393], 'G-Mean' : [0.7558]
}

df_1 = pd.DataFrame(data_1)
# Update DataFrame with new data
df_2 = pd.DataFrame(data_2)
df = pd.concat([df_1, df_2, pd.DataFrame(data_3), pd.DataFrame(data_4),pd.DataFrame(data_5)]).reset_index(drop=True)

# Format numbers with commas for thousands and percentages/decimals appropriately
for col in ['AUC', 'TPR', 'TNR']:
    df[col] = df[col].apply(lambda x: f"{x:.4f}")

df['n_neg'] = df['n_neg'].apply(lambda x: f"{x:,}")
df['n_pos'] = df['n_pos'].apply(lambda x: f"{x:,}")

# Print the DataFrame using tabulate
print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))


+----------+---------+---------+---------------+----------------------+--------+--------+--------+----------+
| Method   | n_neg   | n_pos   | n_neg:n_pos   |   Decision Threshold |    AUC |    TPR |    TNR |   G-Mean |
| RUS_1    | 803,682 | 8,118   | 99:1          |                0.01  | 0.8204 | 0.7444 | 0.7444 |   0.7444 |
+----------+---------+---------+---------------+----------------------+--------+--------+--------+----------+
| RUS_2    | 32,472  | 8,118   | 4:1           |                0.268 | 0.8315 | 0.6336 | 0.8433 |   0.7309 |
+----------+---------+---------+---------------+----------------------+--------+--------+--------+----------+
| RUS_3    | 12,177  | 8,118   | 3:2           |                0.42  | 0.8354 | 0.7611 | 0.7486 |   0.7548 |
+----------+---------+---------+---------------+----------------------+--------+--------+--------+----------+
| RUS_4    | 8,118   | 8,118   | 1:1           |                0.497 | 0.8378 | 0.7573 | 0.7598 |   0.7585 |
+---------

In [2]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
df_final = pd.read_csv("data_set_result_yearly.csv")
# df_aggregated_final = pd.read_csv("data_set_result_aggregated_by_npi.csv")

In [3]:
df_final.columns

Index(['rndrng_npi', 'tot_benes_mean', 'tot_benes_sum', 'tot_benes_median',
       'tot_benes_std', 'tot_benes_min', 'tot_benes_max', 'tot_srvcs_mean',
       'tot_srvcs_sum', 'tot_srvcs_median',
       ...
       'rndrng_prvdr_type_advanced heart failure and transplant cardiology',
       'rndrng_prvdr_type_clinical cardiac electrophysiology',
       'rndrng_prvdr_type_medical toxicology',
       'rndrng_prvdr_type_oral surgery (dentist only)',
       'rndrng_prvdr_type_hematopoietic cell transplantation and cellular therapy',
       'rndrng_prvdr_type_medical genetics and genomics',
       'rndrng_prvdr_type_pharmacy',
       'rndrng_prvdr_type_undersea and hyperbaric medicine',
       'rndrng_prvdr_type_adult congenital heart disease',
       'rndrng_prvdr_type_micrographic dermatologic surgery'],
      dtype='object', length=153)

In [4]:
X = df_final.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_final['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model
predictions = model.predict(X_test_scaled)
print("ROC AUC Score:", roc_auc_score(y_test, predictions))
print(classification_report(y_test, (predictions > 0.5).astype(int)))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 686us/step - accuracy: 0.9983 - loss: 0.0112 - val_accuracy: 0.9991 - val_loss: 0.0094
Epoch 2/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 691us/step - accuracy: 0.9990 - loss: 0.0089 - val_accuracy: 0.9991 - val_loss: 0.0081
Epoch 3/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 692us/step - accuracy: 0.9990 - loss: 0.0089 - val_accuracy: 0.9991 - val_loss: 0.0101
Epoch 4/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 706us/step - accuracy: 0.9990 - loss: 0.0102 - val_accuracy: 0.9991 - val_loss: 0.0080
Epoch 5/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 701us/step - accuracy: 0.9990 - loss: 0.0098 - val_accuracy: 0.9991 - val_loss: 0.0104
Epoch 6/50
[1m194984/194984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 701us/step - accuracy: 0.9990 - loss: 0.0102 - val_accu

<keras.src.callbacks.history.History at 0x23eec14d550>

[1m54163/54163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 383us/step
ROC AUC Score: 0.6923290024577556


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1731563
           1       0.00      0.00      0.00      1624

    accuracy                           1.00   1733187
   macro avg       0.50      0.50      0.50   1733187
weighted avg       1.00      1.00      1.00   1733187



  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
df_final.shape
df_final[df_final['fraud'] == 1].shape
# df_final[df_final['fraud'] == 0].shape
df_final.columns

majority = df_final[df_final['fraud'] == 0]
minority = df_final[df_final['fraud'] == 1]

(8665935, 153)

(8118, 153)

Index(['rndrng_npi', 'tot_benes_mean', 'tot_benes_sum', 'tot_benes_median',
       'tot_benes_std', 'tot_benes_min', 'tot_benes_max', 'tot_srvcs_mean',
       'tot_srvcs_sum', 'tot_srvcs_median',
       ...
       'rndrng_prvdr_type_advanced heart failure and transplant cardiology',
       'rndrng_prvdr_type_clinical cardiac electrophysiology',
       'rndrng_prvdr_type_medical toxicology',
       'rndrng_prvdr_type_oral surgery (dentist only)',
       'rndrng_prvdr_type_hematopoietic cell transplantation and cellular therapy',
       'rndrng_prvdr_type_medical genetics and genomics',
       'rndrng_prvdr_type_pharmacy',
       'rndrng_prvdr_type_undersea and hyperbaric medicine',
       'rndrng_prvdr_type_adult congenital heart disease',
       'rndrng_prvdr_type_micrographic dermatologic surgery'],
      dtype='object', length=153)

In [5]:
#RUS to majority class to be equal to 99 * minority class
ratio = 99
# Here, we match the number of instances in the minority class
majority_undersampled = majority.sample(n=ratio * len(minority), random_state=42)  # `random_state` ensures reproducibility
# Concatenate the minority class with the undersampled majority class
df_RUS_1 = pd.concat([majority_undersampled, minority])
# Shuffle the dataframe to prevent the model from learning any order
df_RUS_1 = df_RUS_1.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_RUS_1['fraud'].value_counts())
df_RUS_1.shape

fraud
0    803682
1      8118
Name: count, dtype: int64


(811800, 153)

In [73]:
X = df_RUS_1.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_RUS_1['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.01
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1ms/step - accuracy: 0.9877 - loss: 0.0695 - val_accuracy: 0.9903 - val_loss: 0.0483
Epoch 2/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 1ms/step - accuracy: 0.9902 - loss: 0.0508 - val_accuracy: 0.9903 - val_loss: 0.0484
Epoch 3/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - accuracy: 0.9900 - loss: 0.0512 - val_accuracy: 0.9903 - val_loss: 0.0484
Epoch 4/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1ms/step - accuracy: 0.9900 - loss: 0.0510 - val_accuracy: 0.9903 - val_loss: 0.0493
Epoch 5/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1ms/step - accuracy: 0.9898 - loss: 0.0517 - val_accuracy: 0.9903 - val_loss: 0.0489
Epoch 6/50
[1m18266/18266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1ms/step - accuracy: 0.9899 - loss: 0.0513 - val_accuracy: 0.9903 - val_loss: 0.048

<keras.src.callbacks.history.History at 0x20352c0f010>

[1m5074/5074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 722us/step
ROC AUC Score: 0.8204611312592616
True Positive Rate (TPR): 0.7444581280788177
True Negative Rate (TNR): 0.7444131992832969
Geometric Mean (G-Mean): 0.7444356633421098
              precision    recall  f1-score   support

           0       1.00      0.74      0.85    160736
           1       0.03      0.74      0.06      1624

    accuracy                           0.74    162360
   macro avg       0.51      0.74      0.45    162360
weighted avg       0.99      0.74      0.84    162360



In [52]:

# # thresholds = [0.001, 0.005, 0.01, 0.015, 0.2]

# from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# # Predict probabilities for the test set
# predictions = model.predict(X_test_scaled)

# # Define thresholds to test
# thresholds = [0.001, 0.005, 0.01, 0.015, 0.2]

# # Iterate over each threshold, calculate metrics, and print them
# for thresh in thresholds:
#     # Binarize predictions based on the current threshold
#     binary_predictions = (predictions > thresh).astype(int)
    
#     # Generate the confusion matrix
#     cm = confusion_matrix(y_test, binary_predictions)
#     TN, FP, FN, TP = cm.ravel()  # Flatten the matrix to get TN, FP, FN, TP
    
#     # Calculate TPR and TNR
#     TPR = TP / (TP + FN)
#     TNR = TN / (TN + FP)
    
#     # Print the results
#     print(f"Threshold: {thresh}")
#     print("Confusion Matrix:")
#     print(cm)
#     print(f"True Positive Rate (TPR): {TPR:.4f}")
#     print(f"True Negative Rate (TNR): {TNR:.4f}")
#     print("Classification Report:")
#     print(classification_report(y_test, binary_predictions))
#     print("ROC AUC Score:", roc_auc_score(y_test, predictions))
#     print("\n" + "-"*50 + "\n")


In [6]:
#RUS to majority class to be equal to 4 * minority class
ratio = 4
# Here, we match the number of instances in the minority class
majority_undersampled = majority.sample(n=ratio*len(minority), random_state=101)  # `random_state` ensures reproducibility
# Concatenate the minority class with the undersampled majority class
df_RUS_2 = pd.concat([majority_undersampled, minority])
# Shuffle the dataframe to prevent the model from learning any order
df_RUS_2 = df_RUS_2.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_RUS_2['fraud'].value_counts())
df_RUS_2.shape

fraud
0    32472
1     8118
Name: count, dtype: int64


(40590, 153)

In [59]:
X = df_RUS_2.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_RUS_2['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.2680
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate: TP / (TP + FN)
TNR = TN / (TN + FP)  # True Negative Rate: TN / (TN + FP)

# Display TPR and TNR
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7776 - loss: 0.5395 - val_accuracy: 0.8045 - val_loss: 0.4150
Epoch 2/50
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8088 - loss: 0.4346 - val_accuracy: 0.8107 - val_loss: 0.3986
Epoch 3/50
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8151 - loss: 0.4215 - val_accuracy: 0.8162 - val_loss: 0.3880
Epoch 4/50
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8135 - loss: 0.4149 - val_accuracy: 0.8211 - val_loss: 0.3870
Epoch 5/50
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8222 - loss: 0.4045 - val_accuracy: 0.8270 - val_loss: 0.3846
Epoch 6/50
[1m914/914[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8213 - loss: 0.4085 - val_accuracy: 0.8276 - val_loss: 0.3808
Epoch 7/50
[1m914/914[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x203446b6d10>

[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477us/step
ROC AUC Score: 0.8315705592581861
True Positive Rate (TPR): 0.6336206896551724
True Negative Rate (TNR): 0.8433939020634432
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      6494
           1       0.50      0.63      0.56      1624

    accuracy                           0.80      8118
   macro avg       0.70      0.74      0.72      8118
weighted avg       0.82      0.80      0.81      8118



In [7]:
#RUS to majority class to be equal to 1.5 * minority class
ratio = 1.5
# Here, we match the number of instances in the minority class
majority_undersampled = majority.sample(n=int(ratio*len(minority)), random_state=101)  # `random_state` ensures reproducibility
# Concatenate the minority class with the undersampled majority class
df_RUS_3 = pd.concat([majority_undersampled, minority])
# Shuffle the dataframe to prevent the model from learning any order
df_RUS_3 = df_RUS_3.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_RUS_3['fraud'].value_counts())
df_RUS_3.shape

fraud
0    12177
1     8118
Name: count, dtype: int64


(20295, 153)

In [63]:
X = df_RUS_3.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_RUS_3['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.4200
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate: TP / (TP + FN)
TNR = TN / (TN + FP)  # True Negative Rate: TN / (TN + FP)

# Display TPR and TNR
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5809 - loss: 0.7333 - val_accuracy: 0.7007 - val_loss: 0.5896
Epoch 2/50
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6621 - loss: 0.6210 - val_accuracy: 0.7063 - val_loss: 0.5635
Epoch 3/50
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7150 - loss: 0.5748 - val_accuracy: 0.7204 - val_loss: 0.5507
Epoch 4/50
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7252 - loss: 0.5544 - val_accuracy: 0.7223 - val_loss: 0.5441
Epoch 5/50
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7312 - loss: 0.5556 - val_accuracy: 0.7211 - val_loss: 0.5412
Epoch 6/50
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step - accuracy: 0.7331 - loss: 0.5400 - val_accuracy: 0.7254 - val_loss: 0.5359
Epoch 7/50
[1m457/457[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x20344da1b10>

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
ROC AUC Score: 0.8354419336239771
True Positive Rate (TPR): 0.7610837438423645
True Negative Rate (TNR): 0.7486652977412731
              precision    recall  f1-score   support

           0       0.82      0.75      0.78      2435
           1       0.67      0.76      0.71      1624

    accuracy                           0.75      4059
   macro avg       0.75      0.75      0.75      4059
weighted avg       0.76      0.75      0.76      4059



In [8]:
#RUS to majority class to be equal to 1 * minority class
ratio = 1
# Here, we match the number of instances in the minority class
majority_undersampled = majority.sample(n=int(ratio*len(minority)), random_state=77)  # `random_state` ensures reproducibility
# Concatenate the minority class with the undersampled majority class
df_RUS_4 = pd.concat([majority_undersampled, minority])
# Shuffle the dataframe to prevent the model from learning any order
df_RUS_4 = df_RUS_4.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_RUS_4['fraud'].value_counts())
df_RUS_4.shape

fraud
1    8118
0    8118
Name: count, dtype: int64


(16236, 153)

In [66]:
X = df_RUS_4.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_RUS_4['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.4970
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate: TP / (TP + FN)
TNR = TN / (TN + FP)  # True Negative Rate: TN / (TN + FP)

# Display TPR and TNR
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5421 - loss: 0.7724 - val_accuracy: 0.6975 - val_loss: 0.6138
Epoch 2/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 917us/step - accuracy: 0.6669 - loss: 0.6318 - val_accuracy: 0.7259 - val_loss: 0.5770
Epoch 3/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6926 - loss: 0.6009 - val_accuracy: 0.7159 - val_loss: 0.5612
Epoch 4/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7082 - loss: 0.5780 - val_accuracy: 0.7259 - val_loss: 0.5527
Epoch 5/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7194 - loss: 0.5702 - val_accuracy: 0.7159 - val_loss: 0.5529
Epoch 6/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7319 - loss: 0.5504 - val_accuracy: 0.7275 - val_loss: 0.5492
Epoch 7/50
[1m366/366[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x2035027db10>

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
ROC AUC Score: 0.8378041280424178
True Positive Rate (TPR): 0.7573891625615764
True Negative Rate (TNR): 0.7598522167487685
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      1624
           1       0.76      0.76      0.76      1624

    accuracy                           0.76      3248
   macro avg       0.76      0.76      0.76      3248
weighted avg       0.76      0.76      0.76      3248



In [9]:
#RUS to majority class to be equal to 1.5 * minority class
ratio = 2/3
# Here, we match the number of instances in the minority class
majority_undersampled = majority.sample(n=int(ratio*len(minority)), random_state=707)  # `random_state` ensures reproducibility
# Concatenate the minority class with the undersampled majority class
df_RUS_5 = pd.concat([majority_undersampled, minority])
# Shuffle the dataframe to prevent the model from learning any order
df_RUS_5 = df_RUS_5.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_RUS_5['fraud'].value_counts())
df_RUS_5.shape

fraud
1    8118
0    5412
Name: count, dtype: int64


(13530, 153)

In [79]:
X = df_RUS_5.drop(['fraud', 'rndrng_npi'], axis=1)
y = df_RUS_5['fraud']

# Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the neural network model
model = Sequential([
    Dense(32, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Predict probabilities for the test set
predictions = model.predict(X_test_scaled)

# Evaluate the model
print("ROC AUC Score:", roc_auc_score(y_test, predictions.flatten()))

# Adjust threshold and compute binary outcomes
threshold = 0.5730
predicted_classes = (predictions > threshold).astype(int)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
TN, FP, FN, TP = cm.ravel()

# Calculate TPR and TNR
TPR = TP / (TP + FN)  # True Positive Rate
TNR = TN / (TN + FP)  # True Negative Rate

# Calculate the G-Mean
G_mean = np.sqrt(TPR * TNR)

# Display TPR, TNR, and G-Mean
print("True Positive Rate (TPR):", TPR)
print("True Negative Rate (TNR):", TNR)
print("Geometric Mean (G-Mean):", G_mean)

# Detailed classification report
print(classification_report(y_test, predicted_classes))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5849 - loss: 0.7509 - val_accuracy: 0.6999 - val_loss: 0.5852
Epoch 2/50
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6781 - loss: 0.6264 - val_accuracy: 0.7285 - val_loss: 0.5521
Epoch 3/50
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6926 - loss: 0.5904 - val_accuracy: 0.7331 - val_loss: 0.5367
Epoch 4/50
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7139 - loss: 0.5695 - val_accuracy: 0.7424 - val_loss: 0.5269
Epoch 5/50
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7250 - loss: 0.5511 - val_accuracy: 0.7442 - val_loss: 0.5262
Epoch 6/50
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7275 - loss: 0.5541 - val_accuracy: 0.7498 - val_loss: 0.5182
Epoch 7/50
[1m305/305[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x203525ebcd0>

[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
ROC AUC Score: 0.8289850486692223
True Positive Rate (TPR): 0.7727832512315271
True Negative Rate (TNR): 0.7393715341959335
Geometric Mean (G-Mean): 0.7558928085806715
              precision    recall  f1-score   support

           0       0.68      0.74      0.71      1082
           1       0.82      0.77      0.79      1624

    accuracy                           0.76      2706
   macro avg       0.75      0.76      0.75      2706
weighted avg       0.76      0.76      0.76      2706



In [None]:
# ROS-RUS 1:1 majority class divided by 2
ratio = 1

# Oversample the minority class
majority_undersampled = majority.sample(n=int(len(majority) // 2), random_state=707) 
minority_oversampled = minority.sample(n=int(len(majority_undersampled)), replace=True, random_state=251)
# Concatenate the oversampled minority class with the majority class
df_ROS_1 = pd.concat([majority_undersampled, minority_oversampled])
# Shuffle the DataFrame to ensure random mixing
df_ROS_1 = df_ROS_1.sample(frac=1, random_state=42).reset_index(drop=True)
# Verify the number of instances in each class
print(df_ROS_1['fraud'].value_counts())
# Check the shape of the DataFrame
print(df_ROS_1.shape)
