In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Pull the two Kaggle data sources through kagglehub; the value returned by each
# call is kept under its `*_path` name.
organizations_mlg_ulb_creditcardfraud_path = kagglehub.dataset_download(
    'organizations/mlg-ulb/creditcardfraud'
)
kartik2112_fraud_detection_on_paysim_dataset_path = kagglehub.notebook_output_download(
    'kartik2112/fraud-detection-on-paysim-dataset'
)

print('Data source import complete.')


In [None]:



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, accuracy_score)


#Load Dataset
# The hard-coded location below only exists inside a Kaggle kernel. When it is
# missing, fall back to the directory stored by the kagglehub import cell
# (`organizations_mlg_ulb_creditcardfraud_path`), if that cell has been run.
DATA_PATH = '../input/creditcardfraud/creditcard.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    _kagglehub_dir = globals().get('organizations_mlg_ulb_creditcardfraud_path')
    if _kagglehub_dir is None:
        raise  # no alternative location is known; surface the original error
    # assumes creditcard.csv sits at the top level of the download directory — TODO confirm
    DATA_PATH = f"{_kagglehub_dir}/creditcard.csv"
    df = pd.read_csv(DATA_PATH)

print("Dataset Loaded.")
print("Shape of Data:", df.shape)
print(df.head())

# Quick sanity checks: per-column null counts and the label balance.
print("\n--- Checking for Missing Values ---")
print(df.isnull().sum())

print("\n--- Class Distribution (Fraud=1, Legit=0) ---")
print(df['Class'].value_counts())


# Separate the feature columns from the binary target column.
X = df.drop('Class', axis=1)
y = df['Class'].values

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Learn the scaling statistics from the training rows only, then reuse them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any code expecting the full scaled matrix still finds it; it is now
# produced with train-only statistics.
X_scaled = scaler.transform(X)

print("\n--- Data Split ---")
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)


# Seed the Python, NumPy and TensorFlow RNGs so repeated runs start from the
# same state (some GPU ops may still be nondeterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Model A: small fully connected network with a single sigmoid output.
# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which newer Keras releases warn about.
modelA = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # binary classification => use sigmoid
])

modelA.compile(optimizer=Adam(learning_rate=0.001),
               loss='binary_crossentropy',
               metrics=['accuracy'])

EPOCHS = 500       # upper bound only; early stopping below may end training sooner
BATCH_SIZE = 256

# Stop once validation loss has not improved for `patience` epochs and roll the
# weights back to the best epoch, instead of always running all EPOCHS.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

print("\n--- Training Model A (Neural Net) ---")
historyA = modelA.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,  # 10% of train set used for validation
    callbacks=[early_stop],
    verbose=1
)


# Training curves for Model A: loss in the left panel, accuracy in the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))
curves = historyA.history

loss_ax.plot(curves['loss'], label='Train Loss')
loss_ax.plot(curves['val_loss'], label='Val Loss')
loss_ax.set_title('Model A - Loss Over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(curves['accuracy'], label='Train Accuracy')
acc_ax.plot(curves['val_accuracy'], label='Val Accuracy')
acc_ax.set_title('Model A - Accuracy Over Epochs')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()


# Score Model A on the test split: raw probabilities, then 0/1 labels cut at 0.5.
pred_probs_A = modelA.predict(X_test)
pred_A = np.where(pred_probs_A > 0.5, 1, 0).astype("int32")

# Headline metrics: accuracy uses the hard labels, AUC uses the probabilities.
accA = accuracy_score(y_test, pred_A)
aucA = roc_auc_score(y_test, pred_probs_A)
print(f"\nModel A (Neural Net) - Accuracy: {accA:.4f}")
print(f"Model A (Neural Net) - AUC: {aucA:.4f}")

# Per-class breakdown.
cmA = confusion_matrix(y_test, pred_A)
print("\nConfusion Matrix (Model A):", cmA, sep="\n")
print("\nClassification Report (Model A):",
      classification_report(y_test, pred_A, digits=4),
      sep="\n")

# Model B: logistic regression fitted on the same training split as Model A.
modelB = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is taken as the fraud probability.
pred_probs_B = modelB.predict_proba(X_test)[:, 1]
pred_B = modelB.predict(X_test)

accB = accuracy_score(y_test, pred_B)
aucB = roc_auc_score(y_test, pred_probs_B)
print(f"\nModel B (Logistic Reg) - Accuracy: {accB:.4f}")
print(f"Model B (Logistic Reg) - AUC: {aucB:.4f}")

cmB = confusion_matrix(y_test, pred_B)
print("\nConfusion Matrix (Model B):", cmB, sep="\n")
print("\nClassification Report (Model B):",
      classification_report(y_test, pred_B, digits=4),
      sep="\n")


# Ensemble: plain average of the two models' fraud probabilities, cut at 0.5.
nn_probs_1d = pred_probs_A.ravel()  # flatten Model A's output so it lines up with Model B's
ensemble_prob = (nn_probs_1d + pred_probs_B) / 2.0
ensemble_pred = np.greater(ensemble_prob, 0.5).astype("int32")

accE = accuracy_score(y_test, ensemble_pred)
aucE = roc_auc_score(y_test, ensemble_prob)
print(f"\nEnsemble Model - Accuracy: {accE:.4f}")
print(f"Ensemble Model - AUC: {aucE:.4f}")

cmE = confusion_matrix(y_test, ensemble_pred)
print("\nConfusion Matrix (Ensemble):", cmE, sep="\n")
print("\nClassification Report (Ensemble):",
      classification_report(y_test, ensemble_pred, digits=4),
      sep="\n")


# Annotated heatmap of the ensemble confusion matrix.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cmE, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Ensemble Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


print("\n--- Dummy Transaction Demo ---")
# Feature columns in the order they appear in df (everything except the label).
training_cols = df.columns.drop('Class')

# Example values: fill them realistically or test extremes
dummy_tx = {
    "Time": 50000,    # if you didn't drop Time
    "V1": -1.2, "V2": 2.5, "V3": 1.1, "V4": 0.5,
    "V5": -0.7, "V6": 2.0, "V7": 0.1, "V8": -0.3,
    "V9": 0.8, "V10": -1.5, "V11": 2.2, "V12": -0.9,
    "V13": 0.6, "V14": -1.0, "V15": 0.2, "V16": -0.3,
    "V17": 0.1, "V18": 0.8, "V19": -0.2, "V20": 1.0,
    "V21": 0.0, "V22": -2.1, "V23": 1.3, "V24": 0.5,
    "V25": -0.2, "V26": 0.7, "V27": -1.1, "V28": 0.4,
    "Amount": 300.0,
}

# One-row frame with its columns put in the same order as training_cols.
dummy_df = pd.DataFrame([dummy_tx]).loc[:, training_cols]

# Reuse the already-fitted scaler; do not refit on the single row.
dummy_scaled = scaler.transform(dummy_df)

# Fraud probability from each model for this one row.
probA_dummy = modelA.predict(dummy_scaled)[0][0]
probB_dummy = modelB.predict_proba(dummy_scaled)[0, 1]

# Ensemble = simple mean of the two probabilities.
ensemble_prob_dummy = (probA_dummy + probB_dummy)/2.0

for caption, value in (
    ("Neural Net Fraud Probability:", probA_dummy),
    ("Logistic Reg Fraud Probability:", probB_dummy),
    ("Ensemble Probability:", ensemble_prob_dummy),
):
    print(caption, value)

verdict = (
    "Ensemble Decision: FRAUDULENT Transaction\n"
    if ensemble_prob_dummy > 0.5
    else "Ensemble Decision: LEGITIMATE Transaction\n"
)
print(verdict)

print(
    "\n--- Done! ---\n",
    "Observations:",
    "1) We trained two models (NN & Logistic) and compared their AUC & accuracy.",
    "2) We formed an ensemble by averaging predicted fraud probabilities.",
    sep="\n",
)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, accuracy_score)


#Load Dataset
# The hard-coded location below only exists inside a Kaggle kernel. When it is
# missing, fall back to the directory stored by the kagglehub import cell
# (`organizations_mlg_ulb_creditcardfraud_path`), if that cell has been run.
DATA_PATH = '../input/creditcardfraud/creditcard.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    _kagglehub_dir = globals().get('organizations_mlg_ulb_creditcardfraud_path')
    if _kagglehub_dir is None:
        raise  # no alternative location is known; surface the original error
    # assumes creditcard.csv sits at the top level of the download directory — TODO confirm
    DATA_PATH = f"{_kagglehub_dir}/creditcard.csv"
    df = pd.read_csv(DATA_PATH)

print("Dataset Loaded.")
print("Shape of Data:", df.shape)
print(df.head())

# Quick sanity checks: per-column null counts and the label balance.
print("\n--- Checking for Missing Values ---")
print(df.isnull().sum())

print("\n--- Class Distribution (Fraud=1, Legit=0) ---")
print(df['Class'].value_counts())


# Separate the feature columns from the binary target column.
X = df.drop('Class', axis=1)
y = df['Class'].values


# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Scale the data: statistics come from the training rows only and are then
# reused for the test rows and for the full-table preview.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)


# Preview of the scaled feature table, with the original column names restored.
processed_df = pd.DataFrame(X_scaled, columns=X.columns)
print("\n--- SCALED DATA PREVIEW (first 5 rows) ---")
print(processed_df.head())


print("\n--- Data Split ---")
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)


# Model A: Neural Network
# Seed the Python, NumPy and TensorFlow RNGs so repeated runs start from the
# same state (some GPU ops may still be nondeterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# The input shape is declared with an explicit Input object rather than the
# `input_shape=` layer argument, which newer Keras releases warn about.
modelA = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # binary classification => use sigmoid
])

modelA.compile(optimizer=Adam(learning_rate=0.001),
               loss='binary_crossentropy',
               metrics=['accuracy'])

EPOCHS = 500       # upper bound only; early stopping below may end training sooner
BATCH_SIZE = 256

# Stop once validation loss has not improved for `patience` epochs and roll the
# weights back to the best epoch, instead of always running all EPOCHS.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

print("\n--- Training Model A (Neural Net) ---")
historyA = modelA.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# Training curves for Model A: loss in the left panel, accuracy in the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))
curves = historyA.history

loss_ax.plot(curves['loss'], label='Train Loss')
loss_ax.plot(curves['val_loss'], label='Val Loss')
loss_ax.set_title('Model A - Loss Over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(curves['accuracy'], label='Train Accuracy')
acc_ax.plot(curves['val_accuracy'], label='Val Accuracy')
acc_ax.set_title('Model A - Accuracy Over Epochs')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

# Score Model A on the test split: raw probabilities, then 0/1 labels cut at 0.5.
pred_probs_A = modelA.predict(X_test)
pred_A = np.where(pred_probs_A > 0.5, 1, 0).astype("int32")

# Headline metrics: accuracy uses the hard labels, AUC uses the probabilities.
accA = accuracy_score(y_test, pred_A)
aucA = roc_auc_score(y_test, pred_probs_A)
print(f"\nModel A (Neural Net) - Accuracy: {accA:.4f}")
print(f"Model A (Neural Net) - AUC: {aucA:.4f}")

# Per-class breakdown.
cmA = confusion_matrix(y_test, pred_A)
print("\nConfusion Matrix (Model A):", cmA, sep="\n")
print("\nClassification Report (Model A):",
      classification_report(y_test, pred_A, digits=4),
      sep="\n")


# Model B: Logistic Regression, fitted on the same training split as Model A.
modelB = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is taken as the fraud probability.
pred_probs_B = modelB.predict_proba(X_test)[:, 1]
pred_B = modelB.predict(X_test)

accB = accuracy_score(y_test, pred_B)
aucB = roc_auc_score(y_test, pred_probs_B)
print(f"\nModel B (Logistic Reg) - Accuracy: {accB:.4f}")
print(f"Model B (Logistic Reg) - AUC: {aucB:.4f}")

cmB = confusion_matrix(y_test, pred_B)
print("\nConfusion Matrix (Model B):", cmB, sep="\n")
print("\nClassification Report (Model B):",
      classification_report(y_test, pred_B, digits=4),
      sep="\n")


# Ensemble: plain average of the two models' fraud probabilities, cut at 0.5.
nn_probs_1d = pred_probs_A.ravel()  # flatten Model A's output so it lines up with Model B's
ensemble_prob = (nn_probs_1d + pred_probs_B) / 2.0
ensemble_pred = np.greater(ensemble_prob, 0.5).astype("int32")

accE = accuracy_score(y_test, ensemble_pred)
aucE = roc_auc_score(y_test, ensemble_prob)
print(f"\nEnsemble Model - Accuracy: {accE:.4f}")
print(f"Ensemble Model - AUC: {aucE:.4f}")

cmE = confusion_matrix(y_test, ensemble_pred)
print("\nConfusion Matrix (Ensemble):", cmE, sep="\n")
print("\nClassification Report (Ensemble):",
      classification_report(y_test, ensemble_pred, digits=4),
      sep="\n")

# Annotated heatmap of the ensemble confusion matrix.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cmE, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Ensemble Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()


print("\n--- Dummy Transaction Demo ---")
# Feature columns in the order they appear in df (everything except the label).
training_cols = df.columns.drop('Class')

# Hand-written example transaction covering every feature column.
dummy_tx = {
    "Time": 50000,
    "V1": -1.2, "V2": 2.5, "V3": 1.1, "V4": 0.5,
    "V5": -0.7, "V6": 2.0, "V7": 0.1, "V8": -0.3,
    "V9": 0.8, "V10": -1.5, "V11": 2.2, "V12": -0.9,
    "V13": 0.6, "V14": -1.0, "V15": 0.2, "V16": -0.3,
    "V17": 0.1, "V18": 0.8, "V19": -0.2, "V20": 1.0,
    "V21": 0.0, "V22": -2.1, "V23": 1.3, "V24": 0.5,
    "V25": -0.2, "V26": 0.7, "V27": -1.1, "V28": 0.4,
    "Amount": 300.0,
}

# One-row frame with its columns put in the same order as training_cols.
dummy_df = pd.DataFrame([dummy_tx]).loc[:, training_cols]

# Reuse the already-fitted scaler; do not refit on the single row.
dummy_scaled = scaler.transform(dummy_df)

# Fraud probability from each model for this one row.
probA_dummy = modelA.predict(dummy_scaled)[0][0]
probB_dummy = modelB.predict_proba(dummy_scaled)[0, 1]

# Ensemble = simple mean of the two probabilities.
ensemble_prob_dummy = (probA_dummy + probB_dummy)/2.0

for caption, value in (
    ("Neural Net Fraud Probability:", probA_dummy),
    ("Logistic Reg Fraud Probability:", probB_dummy),
    ("Ensemble Probability:", ensemble_prob_dummy),
):
    print(caption, value)

verdict = (
    "Ensemble Decision: FRAUDULENT Transaction\n"
    if ensemble_prob_dummy > 0.5
    else "Ensemble Decision: LEGITIMATE Transaction\n"
)
print(verdict)

print(
    "\n--- Done! ---\n",
    "Observations:",
    "1) We trained two models (NN & Logistic) and compared their AUC & accuracy.",
    "2) We formed an ensemble by averaging predicted fraud probabilities.",
    sep="\n",
)

