In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Load preprocessed data
df = pd.read_csv('preprocessed_data_with_fraud.csv')

# Define features and target
X = df.drop('fraud', axis=1)
y = df['fraud']

# Define GAN components
def build_generator(latent_dim, input_dim):
    model = Sequential([
        Dense(128, input_dim=latent_dim),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(256),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(512),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(1024),
        LeakyReLU(alpha=0.2),
        BatchNormalization(),
        Dense(input_dim, activation='tanh')  # Output layer with same shape as input
    ])
    return model

def build_discriminator(input_dim):
    model = Sequential([
        Dense(1024, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    return model

def build_gan(generator, discriminator):
    discriminator.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss='binary_crossentropy', metrics=['accuracy'])
    gan = Sequential([generator, discriminator])
    gan.compile(optimizer=Adam(learning_rate=0.0002, beta_1=0.5), loss='binary_crossentropy')
    return gan

# Hyperparameters
latent_dim = 100
input_dim = X.shape[1]
batch_size = 64
epochs = 10

# Build GAN
generator = build_generator(latent_dim, input_dim)
discriminator = build_discriminator(input_dim)
gan = build_gan(generator, discriminator)

# Training GAN
def train_gan(generator, discriminator, gan, X_train, epochs, batch_size):
    for epoch in range(epochs):
        # Generate fake data
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        generated_data = generator.predict(noise)
        
        # Train discriminator
        real_data = X_train[np.random.randint(0, X_train.shape[0], batch_size)]
        labels_real = np.ones((batch_size, 1))
        labels_fake = np.zeros((batch_size, 1))
        
        d_loss_real = discriminator.train_on_batch(real_data, labels_real)
        d_loss_fake = discriminator.train_on_batch(generated_data, labels_fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # Train generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        labels_gan = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, labels_gan)
        
        print(f"Epoch {epoch+1}/{epochs} | D Loss: {d_loss[0]} | D Accuracy: {100*d_loss[1]} | G Loss: {g_loss}")

# Prepare data
X_scaled = StandardScaler().fit_transform(X)  # Ensure that the data is scaled
X_train, _ = train_test_split(X_scaled, test_size=0.2, random_state=42)

# Train GAN
train_gan(generator, discriminator, gan, X_train, epochs, batch_size)

# Generate synthetic data
def generate_synthetic_data(generator, latent_dim, num_samples):
    noise = np.random.normal(0, 1, (num_samples, latent_dim))
    synthetic_data = generator.predict(noise)
    return synthetic_data

synthetic_data = generate_synthetic_data(generator, latent_dim, 1000)

# Prepare synthetic data DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=X.columns)

# Combine real and synthetic data
combined_df = pd.concat([X, synthetic_df], axis=0, ignore_index=True)
combined_df['fraud'] = pd.concat([y.reset_index(drop=True), pd.Series([0]*synthetic_data.shape[0])], axis=0).reset_index(drop=True)

# Save combined dataset
combined_df.to_csv('combined_data.csv', index=False)

# Load combined dataset
df_combined = pd.read_csv('combined_data.csv')

# Define features and target
X_combined = df_combined.drop('fraud', axis=1)
y_combined = df_combined['fraud']

# Split the dataset into training and testing sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Initialize models
log_reg = LogisticRegression()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

# Fit models
log_reg.fit(X_train_combined, y_train_combined)
rf.fit(X_train_combined, y_train_combined)
gb.fit(X_train_combined, y_train_combined)

# Predict and evaluate
log_reg_preds = log_reg.predict(X_test_combined)
rf_preds = rf.predict(X_test_combined)
gb_preds = gb.predict(X_test_combined)

# Calculate accuracy
log_reg_acc = accuracy_score(y_test_combined, log_reg_preds)
rf_acc = accuracy_score(y_test_combined, rf_preds)
gb_acc = accuracy_score(y_test_combined, gb_preds)

print(f"Logistic Regression Accuracy: {log_reg_acc}")
print(f"Random Forest Accuracy: {rf_acc}")
print(f"Gradient Boosting Accuracy: {gb_acc}")

# Assign weights based on accuracy
model_weights = {
    'log_reg': log_reg_acc,
    'rf': rf_acc,
    'gb': gb_acc
}

# Create Voting Classifier with weighted voting
voting_clf = VotingClassifier(
    estimators=[
        ('log_reg', log_reg),
        ('rf', rf),
        ('gb', gb)
    ],
    voting='soft',
    weights=[model_weights['log_reg'], model_weights['rf'], model_weights['gb']]
)

# Fit the voting classifier
voting_clf.fit(X_train_combined, y_train_combined)

# Predict and evaluate with the ensemble
voting_preds = voting_clf.predict(X_test_combined)
ensemble_acc = accuracy_score(y_test_combined, voting_preds)

print(f"Ensemble Accuracy: {ensemble_acc}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step  
Epoch 1/10 | D Loss: 0.6874889135360718 | D Accuracy: 49.21875 | G Loss: [array(0.69501406, dtype=float32), array(0.69501406, dtype=float32), array(0.34375, dtype=float32)]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Epoch 2/10 | D Loss: 0.6296324133872986 | D Accuracy: 49.21875 | G Loss: [array(0.6440164, dtype=float32), array(0.6440164, dtype=float32), array(0.421875, dtype=float32)]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
Epoch 3/10 | D Loss: 0.6048415899276733 | D Accuracy: 49.270832538604736 | G Loss: [array(0.61787885, dtype=float32), array(0.61787885, dtype=float32), array(0.44791666, dtype=float32)]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step  
Epoch 4/10 | D Loss: 0.5910701155662537 | D Accuracy: 49.38616156578064 | G Loss: [array(0.6018941, dtype=float32), array(0.6018941, dtype=float32), array(0.4609375, dtype=float32)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.995317160372684
Random Forest Accuracy: 0.9966773212602862
Gradient Boosting Accuracy: 0.996036102556131


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Ensemble Accuracy: 0.9962692729940056
