**Load Libraries**

In [197]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers, models, Input
from tensorflow.keras.optimizers import Adam

**Load Data**

In [198]:
# Load the dataset
database = pd.read_csv("BANKACCOUNTDATA.csv")

# Clean column names by stripping extra spaces
database.columns = database.columns.str.strip()

# Cleaning specific columns:

# 1. Clean and convert numeric columns to numeric, filling NaNs with 0
numeric_columns = ['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']
for col in numeric_columns:
    # Remove commas, strip spaces, convert to numeric, and fill NaNs with 0
    database[col] = pd.to_numeric(database[col].str.replace(",", "").str.strip(), errors='coerce').fillna(0)

# 2. Clean string columns by stripping spaces and filling NaNs with empty strings
string_columns = ['Account No', 'DATE', 'TRANSACTION DETAILS', 'CHQ.NO.', 'VALUE DATE']
for col in string_columns:
    # Strip spaces and fill NaNs with empty strings
    database[col] = database[col].astype(str).str.strip().fillna('')

# Display the cleaned DataFrame information to ensure all NaNs are handled
print("DataFrame after cleaning and filling NaN values:")
print(database.info())

DataFrame after cleaning and filling NaN values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116201 entries, 0 to 116200
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Account No           116201 non-null  object 
 1   DATE                 116201 non-null  object 
 2   TRANSACTION DETAILS  116201 non-null  object 
 3   CHQ.NO.              116201 non-null  object 
 4   VALUE DATE           116201 non-null  object 
 5   WITHDRAWAL AMT       116201 non-null  float64
 6   DEPOSIT AMT          116201 non-null  float64
 7   BALANCE AMT          116201 non-null  float64
 8   .                    116201 non-null  object 
dtypes: float64(3), object(6)
memory usage: 8.0+ MB
None


**GAN**
1) Generator (Nueral Net that creates what it thinks is replicated data)
2) Discriminator (Nueral Net that is fed both the real and fake data and chooses which one is the most realistis)
3) Real/Fake (Sends Back Propogration to the Nueral Nets to edit them depending on the outcome of the Discriminator)

**Helper Functions**

In [199]:
# Generate Noise for Generator
def latent_dim():
    #Generate function to optimize randomness
    return 10

#alpha function for LeakyReLU paramater
def alpha():
    #Generate function to optimize dying nuerons
    return 0.2

#input shape function for discriminator
def inputShape(dataFrame):
    return 3

#drop paramater for discriminator
def dropout():
    return 0.3

#Define batch size
def batch():
    return 32

#Data values
def meanIncome(dataFrame):
    return dataFrame["DEPOSIT AMT"].mean()

def stddevIncome(dataFrame):
    return dataFrame["DEPOSIT AMT"].std()

def meanExpense(dataFrame):
    return dataFrame["WITHDRAWAL AMT"].mean()

def stddevExpense(dataFrame):
    return dataFrame["WITHDRAWAL AMT "].std()

**Generator**

In [200]:
# Generator Function
def generator(noise):
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(noise,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=alpha()))
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=alpha()))
    model.add(layers.Dense(3, activation='tanh'))
    return model


**Discriminator**

In [201]:
# Discriminator Function
def discriminator(inputShape):
    model = models.Sequential()
    model.add(layers.Dense(256, activation='relu', input_shape=(inputShape,)))
    model.add(layers.LeakyReLU(alpha=alpha()))
    model.add(layers.Dropout(dropout()))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.LeakyReLU(alpha=alpha()))
    model.add(layers.Dropout(dropout()))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model
    

**Defining GAN Model**

In [202]:
# Initialize models
generator_model = generator(latent_dim())
discriminator_model = discriminator(inputShape(database))

# Compile the discriminator
discriminator_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Freeze the discriminator weights before compiling the GAN
discriminator_model.trainable = False

# Define the GAN input
gan_input = Input(shape=(latent_dim(),))

# Pass the input through the generator to create fake data
generated_data = generator_model(gan_input)

# Pass the generated data through the frozen discriminator
gan_output = discriminator_model(generated_data)

# Create the GAN model with the generator and discriminator
gan = models.Model(gan_input, gan_output)

# Compile the GAN with binary crossentropy loss
gan.compile(optimizer=Adam(), loss='binary_crossentropy')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


**Generate Real Data**

In [203]:
def generateRealData(batch_size):
    # Randomly sample data points from the cleaned DataFrame
    sampled_data = database[['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']].sample(n=batch_size, replace=True).values
    return sampled_data

**Train GAN**

In [204]:
# Function to train the GAN
def train_gan(generator, discriminator, gan, latent_dim, epochs, batch_size):
    # Training loop
    for epoch in range(epochs):
        # Generate 'Real Data'
        real_data = generateRealData(batch_size)
        real_labels = np.ones((batch_size, 1))

        # Generate Fake Data
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        fake_data = generator.predict(noise)
        fake_labels = np.zeros((batch_size, 1))

        # Train discriminator
        discriminator_loss_real = discriminator.train_on_batch(real_data, real_labels)
        discriminator_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        discriminator_loss = 0.5 * np.add(discriminator_loss_real, discriminator_loss_fake)

        # Train generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        misleading_labels = np.ones((batch_size, 1))  # Labels for generator training
        generator_loss = gan.train_on_batch(noise, misleading_labels)

        # Print progress
        if epoch % 10 == 0:  # Adjust frequency of output
            print(f"Epoch {epoch}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")

# Run training
train_gan(generator_model, discriminator_model, gan, latent_dim(), 100, batch())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step




Epoch 0, Discriminator Loss: [7.910794e+07 1.406250e-01], Generator Loss: 0.6716089248657227
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Epoch 10, Discriminator Loss: [3.9832216e+07 1.9335091e-01], Generator Loss: 0.6538519263267517
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━

In [205]:
# Function to generate new synthetic samples using the trained generator
def generate_samples(generator: generator_model, num_samples: batch(), latent_dim: latent_dim()):
    """
    Generate new synthetic samples using the trained generator model.

    :param generator: The trained generator model.
    :param num_samples: The number of new samples to generate.
    :param latent_dim: The size of the latent dimension used during training.
    :return: Generated synthetic data samples as a NumPy array.
    """
    # Step 1: Generate random noise as input for the generator
    # The noise vector shape should be (num_samples, latent_dim)
    noise = np.random.normal(0, 1, (num_samples, latent_dim))

    # Step 2: Use the generator to produce synthetic data
    # This step uses the trained generator to make predictions based on the noise
    generated_data = generator.predict(noise)

    # Step 3: (Optional) Post-process the data if needed, such as scaling or formatting
    # For example, you might scale back if your data was normalized during training
    # If no post-processing is needed, this step can be skipped.

    # Return the generated samples
    return generated_data

# Example usage of the generate_samples function:
# Assuming `generator_model` is your trained generator model

num_samples = 10  # Number of new samples you want to generate
latent_dim = latent_dim()  # Latent dimension size used during training

# Generate new samples
new_samples = generate_samples(generator_model, num_samples, latent_dim)

# Display the generated synthetic data
print("Generated Synthetic Data:")
print(new_samples)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Generated Synthetic Data:
[[ 0.7954468  -0.827925    0.41015968]
 [ 0.63151795 -0.7489342   0.19100927]
 [ 0.85412574 -0.8739625   0.40855506]
 [ 0.82052225 -0.76475745  0.36820918]
 [ 0.32032862 -0.42092893  0.2122273 ]
 [-0.48147777  0.2903799   0.3375616 ]
 [-0.39695725  0.20298928  0.24062414]
 [ 0.4267511  -0.56318694  0.37383512]
 [-0.18938375  0.2304606   0.3345723 ]
 [ 0.38217008 -0.55908465  0.31596196]]
