In [None]:
!pip install tensorflow numpy sklearn

Collecting sklearn
  Downloading sklearn-0.0.post11.tar.gz (3.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('creditcard.csv')  # replace with the correct path to your CSV file

# Preprocess the data
# Assuming 'Time' column is not required and 'Amount' needs to be scaled
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
X = df.drop(['Class', 'Time'], axis=1)  # dropping 'Time' if not required
y = df['Class']

# Define the dimension of the latent space
latent_dim = 100  # for generating synthetic samples

# Build the Generator
def build_generator(latent_dim):
    model = Sequential()

    model.add(Dense(128, input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(X.shape[1], activation='tanh'))  # Output layer size must match the number of input features

    return model

# Build the Discriminator
def build_discriminator(data_shape):
    model = Sequential()

    model.add(Dense(512, input_dim=data_shape))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5), metrics=['accuracy'])
    return model

# GAN Utility Functions
def get_real_samples(batch_size):
    idx = np.random.randint(0, X.shape[0], batch_size)
    real_samples = X.iloc[idx].values
    real_labels = np.ones((batch_size, 1))  # Label for real samples is 1
    return real_samples, real_labels

def generate_latent_points(batch_size):
    return np.random.normal(0, 1, (batch_size, latent_dim))

def generate_fake_samples(generator, batch_size):
    latent_points = generate_latent_points(batch_size)
    fake_samples = generator.predict(latent_points)
    fake_labels = np.zeros((batch_size, 1))  # Label for fake samples is 0
    return fake_samples, fake_labels

# Assemble the GAN
generator = build_generator(latent_dim)
discriminator = build_discriminator(X.shape[1])
discriminator.trainable = False  # Make sure only the generator is trained within the GAN model
gan_input = Input(shape=(latent_dim,))
fake_samples = generator(gan_input)
gan_output = discriminator(fake_samples)
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

# Training the GAN
def train_gan(epochs, batch_size):
    for epoch in range(epochs):
        # Get randomly selected 'real' samples
        real_samples, real_labels = get_real_samples(batch_size // 2)
        # Generate 'fake' samples
        fake_samples, fake_labels = generate_fake_samples(generator, batch_size // 2)
        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_samples, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_samples, fake_labels)
        # Prepare points in the latent space as input for the generator
        latent_points = generate_latent_points(batch_size)
        # Labels for fake samples are 1 to fool the discriminator
        fake_labels = np.ones((batch_size, 1))
        # Train the GAN on the latent points
        g_loss = gan.train_on_batch(latent_points, fake_labels)

        # Output training progress
        print(f"Epoch {epoch+1}/{epochs} | D Loss Real: {d_loss_real[0]}, D Loss Fake: {d_loss_fake[0]}, G Loss: {g_loss}")

# Train GAN
train_gan(epochs=10000, batch_size=32)  # Adjust the number of epochs and batch size according to your requirements

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 7501/10000 | D Loss Real: 0.001584784360602498, D Loss Fake: 0.00426780479028821, G Loss: 6.901890754699707
Epoch 7502/10000 | D Loss Real: 0.27301472425460815, D Loss Fake: 0.03326919674873352, G Loss: 4.661462783813477
Epoch 7503/10000 | D Loss Real: 2.680545367184095e-05, D Loss Fake: 0.02973797544836998, G Loss: 5.7363786697387695
Epoch 7504/10000 | D Loss Real: 0.00010301560541847721, D Loss Fake: 0.025455862283706665, G Loss: 6.195477485656738
Epoch 7505/10000 | D Loss Real: 0.10097759962081909, D Loss Fake: 0.005876133684068918, G Loss: 5.477634906768799
Epoch 7506/10000 | D Loss Real: 2.1216308596194722e-05, D Loss Fake: 0.03923577070236206, G Loss: 5.9858903884887695
Epoch 7507/10000 | D Loss Real: 1.2204993709019618e-06, D Loss Fake: 0.009657837450504303, G Loss: 6.693577766418457
Epoch 7508/10000 | D Loss Real: 0.0006393740768544376, D Loss Fake: 0.010472842492163181, G Loss: 5.8941755294799805
Epoch 7509

In [None]:
# Assuming 'generator' is your pre-trained generator model from the GAN

# Number of synthetic samples to create
# This should be based on how much you want to balance the classes
num_synthetic_samples = 10000  # example number

# Generate synthetic transaction data
latent_points = generate_latent_points(num_synthetic_samples)
synthetic_data = generator.predict(latent_points)

# Convert synthetic data to a DataFrame
synthetic_data_df = pd.DataFrame(synthetic_data, columns=X.columns)

# Add a 'Class' column to the synthetic data, and set it to '1' for fraudulent transactions
synthetic_data_df['Class'] = 1  # Assuming you want to generate data for the minority class



In [None]:
# Concatenate the synthetic data with the original data
augmented_data = pd.concat([df, synthetic_data_df])

# Shuffle the augmented dataset
augmented_data = augmented_data.sample(frac=1).reset_index(drop=True)


In [None]:
from sklearn.impute import SimpleImputer

# Create an imputer object with a median filling strategy
imputer = SimpleImputer(strategy='median')

# Train on the training data
imputer.fit(X_train_aug)

# Transform both training and testing data
X_train_aug = imputer.transform(X_train_aug)
X_test_aug = imputer.transform(X_test_aug)

# Now, create and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)  # Increasing max_iter for convergence
lr_model.fit(X_train_aug, y_train_aug)

# Continue with prediction and AUPRC calculation...


In [None]:
# Check if any NaN values are present in the synthetic data
if np.isnan(synthetic_data).any():
    synthetic_data = np.nan_to_num(synthetic_data)  # Replace NaNs with 0 or use other strategies

# Convert synthetic data to a DataFrame as before
synthetic_data_df = pd.DataFrame(synthetic_data, columns=X.columns)
synthetic_data_df['Class'] = 1  # Assuming you want to generate data for the minority class

# Proceed with concatenation and model training...


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, average_precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Assuming generator is a pre-trained generator model from GAN
def generate_synthetic_data(generator, num_samples):
    latent_points = generate_latent_points(num_samples)
    synthetic_data = generator.predict(latent_points)
    return synthetic_data

# Assuming this is the correct path to your CSV file
df = pd.read_csv('creditcard.csv')

# Preprocess the data
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df = df.drop(['Time'], axis=1)  # Drop 'Time' if it's not needed

# Split the real data into features and target
X_real = df.drop('Class', axis=1)
y_real = df['Class']

# Generate synthetic data (equal to the number of real instances of the minority class)
num_fraud = y_real.sum()
synthetic_fraud = generate_synthetic_data(generator, num_fraud)

# Create DataFrame for synthetic data
synthetic_df = pd.DataFrame(synthetic_fraud, columns=X_real.columns)
synthetic_df['Class'] = 1  # All synthetic instances are considered fraudulent

# Concatenate the synthetic data with the original data
augmented_df = pd.concat([df, synthetic_df])

# Handling NaN values
imputer = SimpleImputer(strategy='mean')
X_augmented = imputer.fit_transform(augmented_df.drop('Class', axis=1))
y_augmented = augmented_df['Class'].values

# Split the augmented data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

# Retrain Logistic Regression model
lr_model = LogisticRegression(max_iter=10000)
lr_model.fit(X_train, y_train)

# Retrain Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

# Retrain XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Make predictions with Logistic Regression
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# Make predictions with Random Forest
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Make predictions with XGBoost
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate models
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# Calculate Average Precision Score
auprc_lr = average_precision_score(y_test, y_pred_proba_lr)
auprc_rf = average_precision_score(y_test, y_pred_proba_rf)
auprc_xgb = average_precision_score(y_test, y_pred_proba_xgb)

print(f"AUPRC (Logistic Regression): {auprc_lr}")
print(f"AUPRC (Random Forest): {auprc_rf}")
print(f"AUPRC (XGBoost): {auprc_xgb}")

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.87      0.29      0.44       199

    accuracy                           1.00     57060
   macro avg       0.93      0.65      0.72     57060
weighted avg       1.00      1.00      1.00     57060

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.97      0.87      0.92       199

    accuracy                           1.00     57060
   macro avg       0.98      0.93      0.96     57060
weighted avg       1.00      1.00      1.00     57060

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.98      0.89      0.93       199

    accuracy                           1.00     57060
   macro avg     

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, average_precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import tensorflow as tf

# Setup for GPU usage, if available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)

# Define the generator architecture
def build_generator(latent_dim, output_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=latent_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(output_dim, activation='tanh'))
    return model

# Function to generate latent points
def generate_latent_points(latent_dim, num_samples):
    return np.random.normal(0, 1, (num_samples, latent_dim))

# Function to generate synthetic data
def generate_synthetic_data(generator, latent_dim, num_samples):
    latent_points = generate_latent_points(latent_dim, num_samples)
    synthetic_data = generator.predict(latent_points)
    return synthetic_data

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load real dataset
df = pd.read_csv('creditcard.csv')  # Replace with your actual path

# Preprocess the data
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df = df.drop('Time', axis=1)

# Define latent space dimension and instantiate generator
latent_dim = 100
output_dim = df.shape[1] - 1  # Assuming 'Class' is the last column
generator = build_generator(latent_dim, output_dim)

# Assume generator is pre-trained here and weights are loaded
# generator.load_weights('path_to_generator_weights.h5')

# Determine how many synthetic samples to create
required_synthetic_rows = max(200000 - df.shape[0], 0)
if required_synthetic_rows > 0:
    # Generate synthetic data
    synthetic_data = generate_synthetic_data(generator, latent_dim, required_synthetic_rows)
    synthetic_df = pd.DataFrame(synthetic_data, columns=df.columns[:-1])  # Exclude 'Class' column
    synthetic_df['Class'] = 1  # Assuming you're generating fraudulent transactions
    df = pd.concat([df, synthetic_df])

# Handle NaN values in the dataset
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(df.drop('Class', axis=1))
y = df['Class'].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate the models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, predictions))
    auprc_score = average_precision_score(y_test, proba)
    print(f'AUPRC: {auprc_score}\n')
    return auprc_score

print("Logistic Regression Results:")
evaluate_model(lr_model, X_test, y_test)

print("Random Forest Results:")
evaluate_model(rf_model, X_test, y_test)

print("XGBoost Results:")
evaluate_model(xgb_model, X_test, y_test)


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, average_precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('creditcard.csv')  # replace with the correct path to your CSV file

# Preprocess the data
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df.drop(['Time'], axis=1, inplace=True)  # Drop 'Time' if it's not needed

# Assuming the generator has been trained already and is available as 'generator'
def generate_synthetic_data(generator, num_samples, latent_dim):
    # Sample random points in the latent space
    random_latent_vectors = np.random.normal(0, 1, size=(num_samples, latent_dim))
    # Generate the synthetic samples
    synthetic_data = generator.predict(random_latent_vectors)
    return synthetic_data

# Define the dimension of the latent space
latent_dim = 100  # This should match the latent space dimensionality of your GAN generator

# Determine the number of synthetic samples to create
target_row_count = 200000
current_row_count = df.shape[0]
num_synthetic_samples = target_row_count - current_row_count

# Generate synthetic data
if num_synthetic_samples > 0:
    synthetic_samples = generate_synthetic_data(generator, num_synthetic_samples, latent_dim)
    synthetic_df = pd.DataFrame(synthetic_samples, columns=df.columns[:-1])
    synthetic_df['Class'] = 1  # Assuming the synthetic samples are fraudulent
    df = pd.concat([df, synthetic_df])

# Impute any NaN values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(df.drop('Class', axis=1))
y = df['Class'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
auprc_lr = average_precision_score(y_test, lr_model.predict_proba(X_test)[:, 1])

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
auprc_rf = average_precision_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# Train XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
auprc_xgb = average_precision_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

# Output the classification report for each model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
print(f"Logistic Regression AUPRC: {auprc_lr}")

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print(f"Random Forest AUPRC: {auprc_rf}")

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print(f"XGBoost AUPRC: {auprc_xgb}")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.58      0.70        98

    accuracy                           1.00     56962
   macro avg       0.93      0.79      0.85     56962
weighted avg       1.00      1.00      1.00     56962

Logistic Regression AUPRC: 0.7608664845090544

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.80      0.88        98

    accuracy                           1.00     56962
   macro avg       0.99      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Random Forest AUPRC: 0.8674181721484885

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.80     