In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load and preprocess the data
df = pd.read_csv('combined_output.csv')
df2 = df.drop(' Label', axis=1)
df2 = df2.dropna()

# Replace infinite values with NaN and drop rows with NaNs
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2_cleaned = df2.dropna()

# Select only numeric columns, keeping 'Label Num' for later comparison
numeric_df = df2_cleaned.select_dtypes(include=[np.number])

# Extract the 'Label Num' for the ground truth comparison later
labels = numeric_df['Label Num']

# Separate majority (Label Num = 0) and minority (Label Num = 1) classes
majority_data = numeric_df[numeric_df['Label Num'] == 0].drop('Label Num', axis=1)
minority_data = numeric_df[numeric_df['Label Num'] == 1].drop('Label Num', axis=1)

# Convert to NumPy array and scale the data
scaler = StandardScaler()
X_majority_scaled = scaler.fit_transform(majority_data)
X_minority_scaled = scaler.transform(minority_data)

# Split the data into training and testing sets
X_train_majority, X_test_majority = train_test_split(X_majority_scaled, test_size=0.2, random_state=42)
X_train_minority, X_test_minority = train_test_split(X_minority_scaled, test_size=0.2, random_state=42)

# Define GAN architecture for minority class data generation
latent_dim = 100

# Generator
generator = Sequential([
    Dense(128, input_dim=latent_dim, activation='relu'),
    Dense(X_train_minority.shape[1], activation='tanh')
])

# Discriminator
discriminator = Sequential([
    Dense(128, input_dim=X_train_minority.shape[1], activation='relu'),
    Dense(1, activation='sigmoid')
])
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# GAN combining generator and discriminator
discriminator.trainable = False
gan = Sequential([generator, discriminator])
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Training the GAN
epochs = 1000
batch_size = 32

for epoch in range(epochs):
    # Generate noise and create synthetic minority samples
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    synthetic_samples = generator.predict(noise, verbose=0)

    # Sample real minority data
    idx = np.random.randint(0, X_train_minority.shape[0], batch_size)
    real_samples = X_train_minority[idx]

    # Train discriminator
    discriminator_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)))
    discriminator_loss_fake = discriminator.train_on_batch(synthetic_samples, np.zeros((batch_size, 1)))

    # Train generator
    gan_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

    if epoch % 100 == 0:
        print(f"Epoch {epoch}/{epochs} | Discriminator Loss Real: {discriminator_loss_real[0]}, Fake: {discriminator_loss_fake[0]} | GAN Loss: {gan_loss}")

# Generate synthetic minority samples
num_synthetic_samples = X_train_majority.shape[0] - X_train_minority.shape[0]  # Generate enough to balance classes
noise = np.random.normal(0, 1, size=(num_synthetic_samples, latent_dim))
synthetic_minority_samples = generator.predict(noise, verbose=0)

# Combine majority class, real minority, and synthetic minority samples for clustering
X_train_combined = np.vstack([X_train_majority, X_train_minority, synthetic_minority_samples])
y_train_combined = np.hstack([np.zeros(len(X_train_majority)), np.ones(len(X_train_minority) + num_synthetic_samples)])

# Define the pipeline with KMeans
pipeline = Pipeline([
    ('kmeans', KMeans())
])

# Setup the grid search parameters for KMeans
param_grid = {
    'kmeans__n_clusters': [5, 7, 9, 11],
    'kmeans__n_init': [10, 20],
    'kmeans__algorithm': ['lloyd', 'full', 'elkan']
}

# Davies-Bouldin score as the scoring metric
def davies_bouldin_scorer(estimator, X):
    labels = estimator.predict(X)
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score

# Initialize the grid search for KMeans clustering
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=davies_bouldin_scorer,
    cv=5,
)

# Perform grid search on the combined dataset
grid_search.fit(X_train_combined)

# Make predictions on the test set (unmodified test set)
X_test_combined = np.vstack([X_test_majority, X_test_minority])
y_pred = grid_search.best_estimator_.predict(X_test_combined)

# Calculate the Davies-Bouldin score for the test set
db_score_test = davies_bouldin_score(X_test_combined, y_pred)

# Print the Davies-Bouldin score for the test set
print("Davies-Bouldin Score for the test data:", db_score_test)

# Create a DataFrame to examine the distribution between clusters and actual labels
data_with_predictions = pd.DataFrame({
    'Prediction': y_pred,  # Predicted clusters
    'Actual': np.hstack([np.zeros(len(X_test_majority)), np.ones(len(X_test_minority))])  # Original labels (0 or 1)
})

# Group by cluster prediction and actual label, then count occurrences
distribution = pd.crosstab(data_with_predictions['Prediction'], data_with_predictions['Actual'])

# Print the distribution matrix
print(distribution)

Epoch 0/1000 | Discriminator Loss Real: 1.8549137115478516, Fake: 0.6435272693634033 | GAN Loss: 0.8153675198554993
Epoch 100/1000 | Discriminator Loss Real: 0.09871930629014969, Fake: 0.1899869441986084 | GAN Loss: 2.347548484802246
Epoch 200/1000 | Discriminator Loss Real: 0.08501910418272018, Fake: 0.05190687254071236 | GAN Loss: 3.434967041015625
Epoch 300/1000 | Discriminator Loss Real: 0.05429769307374954, Fake: 0.037350237369537354 | GAN Loss: 3.7918872833251953
Epoch 400/1000 | Discriminator Loss Real: 0.045758139342069626, Fake: 0.04472103714942932 | GAN Loss: 4.052107810974121
Epoch 500/1000 | Discriminator Loss Real: 0.05166526883840561, Fake: 0.048178575932979584 | GAN Loss: 3.7708287239074707
Epoch 600/1000 | Discriminator Loss Real: 0.05447565019130707, Fake: 0.21418020129203796 | GAN Loss: 2.0289878845214844
Epoch 700/1000 | Discriminator Loss Real: 0.0959012359380722, Fake: 0.20360030233860016 | GAN Loss: 1.8239518404006958
Epoch 800/1000 | Discriminator Loss Real: 0.20

Traceback (most recent call last):
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\AppData\Local\Temp\ipykernel_20940\2620978676.py", line 107, in davies_bouldin_scorer
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\utils\_param_validation.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 407, in davies_bouldin_score
    check_number_of_labels(n_labels, n_samples)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 37, in check_number_of_labels
    raise ValueError(
ValueError: Number of labels is 1. Valid values

Davies-Bouldin Score for the test data: 1.0783099521971984
Actual         0.0    1.0
Prediction               
0           171566  60262
1             4058      9
2              109  15684
3            76275   2087
4              104  22991
5           194159    161
6               13      0
7              572   9351
8             7408    767
