In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load and preprocess the data
df = pd.read_csv('FridayAfternoon.csv')
df2 = df.drop(' Label', axis=1)
df2 = df2.dropna()

# Replace infinite values with NaN and drop rows with NaNs
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2_cleaned = df2.dropna()

# Select only numeric columns, keeping 'Label Num' for later comparison
numeric_df = df2_cleaned.select_dtypes(include=[np.number])

# Extract the 'Label Num' for the ground truth comparison later
labels = numeric_df['Label Num']

# Drop 'Label Num' column from the features for clustering
numeric_df = numeric_df.drop('Label Num', axis=1)

# Convert to NumPy array and scale the data
X = numeric_df.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (optional for clustering)
X_train, X_test, y_train_labels, y_test_labels = train_test_split(X_scaled, labels, test_size=0.2, random_state=42)

# Define GAN architecture
latent_dim = 100

# Generator
generator = Sequential()
generator.add(Dense(128, input_dim=latent_dim, activation='relu'))
generator.add(Dense(X_train.shape[1], activation='tanh'))

# Discriminator
discriminator = Sequential()
discriminator.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
discriminator.add(Dense(1, activation='sigmoid'))
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# GAN combining generator and discriminator
discriminator.trainable = False
gan = Sequential()
gan.add(generator)
gan.add(discriminator)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Training the GAN
epochs = 1000
batch_size = 32

for epoch in range(epochs):
    # Generate noise for synthetic data
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    synthetic_samples = generator.predict(noise, verbose=0)
    
    # Sample a random batch of real data
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    real_samples = X_train[idx]

    # Train the discriminator on real and synthetic data
    discriminator_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)))
    discriminator_loss_fake = discriminator.train_on_batch(synthetic_samples, np.zeros((batch_size, 1)))

    # Train the generator (via the GAN model)
    gan_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

    # Optional: Print progress every 1000 epochs
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}/{epochs} | Discriminator Loss Real: {discriminator_loss_real[0]}, Fake: {discriminator_loss_fake[0]} | GAN Loss: {gan_loss}")

# Use the trained generator to create synthetic data
num_synthetic_samples = X_train.shape[0]  # You can choose how many synthetic samples to generate
noise = np.random.normal(0, 1, size=(num_synthetic_samples, latent_dim))
synthetic_samples = generator.predict(noise, verbose=0)  # Suppress generation messages

# Combine real and synthetic data for clustering
X_train_combined = np.vstack([X_train, synthetic_samples])
y_train_combined = np.hstack([y_train_labels, np.zeros(num_synthetic_samples)])  # Use labels accordingly

# Define the pipeline with KMeans (redefine this after GAN)
pipeline = Pipeline([
    ('kmeans', KMeans())
])

# Setup the grid search parameters for KMeans
param_grid = {
    'kmeans__n_clusters': [5, 7, 9, 11],
    'kmeans__n_init': [10, 20],
    'kmeans__algorithm': ['lloyd', 'full', 'elkan']
}

# Davies-Bouldin score as the scoring metric
def davies_bouldin_scorer(estimator, X):
    labels = estimator.predict(X)
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score

# Initialize the grid search for KMeans clustering
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=davies_bouldin_scorer,
    cv=5,
)

# Perform grid search on the combined dataset
grid_search.fit(X_train_combined)

# Make predictions on the test set (unmodified test set)
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the Davies-Bouldin score for the test set
db_score_test = davies_bouldin_score(X_test, y_pred)

# Print the Davies-Bouldin score for the test set
print("Davies-Bouldin Score for the test data:", db_score_test)

# Create a DataFrame to examine the distribution between clusters and actual labels
data_with_predictions = pd.DataFrame({
    'Prediction': y_pred,  # Predicted clusters
    'Actual': y_test_labels.reset_index(drop=True)  # Original labels (0 or 1)
})

# Group by cluster prediction and actual label, then count occurrences
distribution = pd.crosstab(data_with_predictions['Prediction'], data_with_predictions['Actual'])

# Print the distribution matrix
print(distribution)

Epoch 0/1000 | Discriminator Loss Real: 1.0711919069290161, Fake: 0.626408576965332 | GAN Loss: 0.8312425017356873


Traceback (most recent call last):
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\AppData\Local\Temp\ipykernel_8032\2266350093.py", line 106, in davies_bouldin_scorer
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\utils\_param_validation.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 407, in davies_bouldin_score
    check_number_of_labels(n_labels, n_samples)
  File "C:\Users\zhong.DESKTOP-V7DRVU6\anaconda3\envs\tf\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 37, in check_number_of_labels
    raise ValueError(
ValueError: Number of labels is 1. Valid values 

Davies-Bouldin Score for the test data: 0.8943141504402686
Actual         0      1
Prediction             
0            309   4318
1           6411      9
2              5      0
3            328  11970
4           1702      0
5           4844      1
6              1      0
7            101      0
8             32      0
9           4435   9426
10          1251      0
