In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, BatchNormalization, LeakyReLU
from tensorflow.keras import Model
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Load and preprocess the data
df = pd.read_csv('FridayAfternoon.csv')
df2 = df.drop(' Label', axis=1)
df2 = df2.dropna()

# Replace infinite values with NaN and drop rows with NaNs
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2_cleaned = df2.dropna()

# Select only numeric columns, keeping 'Label Num' for later comparison
numeric_df = df2_cleaned.select_dtypes(include=[np.number])

# Extract the 'Label Num' for the ground truth comparison later
labels = numeric_df['Label Num']

# Drop 'Label Num' column from the features for clustering
numeric_df = numeric_df.drop('Label Num', axis=1)

# Convert to NumPy array and scale the data
X = numeric_df.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train_labels, y_test_labels = train_test_split(X_scaled, labels, test_size=0.2, random_state=42)

# Define Transformer Encoder layer
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Hyperparameters
embed_dim = 64  # Embedding dimension for each feature
num_heads = 4   # Number of attention heads
ff_dim = 128    # Feed forward network dimension

# Define the Transformer Encoder model
inputs = tf.keras.Input(shape=(X_scaled.shape[1], 1))  # Input shape (batch_size, sequence_length, features)
transformer_block = TransformerEncoder(embed_dim, num_heads, ff_dim)
x = transformer_block(inputs)
x = BatchNormalization()(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)  # Global pooling to get fixed-length vectors
encoded_output = Dense(32, activation=LeakyReLU(alpha=0.01))(x)

# Define Decoder for reconstruction
decoder = tf.keras.Sequential([
    Dense(32, activation=LeakyReLU(alpha=0.01)),
    Dense(X_scaled.shape[1], activation='linear')  # Linear for output layer
])

# Apply the decoder to the encoded output
decoded_output = decoder(encoded_output)

# Create the full autoencoder model
autoencoder_model = Model(inputs=inputs, outputs=decoded_output)
autoencoder_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')

# Reshape data for the model (batch_size, sequence_length, 1)
X_train_reshaped = np.expand_dims(X_train, axis=-1)
X_test_reshaped = np.expand_dims(X_test, axis=-1)

# Train the autoencoder model
autoencoder_model.fit(X_train_reshaped, X_train_reshaped, epochs=10, batch_size=32, validation_split=0.1)

# Extract encoded features using the encoder part
encoder_model = Model(inputs=inputs, outputs=encoded_output)
encoded_features_train = encoder_model.predict(X_train_reshaped)
encoded_features_test = encoder_model.predict(X_test_reshaped)

# Setup the grid search parameters for KMeans
param_grid = {
    'kmeans__n_clusters': [5, 7, 9, 11],
    'kmeans__n_init': [10, 20],
    'kmeans__algorithm': ['auto', 'full', 'elkan']
}

# Define the pipeline with KMeans
pipeline = Pipeline([
    ('kmeans', KMeans())
])

# Davies-Bouldin score as the scoring metric
def davies_bouldin_scorer(estimator, X):
    labels = estimator.predict(X)
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score

# Perform grid search with verbosity to track progress
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=davies_bouldin_scorer,
    cv=5,
)

# Perform grid search on encoded features (which have 32 dimensions)
grid_search.fit(encoded_features_train)

# Best parameters and Davies-Bouldin score
print("Best Parameters:", grid_search.best_params_)
print("Best Score (negative Davies-Bouldin):", -grid_search.best_score_)

# Encode the test set using the encoder model
encoded_features_test = encoder_model.predict(X_test_reshaped)

# Make predictions on the encoded test set
y_pred = grid_search.best_estimator_.predict(encoded_features_test)

# Calculate the Davies-Bouldin score for the test set
db_score_test = davies_bouldin_score(encoded_features_test, y_pred)

# Print the Davies-Bouldin score for the test set
print("Davies-Bouldin Score for the test data (encoded features):", db_score_test)

# Create a DataFrame to examine the distribution between clusters and actual labels
data_with_predictions = pd.DataFrame({
    'Prediction': y_pred,  # Predicted clusters
    'Actual': y_test_labels.reset_index(drop=True)  # Original labels (0 or 1)
})

# Group by cluster prediction and actual label, then count occurrences
distribution = pd.crosstab(data_with_predictions['Prediction'], data_with_predictions['Actual'])

# Print the distribution matrix
print(distribution)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Best Parameters: {'kmeans__algorithm': 'elkan', 'kmeans__n_clusters': 5, 'kmeans__n_init': 10}
Best Score (negative Davies-Bouldin): 1.1122389759364566
Davies-Bouldin Score for the test data (encoded features): 1.335337388040802
Actual         0     1
Prediction            
0           6191  6899
1           3542  7330
2           1470  5238
3           8214  6257
4              2     0
