In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.pipeline import Pipeline

# Load and preprocess the data
df = pd.read_csv('FridayAfternoon.csv')
df2 = df.drop(' Label', axis=1)
df2 = df2.dropna()

# Replace infinite values with NaN and drop rows with NaNs
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2_cleaned = df2.dropna()

# Select only numeric columns, keeping 'Label Num' for later comparison
numeric_df = df2_cleaned.select_dtypes(include=[np.number])

# Extract the 'Label Num' for the ground truth comparison later
labels = numeric_df['Label Num']

# Drop 'Label Num' column from the features for clustering
numeric_df = numeric_df.drop('Label Num', axis=1)

# Convert to NumPy array and scale the data
X = numeric_df.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (optional for clustering)
X_train, X_test, y_train_labels, y_test_labels = train_test_split(X_scaled, labels, test_size=0.2, random_state=42)

# Setup the grid search parameters for KMeans
param_grid = {
    'kmeans__n_clusters': [5, 7, 9, 11],
    'kmeans__n_init': [10, 20],
    'kmeans__algorithm': ['auto', 'full', 'elkan']
}

# Define the pipeline with KMeans
pipeline = Pipeline([
    ('kmeans', KMeans())
])

# Davies-Bouldin score as the scoring metric
def davies_bouldin_scorer(estimator, X):
    labels = estimator.predict(X)
    return -davies_bouldin_score(X, labels)  # Minimize Davies-Bouldin score

# Perform grid search with verbosity to track progress
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=davies_bouldin_scorer,
    cv=5,
)

# Perform grid search
grid_search.fit(X_train)

# Best parameters and Davies-Bouldin score
print("Best Parameters:", grid_search.best_params_)
print("Best Score (negative Davies-Bouldin):", -grid_search.best_score_)

# Make predictions on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate the Davies-Bouldin score for the test set
db_score_test = davies_bouldin_score(X_test, y_pred)

# Print the Davies-Bouldin score for the test set
print("Davies-Bouldin Score for the test data:", db_score_test)

# Create a DataFrame to examine the distribution between clusters and actual labels
data_with_predictions = pd.DataFrame({
    'Prediction': y_pred,  # Predicted clusters
    'Actual': y_test_labels.reset_index(drop=True)  # Original labels (0 or 1)
})

# Group by cluster prediction and actual label, then count occurrences
distribution = pd.crosstab(data_with_predictions['Prediction'], data_with_predictions['Actual'])

# Print the distribution matrix
print(distribution)



Best Parameters: {'kmeans__algorithm': 'auto', 'kmeans__n_clusters': 9, 'kmeans__n_init': 20}
Best Score (negative Davies-Bouldin): 0.7695632343699044
Actual         0      1
Prediction             
0           9312   9403
1            313  11994
2           1705      0
3            307   4318
4            101      0
5           1246      0
6             33      0
7              1      0
8           6401      9
