In [None]:
# K-means clustering + PCA visualization
# Replace 'NAME.csv' below with the path to your dataset

import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

# Read the spectra table from disk.
df = pd.read_csv('NAME.csv')

# Column 0 holds the class of each sample; every remaining column is part of the
# FTIR spectrum (one column per wavenumber) and together they form the features.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Turn the original class labels into integer codes. The fitted encoder is kept
# so the codes can be translated back to names through label_encoder.classes_.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Partition the spectra into the three groups expected from the dataset description.
# The seed is fixed for repeatable results, and n_init is passed explicitly so that
# scikit-learn does not warn about its changing default.
kmeans = KMeans(n_init=10, random_state=42, n_clusters=3).fit(X)

# Cluster index assigned by K-means to each sample.
cluster_labels = kmeans.labels_

# Compare the clustering with the known classes: rows are the encoded original
# classes, columns are the K-means cluster indices.
class_vs_cluster = confusion_matrix(y_encoded, cluster_labels)
print("Confusion Matrix (Original Class vs K-means Cluster):")
print(class_vs_cluster)

# Visualize the clustering. Spectra are high-dimensional, so they are first projected
# onto their two leading principal components; PCA is used here for plotting only.
# random_state is fixed because PCA's default svd_solver='auto' may select the
# randomized solver for large inputs, which would otherwise let the projection
# (and therefore the figure) change from one run to the next.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Scatter of the 2-D projection, coloured by the K-means cluster of each sample.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.7)
ax.set_title('K-means Clustering of FTIR Spectra (PCA Reduced)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(scatter, ax=ax, label='K-means Cluster')

# Optional: draw the same projection coloured by the original (encoded) classes
# to compare it visually with the clustering above.
# fig_orig, ax_orig = plt.subplots(figsize=(10, 6))
# scatter_original = ax_orig.scatter(X_pca[:, 0], X_pca[:, 1], c=y_encoded, cmap='viridis', s=50, alpha=0.7)
# ax_orig.set_title('Original Classes of FTIR Spectra (PCA Reduced)')
# ax_orig.set_xlabel('PCA Component 1')
# ax_orig.set_ylabel('PCA Component 2')
# fig_orig.colorbar(scatter_original, ax=ax_orig, label='Original Class')

plt.show()

In [None]:
# Classification Report

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np

# Classification report for the K-means result, scored against the original classes.
#
# K-means is unsupervised and numbers its clusters arbitrarily, so cluster 0 is not
# necessarily class 0. Comparing y_encoded with the raw cluster IDs would make every
# metric depend on that arbitrary numbering, and classification_report raises a
# ValueError when the number of distinct labels differs from len(target_names) and
# `labels` is not given. Each cluster is therefore first relabelled with the most
# frequent original class among its members (majority vote).
y_true = np.asarray(y_encoded)
cluster_ids = np.asarray(cluster_labels)
n_classes = len(label_encoder.classes_)

# cluster ID -> encoded class. Ties are resolved towards the lowest class code
# (argmax returns the first maximum). Two clusters may map to the same class,
# in which case some class receives no predictions at all.
cluster_to_class = {
    int(cluster_id): int(np.bincount(y_true[cluster_ids == cluster_id], minlength=n_classes).argmax())
    for cluster_id in np.unique(cluster_ids)
}
mapped_labels = np.array([cluster_to_class[int(cluster_id)] for cluster_id in cluster_ids])

print("Cluster -> class mapping (majority vote):")
for cluster_id, class_id in cluster_to_class.items():
    print(f"  cluster {cluster_id} -> {label_encoder.classes_[class_id]}")

# `labels` is passed explicitly so the report always has exactly one row per original
# class, matching target_names. zero_division=0 reports 0 (without a warning) for a
# class that no cluster was mapped to.
report = classification_report(
    y_true,
    mapped_labels,
    labels=np.arange(n_classes),
    target_names=label_encoder.classes_,
    output_dict=True,
    zero_division=0,
)

# One row per class plus the 'accuracy', 'macro avg' and 'weighted avg' summary rows.
report_df = pd.DataFrame(report).transpose()

# Overall accuracy: the fraction of samples whose cluster's majority class equals their
# own class. For a clustering this is a measure of agreement with the known classes,
# not a supervised prediction accuracy.
accuracy = report['accuracy']

# Render the report as a table with matplotlib; the figure height grows with the row count.
fig, ax = plt.subplots(figsize=(10, report_df.shape[0] * 0.8))
ax.axis('tight')
ax.axis('off')

table = ax.table(cellText=report_df.values.round(2),  # two decimals for readability
                 colLabels=report_df.columns,
                 rowLabels=report_df.index,
                 cellLoc='center',
                 loc='center')

# Fixed font size and a slightly enlarged cell scale so the text fits the cells.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.2)

ax.set_title('Classification Report (Original Class vs K-means Cluster)', y=0.95, fontsize=14)
plt.show()

# Print the accuracy
print(f"\nAcurácia (Overall Agreement): {accuracy:.2f}")

# Note on interpreting this report:
# - The precision / recall / f1-score of a class describe how well the clusters mapped to
#   that class recover its samples; they indicate how much of the class structure the
#   clustering captures and are not supervised performance metrics.
# - The mapping is chosen using the true labels, so the scores are optimistic by
#   construction.
# - Inspect the printed cluster -> class mapping: if a class is missing from it, its row
#   in the report is all zeros because no cluster was assigned to that class.
