In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the input table from CSV.
file_path = 'cluster_timeseries.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# Exclude the 'StationID' and '60' columns; every remaining column is used
# as a clustering feature.
data_features = data.drop(columns=['StationID', '60'])

# Min-Max normalization: rescale each feature column to the [0, 1] range so
# that no single column dominates the Euclidean distances used by K-means.
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data_features)

# Perform K-means clustering
number_of_clusters = 6  # You can change the number of clusters as needed
# n_init is set explicitly: the scikit-learn default changed from 10 to
# 'auto' (a single run for k-means++ initialisation) in version 1.4, so
# relying on the default yields different labels on different versions and
# emits a FutureWarning on 1.2/1.3.
kmeans = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=123)
data['Cluster'] = kmeans.fit_predict(data_normalized)

# inertia_ is the sum of squared distances from each sample to its closest
# centroid; dividing by the sample count gives the mean squared distance
# per sample, reported here as the clustering "MSE".
mse = kmeans.inertia_ / len(data_normalized)  # MSE calculation
print(f'Mean Squared Error (MSE) for Clustering: {mse}')

# Fit an independent PCA on the normalized rows belonging to each K-means
# cluster, keeping the principal axes and their eigenvalues keyed by the
# (0-based) cluster label.
components_dict = {}
eigenvalues_dict = {}

cluster_labels = data['Cluster'].to_numpy()
for cluster in range(number_of_clusters):
    # Boolean mask selects only the rows assigned to this cluster.
    cluster_data = data_normalized[cluster_labels == cluster]

    # PCA with default settings; fit() returns the fitted estimator.
    pca = PCA().fit(cluster_data)

    components_dict[cluster], eigenvalues_dict[cluster] = (
        pca.components_,
        pca.explained_variance_,
    )

# Report the PCA components and eigenvalues of every cluster, and draw a
# heat map of the component loadings for each one.
for cluster in range(number_of_clusters):
    print(f"Cluster {cluster + 1}:")
    print("Components:")
    print(components_dict[cluster])
    print("Eigenvalues:")
    print(eigenvalues_dict[cluster])
    print("\n")

    # One figure per cluster: calling plt.imshow repeatedly without creating
    # and showing a figure stacks every image on the same implicit axes, so
    # only the last cluster would be visible (and it would bleed into the
    # next figure that gets shown).
    fig, ax = plt.subplots(figsize=(6, 4))
    # NOTE(review): scikit-learn documents components_ as
    # (n_components, n_features), so [:, 0:2] selects the loadings on the
    # first two *feature columns* of every component. If the first two
    # principal components were intended, use [0:2, :] instead - confirm.
    image = ax.imshow(components_dict[cluster][:, 0:2], aspect='auto')
    fig.colorbar(image, ax=ax)
    ax.set_title(f'PCA Components for Cluster {cluster + 1}')
    ax.set_xlabel('Feature index')
    ax.set_ylabel('Principal component')
    plt.show()
    plt.close(fig)  # release the figure when generating many plots in a loop

# Save updated DataFrame with cluster labels
data.to_csv('cluster_results.csv', index=False)

# Scree plot per cluster: eigenvalue of each principal component against its
# 1-based component number.
for cluster in range(number_of_clusters):
    eigenvalues = eigenvalues_dict[cluster]
    component_numbers = range(1, len(eigenvalues) + 1)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(component_numbers, eigenvalues, marker='o')
    ax.set_title(f'Explained Variance for Cluster {cluster + 1}')
    ax.set_xlabel('Principal Component')
    ax.set_ylabel('Eigenvalue')
    ax.grid()
    plt.show()


In [None]:
from sklearn import manifold
from scipy.spatial import ConvexHull
# Project the normalized features onto their first two principal components
# so the K-means clusters can be inspected in a 2-D scatter plot.
# (A t-SNE embedding via sklearn.manifold.TSNE could be substituted here as
# an alternative 2-D projection.)
pca_vis = PCA(n_components=2)
valid_pca = pca_vis.fit_transform(data_normalized)

# Reuse the labels of the already-fitted model rather than calling
# fit_predict again: with the fixed random_state the refit produced the same
# labels, so it only repeated the clustering work.
results = kmeans.labels_

fig, ax = plt.subplots()
scatter = ax.scatter(valid_pca[:, 0], valid_pca[:, 1], c=results)

# Build one legend entry per cluster, coloured exactly as the scatter's
# colormap/normalisation colours that label. Labels are the 0-based values
# stored in the 'Cluster' column.
legend_labels = [f'Cluster {i}' for i in range(kmeans.n_clusters)]
handles = [plt.Line2D([0], [0], marker='o', color='w', label=legend_labels[i],
                      markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=10)
           for i in range(kmeans.n_clusters)]

# Axis labels carry the fraction of total variance explained by each axis.
ax.set_xlabel(f'PC1 {round(pca_vis.explained_variance_ratio_[0], 3)}')
ax.set_ylabel(f'PC2 {round(pca_vis.explained_variance_ratio_[1], 3)}')
ax.legend(handles=handles, title="Clusters")
ax.grid()

# Save BEFORE showing: once plt.show() returns the figure may already have
# been closed, in which case a subsequent savefig writes an empty image.
fig.savefig("cluster.png")
plt.show()