In [None]:
import pandas as pd
from google.colab import files
# Colab-only step: open the file picker so the expression CSV can be chosen from the local machine.
# `uploaded` holds whatever files.upload() returns; the read below uses a fixed file name
# rather than the contents of `uploaded`.
uploaded = files.upload()
data = pd.read_csv(filepath_or_buffer="Normalized_GSE48350_expression_data.csv")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Load the normalized dataset (replace with your file name).
data = pd.read_csv("Normalized_GSE48350_expression_data.csv")

# Set the row identifiers aside so they are not fed to PCA as a feature.
# When the file has no 'ID' column, positional row numbers are used instead.
# NOTE(review): the per-cluster CSVs written further down label these values 'ID_REF',
# while they are looked up here under 'ID' — confirm which header the input file uses.
if 'ID' in data.columns:
    ids = data['ID']
else:
    ids = np.arange(len(data))

# Reassign instead of dropping in place, so `data` is always the freshly built feature table.
data = data.drop(columns=['ID'], errors='ignore')

# Step 1: PCA for dimensionality reduction.
# Two components keep the clustering cheap and let the clusters be drawn on a plane.
# random_state is fixed because PCA's automatic solver choice may fall back to a randomized
# SVD on large inputs; without a seed the projection (and every score derived from it)
# could vary between runs even though K-Means itself is seeded.
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(data)

# Step 2: K-Means clustering for K = 2, 3, 4, 5
# For each K this loop:
#   * fits K-Means on the 2-D PCA projection and records two quality scores,
#   * writes cluster_info_k{K}.csv with one representative row per cluster,
#   * saves and shows a scatter plot of the clusters (cluster_plot_k{K}.png).
results = {'K': [], 'Silhouette': [], 'Calinski-Harabasz': []}
clusters_info = {}  # kept for compatibility; nothing in this section fills it

# Plain array view of the identifiers so positional indexing works whether `ids`
# is a pandas Series or a NumPy array.
id_values = np.asarray(ids)

for k in range(2, 6):
    # n_init is pinned explicitly: the library default differs between scikit-learn
    # versions, which would otherwise make results depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(data_pca)

    # Evaluation metrics
    silhouette = silhouette_score(data_pca, labels)
    calinski = calinski_harabasz_score(data_pca, labels)

    results['K'].append(k)
    results['Silhouette'].append(silhouette)
    results['Calinski-Harabasz'].append(calinski)

    # Representative row per cluster = the sample nearest (Euclidean, in PCA space)
    # to that cluster's centre. Position i in `closest_rows` belongs to cluster i.
    cluster_centers = kmeans.cluster_centers_
    closest_rows = np.array([
        np.argmin(np.linalg.norm(data_pca - center, axis=1))
        for center in cluster_centers
    ])
    cluster_ids = list(id_values[closest_rows])

    # Build the CSV table. The identifier and the expression values are taken from the
    # SAME row (the representative), so every output line describes a single sample.
    cluster_data = pd.DataFrame({'ID_REF': cluster_ids, 'Cluster': np.arange(k)})
    # reset_index so the rows line up positionally with cluster_data in the concat below.
    cluster_gsm_values = data.iloc[closest_rows].reset_index(drop=True)
    cluster_csv = pd.concat([cluster_data, cluster_gsm_values], axis=1)

    # Save the CSV file
    cluster_csv.to_csv(f'cluster_info_k{k}.csv', index=False)

    # Plot clusters without IDs (clean plot)
    fig, ax = plt.subplots(figsize=(8, 6))
    for cluster in np.unique(labels):
        cluster_points = data_pca[labels == cluster]
        ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}')

    ax.set_title(f'Cluster Plot for K={k}')
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    ax.legend()
    fig.savefig(f'cluster_plot_k{k}.png')
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations.
    plt.close(fig)

# Step 3: Display evaluation metrics
print("\nEvaluation Metrics:")
results_df = pd.DataFrame(results)
print(results_df.round(4))  # Round the results to 4 decimal places

# Plot evaluation metrics.
# The two scores live on very different scales (silhouette is bounded by [-1, 1],
# Calinski-Harabasz is unbounded), so each one gets its own y-axis; on a shared axis
# the silhouette curve would be squashed into a flat line.
fig, ax_silhouette = plt.subplots(figsize=(12, 6))
ax_calinski = ax_silhouette.twinx()

silhouette_line, = ax_silhouette.plot(
    results['K'], results['Silhouette'],
    marker='o', color='tab:blue', label='Silhouette Score',
)
calinski_line, = ax_calinski.plot(
    results['K'], results['Calinski-Harabasz'],
    marker='^', color='tab:orange', label='Calinski-Harabasz Score',
)

ax_silhouette.set_xlabel('Number of Clusters (K)')
ax_silhouette.set_xticks(results['K'])  # K is an integer; avoid fractional ticks
ax_silhouette.set_ylabel('Silhouette Score')
ax_calinski.set_ylabel('Calinski-Harabasz Score')
ax_silhouette.set_title('Evaluation Metrics for Different K')
# One combined legend covering the lines from both axes.
ax_silhouette.legend(handles=[silhouette_line, calinski_line])
plt.show()
