<a href="https://colab.research.google.com/github/Nuel2627/DS_ML_Project_colab-integration/blob/main/PCA_K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
def run_pca_kmeans_analysis(*, optimal_k=3, variance_to_keep=0.95, max_k=10):
    """Upload a CSV in Colab, reduce it with PCA, cluster it with K-means, and plot.

    Pipeline: prompt for a file upload, read the first uploaded file as CSV
    (UTF-8, falling back to latin1), drop rows with missing values in the
    required columns, one-hot encode the feature columns, standardize them,
    project with PCA, cluster the projection with K-means, and show an elbow
    plot, a scree plot, a biplot and a cluster scatter plot.

    Args:
        optimal_k: Number of clusters used for the final K-means fit.
        variance_to_keep: Fraction of variance (strictly between 0 and 1)
            that the PCA used for clustering must retain.
        max_k: Largest K tried in the elbow plot; capped at the number of
            rows that remain after cleaning.

    Raises:
        ValueError: If an argument is out of range, no file is uploaded, no
            rows remain after dropping missing values, or ``optimal_k``
            exceeds the number of remaining rows.
        KeyError: If the uploaded CSV lacks any required column.
    """
    # Validate arguments first so a bad call fails before the heavy imports
    # and before the user is asked to upload anything.
    if optimal_k < 1:
        raise ValueError(f"optimal_k must be >= 1, got {optimal_k}")
    if max_k < 1:
        raise ValueError(f"max_k must be >= 1, got {max_k}")
    if not 0 < variance_to_keep < 1:
        raise ValueError(
            f"variance_to_keep must be strictly between 0 and 1, got {variance_to_keep}"
        )

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    from google.colab import files
    import io

    # Step 1: Upload file (only the first uploaded file is analysed)
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty/cancelled upload left `df` unbound.
        raise ValueError("No file was uploaded; nothing to analyse.")
    filename = next(iter(uploaded))
    raw_bytes = uploaded[filename]
    try:
        df = pd.read_csv(io.BytesIO(raw_bytes), encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(io.BytesIO(raw_bytes), encoding='latin1')
    print(f"Loaded: {filename}")
    # NOTE: `display` is not imported here; it is expected to be supplied by
    # the notebook (IPython/Colab) namespace.
    display(df.head())

    # Step 2: Clean and prepare data
    features = ['SEX', 'AGE_P', 'SLEEP', 'race_cat', 'Marital', 'smoke_cat', 'drinking_cat',
                'activity_cat', 'BMI_cat', 'AGE_cat', 'sleeptime_cat', 'cardiov_cat',
                'Met_Endo_cat', 'Respit_cat', 'Muscul_cat', 'Cancer_cat', 'morbidity_cat']
    required_columns = features + ['os_censor', 'Overall Survival (Months)']

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Uploaded file is missing required columns: {missing_columns}")

    df_clean = df.dropna(subset=required_columns).copy()
    n_samples = len(df_clean)
    if n_samples == 0:
        raise ValueError("No rows left after dropping rows with missing values.")
    if optimal_k > n_samples:
        raise ValueError(
            f"optimal_k={optimal_k} exceeds the number of usable rows ({n_samples})."
        )

    # Step 3: One-hot encode categorical variables
    X = pd.get_dummies(df_clean[features], drop_first=True)

    # Step 4: Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 5: PCA (keep the requested fraction of variance, 95% by default)
    pca = PCA(n_components=variance_to_keep)
    X_pca = pca.fit_transform(X_scaled)

    # Step 6: Determine optimal K using Elbow method
    # KMeans needs at least as many samples as clusters, so cap the range.
    K_range = range(1, min(max_k, n_samples) + 1)
    inertia = []
    for k in K_range:
        # n_init is set explicitly so results do not depend on the
        # scikit-learn version's default.
        km = KMeans(n_clusters=k, n_init=10, random_state=42)
        km.fit(X_pca)
        inertia.append(km.inertia_)

    plt.figure(figsize=(8, 5))
    plt.plot(K_range, inertia, marker='o', linestyle='-')
    plt.title('Elbow Method: Optimal K')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.xticks(K_range)
    plt.grid(True)
    plt.show()

    # Step 7: Apply KMeans with the chosen K
    kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X_pca)

    # The variance threshold may keep a single component; pad PC2 with zeros
    # so the 2-D plots below still render instead of raising IndexError.
    pc1 = X_pca[:, 0]
    if X_pca.shape[1] >= 2:
        pc2 = X_pca[:, 1]
    else:
        print("PCA retained a single component; PC2 is shown as zeros.")
        pc2 = np.zeros(n_samples)

    df_clean['cluster'] = clusters
    df_clean['PC1'] = pc1
    df_clean['PC2'] = pc2

    # Step 8: Scree Plot (uses a separate PCA fit with all components)
    pca_full = PCA().fit(X_scaled)
    component_numbers = range(1, len(pca_full.explained_variance_ratio_) + 1)
    plt.figure(figsize=(8, 5))
    plt.plot(component_numbers, pca_full.explained_variance_ratio_,
             marker='o', linestyle='--')
    plt.title('Scree Plot: Explained Variance by Principal Component')
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.xticks(component_numbers)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Step 9: PCA Biplot
    loadings = pca.components_.T[:, :2]
    if loadings.shape[1] < 2:
        # Same single-component case as above: give PC2 zero loadings.
        loadings = np.hstack([loadings, np.zeros((loadings.shape[0], 1))])
    feature_names = X.columns
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=pc1, y=pc2, hue=clusters, palette='Set2', alpha=0.6)
    for i, feature in enumerate(feature_names):
        # Arrows are scaled by 3 (labels by 3.2) purely for readability.
        plt.arrow(0, 0, loadings[i, 0]*3, loadings[i, 1]*3, color='black', alpha=0.5, head_width=0.05)
        plt.text(loadings[i, 0]*3.2, loadings[i, 1]*3.2, feature, fontsize=9)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA Biplot')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Step 10: Final Scatter Plot with Clusters
    plt.figure(figsize=(8, 5))
    plt.scatter(df_clean['PC1'], df_clean['PC2'], c=clusters, cmap='plasma', s=50)
    plt.title('PCA Projection with K-means Clustering')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(label='Cluster')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
def run_pca_kmeans_analysis():