<a href="https://colab.research.google.com/github/Nikhilesh-075/6thSem-ML-Lab/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Upload datasets
# files.upload() comes from google.colab, so this cell only runs inside Colab.
# The return value is kept in `uploaded` but is not read again in the code
# visible here; the CSVs are loaded from disk by name below.
uploaded = files.upload()

# Load datasets (make sure filenames match the uploaded ones)
# Both paths are bare relative file names, so each file must exist in the
# current working directory under exactly this name or read_csv will raise.
diabetes_df = pd.read_csv("diabetes_data_upload.csv")
adult_df = pd.read_csv("adult.csv")

# ----------- Helper Functions -----------

def preprocess_dataset(df):
    """Return a cleaned, fully numeric-encoded copy of *df*.

    Rows containing any missing value are removed, then every categorical
    (object/category) column is one-hot encoded. The first level of each
    encoded column is dropped so the resulting indicators are not
    redundant. The input frame is left untouched, and the surviving rows
    keep their original index labels.
    """
    complete_rows = df.dropna()
    return pd.get_dummies(complete_rows, drop_first=True)

def apply_pca(df, label_column=None, dataset_name="Dataset"):
    """Plot *df* projected onto its first two principal components.

    The features are standardized, reduced to two components with PCA,
    shown as a scatter plot, and the explained variance ratio of the two
    components is printed.

    Args:
        df: DataFrame whose columns (other than the label) are all numeric.
        label_column: Optional name of the column used to color the points.
            The column is removed from the features before PCA. If the exact
            name is absent but exactly one column named
            ``"<label_column>_<something>"`` exists (the shape a two-level
            label takes after ``pd.get_dummies(..., drop_first=True)``), that
            column is used instead. If no label can be resolved, a warning is
            issued and the points are plotted without colors.
        dataset_name: Human-readable name used in the plot title and output.

    Returns:
        None. The function only plots and prints.
    """
    import warnings

    # Resolve which column (if any) holds the label. The raw name may have
    # been replaced by a one-hot indicator column during preprocessing.
    resolved_label = None
    if label_column:
        if label_column in df.columns:
            resolved_label = label_column
        else:
            encoded_candidates = [
                col for col in df.columns
                if str(col).startswith(f"{label_column}_")
            ]
            if len(encoded_candidates) == 1:
                resolved_label = encoded_candidates[0]
            else:
                # Keep the original best-effort behavior (unlabeled plot),
                # but no longer do it silently.
                warnings.warn(
                    f"Label column {label_column!r} not found in "
                    f"{dataset_name}; plotting without labels.",
                    stacklevel=2,
                )

    # If label exists, separate it so it does not leak into the features
    if resolved_label is not None:
        y = df[resolved_label]
        X = df.drop(resolved_label, axis=1)
    else:
        X = df
        y = None

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)

    # Create DataFrame for plotting
    pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
    if y is not None:
        # Assign positionally: pca_df has a fresh 0..n-1 index while y keeps
        # the index of df (which has gaps once rows were dropped), so an
        # index-aligned assignment would mislabel rows or produce NaN.
        pca_df["Label"] = y.to_numpy()

    # Plot (set the seaborn style first so it applies to the new figure)
    sns.set(style="whitegrid")
    plt.figure(figsize=(8, 6))
    if y is not None:
        sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Label", palette="Set1")
    else:
        plt.scatter(pca_df["PC1"], pca_df["PC2"])
    plt.title(f"PCA of {dataset_name}")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.show()

    # Print explained variance
    print(f"Explained Variance Ratio for {dataset_name}:")
    print(pca.explained_variance_ratio_)

# ----------- Apply PCA on Diabetes Dataset -----------
# NOTE(review): preprocess_dataset runs pd.get_dummies over the whole frame,
# label included. If "class" holds strings it is emitted under a derived name
# rather than "class" -- verify that apply_pca still receives a usable label.
print("\n--- PCA on Diabetes Dataset ---")
diabetes_df_processed = preprocess_dataset(diabetes_df)
apply_pca(diabetes_df_processed, label_column="class", dataset_name="Diabetes Dataset")

# ----------- Apply PCA on Adult Income Dataset -----------
# NOTE(review): the same label-encoding caveat applies to "income" -- confirm
# the column name that survives preprocessing.
print("\n--- PCA on Adult Income Dataset ---")
adult_df_processed = preprocess_dataset(adult_df)
apply_pca(adult_df_processed, label_column="income", dataset_name="Adult Income Dataset")
