In [5]:
import numpy as np
import pandas as pd

def pca(X, num_components):
    # Standardize the data (subtract mean and divide by standard deviation)
    X_standardized = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

    # Compute the covariance matrix
    cov_matrix = np.cov(X_standardized, rowvar=False)

    # Compute eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

    # Sort eigenvalues and corresponding eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    # Select the top 'num_components' eigenvectors
    principal_components = eigenvectors[:, :num_components]

    # Project the standardized data onto the principal components
    transformed_data = np.dot(X_standardized, principal_components)

    return transformed_data, eigenvalues, principal_components

# Load the diabetes dataset
diabetes_data = pd.read_csv("diabetes.csv")

# Extract features (X) and target variable (y)
X = diabetes_data.drop(columns=['Outcome'])
y = diabetes_data['Outcome']

# Apply PCA
num_components = 2
transformed_data, eigenvalues, principal_components = pca(X.values, num_components)

# Print the results
print("Original data shape:", X.shape)
print("Transformed data shape:", transformed_data.shape)
print("Eigenvalues:", eigenvalues)
print("Principal components:")
print(principal_components)


Original data shape: (768, 8)
Transformed data shape: (768, 2)
Eigenvalues: [2.09711056 1.73346726 1.03097228 0.87667054 0.76333832 0.68351839
 0.42036353 0.40498938]
Principal components:
[[-0.1284321  -0.59378583]
 [-0.39308257 -0.17402908]
 [-0.36000261 -0.18389207]
 [-0.43982428  0.33196534]
 [-0.43502617  0.25078106]
 [-0.45194134  0.1009598 ]
 [-0.27061144  0.122069  ]
 [-0.19802707 -0.62058853]]


In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load the diabetes dataset
diabetes_data = pd.read_csv("diabetes.csv")

# Extract features (X) and target variable (y)
X = diabetes_data.drop(columns=['Outcome'])
y = diabetes_data['Outcome']

# Apply PCA
pca = PCA(n_components=2)
transformed_data = pca.fit_transform(X)

# Output the transformed data
print("Original data shape:", X.shape)
print("Transformed data shape:", transformed_data.shape)
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Principal components:")
print(pca.components_)


Original data shape: (768, 8)
Transformed data shape: (768, 2)
Explained variance ratio: [0.88854663 0.06159078]
Principal components:
[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [-2.26488861e-02 -9.72210040e-01 -1.41909330e-01  5.78614699e-02
   9.46266913e-02 -4.69729766e-02 -8.16804621e-04 -1.40168181e-01]]
