In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then echo them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np

# Toy dataset: six observations of the three variables x, y and z.
data = dict(
    x=[12, 17, 12, 6, 17, 4],
    y=[24, 15.5, 13, 13.5, 21, 20.3],
    z=[6, -2, 3, -2.5, 7.2, -0.9],
)

df = pd.DataFrame(data)

# Step 1: per-column mean and standard deviation.
# pandas' std() uses its default settings here (sample standard deviation).
means, std_devs = df.mean(), df.std()

print("Means:", means)
print("\nStandard Deviations:", std_devs)

Means: x    11.333333
y    17.883333
z     1.800000
dtype: float64

Standard Deviations: x    5.428321
y    4.510174
z    4.206186
dtype: float64


In [3]:
# Step 2: z-score standardization (matrix S).
# Centre every column on its mean, then scale it by its standard deviation.
centered_data = df.sub(means)
standardized_data = centered_data.div(std_devs)
print("Standardized Data (Matrix S):", standardized_data)

Standardized Data (Matrix S):           x         y         z
0  0.122813  1.356193  0.998529
1  1.043908 -0.528435 -0.903431
2  0.122813 -1.082737  0.285294
3 -0.982502 -0.971877 -1.022304
4  1.043908  0.691030  1.283823
5 -1.350940  0.535826 -0.641912


In [4]:
# Step 3: covariance matrix C of the standardized columns.
# The columns were scaled by their own standard deviation in the previous
# step, which is why the diagonal of C comes out as 1.
covariance_matrix = pd.DataFrame.cov(standardized_data)

print("\nCovariance Matrix C:", covariance_matrix)


Covariance Matrix C:           x         y         z
x  1.000000  0.086864  0.485273
y  0.086864  1.000000  0.611893
z  0.485273  0.611893  1.000000


In [5]:
# Step 4: eigendecomposition of the covariance matrix C.
# The eigenvectors are the COLUMNS of `eigenvectors`; column k belongs to
# eigenvalues[k].
# NOTE(review): C is symmetric, so np.linalg.eigh is the usual routine for it
# (real results, ascending order). It is not swapped in here because the
# ordering and eigenvector signs of the recorded outputs could change.
eigen_decomposition = np.linalg.eig(covariance_matrix)
eigenvalues, eigenvectors = eigen_decomposition
print("Eigenvalues:", eigenvalues)
print("Eigenvectors:", eigenvectors)


Eigenvalues: [1.82462432 0.91547483 0.25990085]
Eigenvectors: [[-0.46348371 -0.78498896 -0.41106591]
 [-0.55899398  0.61896667 -0.55173   ]
 [-0.68753806  0.0259345   0.72568507]]


In [6]:
# Step 5: order the eigenpairs from the largest eigenvalue to the smallest.
# argsort is ascending, so the index array is flipped to get descending order.
sorted_indices = np.flip(np.argsort(eigenvalues))
sorted_eigenvalues = np.take(eigenvalues, sorted_indices)
# Eigenvectors are stored column-wise, so the columns (axis 1) are permuted.
sorted_eigenvectors = np.take(eigenvectors, sorted_indices, axis=1)

print("Sorted Eigenvalues:", sorted_eigenvalues)
print("Sorted Eigenvectors:", sorted_eigenvectors)


Sorted Eigenvalues: [1.82462432 0.91547483 0.25990085]
Sorted Eigenvectors: [[-0.46348371 -0.78498896 -0.41106591]
 [-0.55899398  0.61896667 -0.55173   ]
 [-0.68753806  0.0259345   0.72568507]]


In [7]:
# 6: Decide how many principal components to keep
# Each eigenvalue's share of the eigenvalue total is the fraction of variance
# explained by the matching component; the running sum gives the share
# explained by the first k components.
total_variance = np.sum(sorted_eigenvalues)
explained_variance_ratio = sorted_eigenvalues / total_variance
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Find the number of components to keep (e.g., 90% of variance):
# the smallest k whose cumulative explained variance reaches the threshold.
threshold_variance = 0.9
reaches_threshold = cumulative_explained_variance >= threshold_variance
if reaches_threshold.any():
    # argmax on a boolean array returns the position of the first True.
    num_components_to_keep = int(np.argmax(reaches_threshold)) + 1
else:
    # argmax of an all-False mask is 0, which would silently keep a single
    # component. This can happen when the threshold is 1.0 and round-off
    # leaves the final cumulative value just below it, or when the threshold
    # exceeds 1. In that case keep every component.
    num_components_to_keep = len(sorted_eigenvalues)

print("Explained Variance Ratio:", explained_variance_ratio)
print("Cumulative Explained Variance:", cumulative_explained_variance)
print("Number of Principal Components to Keep:", num_components_to_keep)

Explained Variance Ratio: [0.60820811 0.30515828 0.08663362]
Cumulative Explained Variance: [0.60820811 0.91336638 1.        ]
Number of Principal Components to Keep: 2


In [8]:
# Step 7: build the projection matrix V from the leading eigenpairs.
n_keep = num_components_to_keep
top_eigenvalues = sorted_eigenvalues[:n_keep]

# V holds the first n_keep eigenvector columns; both names refer to the same
# array object.
projection_matrix_V = top_eigenvectors = sorted_eigenvectors[:, :n_keep]

print("Projection Matrix V:", projection_matrix_V)

Projection Matrix V: [[-0.46348371 -0.78498896]
 [-0.55899398  0.61896667]
 [-0.68753806  0.0259345 ]]


In [9]:
# 8: Calculate the reduced data matrix R
# Project the standardized observations (matrix S from step 2) onto the
# retained eigenvectors: R = S . V. The standardized matrix computed in step 2
# is reused as-is rather than re-derived with a duplicate formula, so the
# projection always matches the matrix the covariance was computed from.
# Each row of R is one observation; each column is one retained component.
reduced_data_matrix_R = standardized_data.dot(projection_matrix_V)

print("\nReduced Data Matrix R:", reduced_data_matrix_R)


Reduced Data Matrix R:           0         1
0 -1.501552  0.768928
1  0.432701 -1.169970
2  0.352171 -0.759186
3  1.701520  0.143181
4 -1.752794 -0.358436
5  0.767954  1.375483
