In [78]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

In [79]:
def standardize(X):
    """Rescale every column of X to zero mean and unit standard deviation.

    The mean and standard deviation are taken along axis 0 with NumPy's
    defaults, i.e. the standard deviation is the population one (ddof=0).

    Parameters
    ----------
    X : array-like with ``.size`` (e.g. ndarray or DataFrame)
        Samples in rows, features in columns.

    Returns
    -------
    Same container kind as ``X - mean`` produces
        The column-wise z-scores of X.

    Raises
    ------
    ValueError
        If X has no elements, or if any column has zero standard deviation.
    """
    if X.size == 0:
        raise ValueError("Input array X is empty.")

    column_means = np.mean(X, axis=0)
    column_spreads = np.std(X, axis=0)

    # A constant column would cause a division by zero below.
    has_constant_column = np.any(column_spreads == 0)
    if has_constant_column:
        raise ValueError("Standard deviation is zero for one or more features, cannot standardize.")

    centered = X - column_means
    return centered / column_spreads

In [80]:
def covariance_matrix(X):
    """Return the sample covariance matrix of the columns of X.

    The columns are mean-centered here before forming ``X.T @ X``, so the
    result is a true covariance matrix whether or not the caller centered the
    data beforehand (for already-centered input the centering is a no-op up to
    floating-point rounding). The unbiased ``1 / (m - 1)`` normalisation is
    used, where ``m`` is the number of rows.

    Parameters
    ----------
    X : ndarray or DataFrame
        Samples in rows, features in columns.

    Returns
    -------
    ndarray or DataFrame
        Square matrix with one row/column per feature. A DataFrame input
        yields a DataFrame labelled by the original column names.

    Raises
    ------
    ValueError
        If X has no elements, or has fewer than two rows (the ``m - 1``
        denominator would be zero).
    """
    if X.size == 0:
        raise ValueError("Input array X is empty.")
    m = X.shape[0]
    if m < 2:
        raise ValueError("At least two samples are required to compute a covariance matrix.")
    # Center explicitly: X.T @ X is only a covariance when column means are 0.
    centered = X - np.mean(X, axis=0)
    return (centered.T @ centered) / (m - 1)

In [81]:
def qr_decomposition_householder(A, max_iter=100, tol=1e-8):
    """Factor A as ``Q @ R`` using Householder reflections.

    Each processed column is reflected onto ``+||x|| * e1``, so the diagonal
    entries of R that are touched come out non-negative.

    Parameters
    ----------
    A : ndarray or DataFrame, shape (m, n)
        Matrix to factor. It is copied and converted to float; the input is
        never modified.
    max_iter : int, optional
        Retained for backward compatibility only and ignored. Householder QR
        is a direct method that needs exactly ``min(m, n)`` reflections;
        capping that number would silently return a non-triangular R.
    tol : float, optional
        A column whose entries on and below the diagonal are all smaller than
        ``tol`` in magnitude is treated as already zero and left untouched.

    Returns
    -------
    Q : ndarray, shape (m, m)
        Orthogonal matrix.
    R : ndarray, shape (m, n)
        Upper-triangular (upper-trapezoidal if m != n) factor. Entries below
        the diagonal are left at floating-point noise level rather than being
        forced to exactly zero.

    Raises
    ------
    ValueError
        If A is not two-dimensional.
    """
    if isinstance(A, pd.DataFrame):
        A = A.values

    # Float copy: integer input would otherwise break the in-place arithmetic.
    R = np.array(A, dtype=float)
    if R.ndim != 2:
        raise ValueError("Input A must be a 2-D matrix.")

    n_rows, n_cols = R.shape
    Q = np.eye(n_rows)

    for i in range(min(n_rows, n_cols)):
        x = R[i:, i]
        if np.max(np.abs(x)) < tol:
            # Negligible column: nothing to eliminate here, but later columns
            # still need their own reflections, so skip instead of stopping.
            continue

        head = x[0]
        tail_sq = float(np.dot(x[1:], x[1:]))
        if tail_sq == 0.0 and head >= 0.0:
            # x already equals ||x|| * e1; the reflector would be the zero
            # vector and normalising it would produce NaNs.
            continue

        length = np.sqrt(head * head + tail_sq)
        u = x.copy()
        if head > 0:
            # Algebraically head - length, rewritten to avoid catastrophic
            # cancellation when the sub-diagonal part is tiny.
            u[0] = -tail_sq / (head + length)
        else:
            u[0] = head - length
        u /= norm(u)

        # Apply H = I - 2 u u^T to the affected rows of R and columns of Q
        # only, instead of building the full n_rows x n_rows reflector.
        R[i:, :] -= 2.0 * np.outer(u, u @ R[i:, :])
        Q[:, i:] -= 2.0 * np.outer(Q[:, i:] @ u, u)
    return Q, R

In [82]:
def eigen_decomp(A, max_iter=100, tol=1e-8):
    """Approximate the eigen-decomposition of A with the unshifted QR algorithm.

    Repeatedly factors ``A_k = Q R`` and forms ``A_{k+1} = R Q`` until the
    off-diagonal part of ``A_k`` is within ``tol`` of zero or ``max_iter``
    iterations have run. The accumulated product of the Q factors is returned
    as the eigenvector matrix, which is only meaningful when A is symmetric
    (as a covariance matrix is).

    Parameters
    ----------
    A : ndarray or DataFrame, shape (n, n)
        Square matrix; copied and converted to float, never modified.
    max_iter : int, optional
        Maximum number of QR iterations.
    tol : float, optional
        Absolute tolerance on the off-diagonal entries used as the
        convergence test.

    Returns
    -------
    eigenvalues : ndarray, shape (n,)
        Diagonal of the final iterate, in the order the iteration produced
        them (not explicitly sorted). Returned as a writable copy.
    eigenvectors : ndarray, shape (n, n)
        Unit-norm columns; column ``i`` corresponds to ``eigenvalues[i]``.

    Raises
    ------
    ValueError
        If A is not a square 2-D matrix.

    Warns
    -----
    RuntimeWarning
        If the off-diagonal tolerance was not reached within ``max_iter``
        iterations; the returned values are then only approximate.
    """
    import warnings  # local import: only needed for the non-convergence notice

    # Float ndarray copy so integer input does not break the QR step.
    A_k = np.array(A, dtype=float)
    if A_k.ndim != 2 or A_k.shape[0] != A_k.shape[1]:
        raise ValueError("Input matrix A must be square.")

    n = A_k.shape[0]
    Q_total = np.eye(n)

    converged = False
    for _ in range(max_iter):
        Q, R = qr_decomposition_householder(A_k)
        A_k = R @ Q
        Q_total = Q_total @ Q

        off_diagonal = A_k - np.diag(np.diagonal(A_k))
        if np.allclose(off_diagonal, 0, atol=tol):
            converged = True
            break

    if not converged:
        warnings.warn(
            f"eigen_decomp did not converge within {max_iter} iterations "
            f"(tol={tol}); results may be inaccurate.",
            RuntimeWarning,
            stacklevel=2,
        )

    # np.diagonal returns a read-only view; hand callers an independent copy.
    eigenvalues = np.diagonal(A_k).copy()
    # Normalise every column to unit length in one vectorised step.
    eigenvectors = Q_total / norm(Q_total, axis=0)

    return eigenvalues, eigenvectors

In [83]:
def pca(X, threshold):
    """Project X onto the fewest principal components explaining `threshold` variance.

    The data are standardized, the covariance matrix of the standardized data
    is eigen-decomposed, and the leading eigenvectors are kept until their
    cumulative share of the total variance reaches ``threshold``.

    Parameters
    ----------
    X : ndarray or DataFrame
        Samples in rows, features in columns.
    threshold : float
        Target cumulative explained-variance ratio (e.g. 0.90). If the target
        is never reached (for instance 1.0 missed by floating-point rounding,
        or a value above 1), all components are kept.

    Returns
    -------
    Z : result of ``np.dot(X_std, V_k)``
        Standardized data projected onto the retained components.
    V_k : ndarray, shape (n_features, n_components)
        Retained eigenvectors as columns, ordered by decreasing eigenvalue.
    n_components : NumPy integer
        Number of retained components.
    """
    X_std = standardize(X)
    cov_matrix = covariance_matrix(X_std)
    eigenvalues, eigenvectors = eigen_decomp(cov_matrix)

    # The cumulative-variance cut below is only valid for eigenvalues in
    # descending order, which the QR iteration does not guarantee. The stable
    # sort leaves an already-descending result untouched.
    order = np.argsort(-eigenvalues, kind="stable")
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    total_variance = np.sum(eigenvalues)
    variance_ratio = eigenvalues / total_variance
    cumulative_variance_ratio = np.cumsum(variance_ratio)

    reached = cumulative_variance_ratio >= threshold
    if np.any(reached):
        n_components = np.argmax(reached) + 1
    else:
        # argmax of an all-False mask is 0, which would wrongly keep a single
        # component; an unreachable threshold means "keep everything".
        n_components = np.intp(eigenvalues.size)

    V_k = eigenvectors[:, :n_components]
    Z = np.dot(X_std, V_k)

    return Z, V_k, n_components

### Open the dataset

In [84]:
# Load the dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("pokindex_data.csv")
# Bare expression: rendered as the cell's output.
df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,WinningPercentage
0,70,90,45,15,45,50,36.290323
1,40,27,60,37,50,66,36.220472
2,75,75,60,83,60,60,39.344262
3,85,115,80,105,80,50,30.630631
4,83,106,65,86,65,85,66.406250
...,...,...,...,...,...,...,...
195,50,65,64,44,48,43,21.969697
196,60,85,69,65,79,80,57.600000
197,45,50,43,40,38,62,40.441176
198,55,45,50,45,65,80,55.462185


In [85]:
# Feature matrix: every column except the last one
# (WinningPercentage in the frame displayed above).
X_df = df.iloc[:, :-1]
X_df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,70,90,45,15,45,50
1,40,27,60,37,50,66
2,75,75,60,83,60,60
3,85,115,80,105,80,50
4,83,106,65,86,65,85
...,...,...,...,...,...,...
195,50,65,64,44,48,43
196,60,85,69,65,79,80
197,45,50,43,40,38,62
198,55,45,50,45,65,80


### Standardize the dataset

In [86]:
# Column-wise z-scores of the feature matrix.
std_X = standardize(X_df)
std_X

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,0.152909,0.405283,-0.922707,-1.788067,-1.045263,-0.649041
1,-1.180600,-1.596508,-0.433985,-1.110710,-0.852552,-0.094897
2,0.375161,-0.071334,-0.433985,0.305580,-0.467131,-0.302701
3,0.819664,1.199645,0.217644,0.982936,0.303712,-0.649041
4,0.730763,0.913675,-0.271078,0.397947,-0.274420,0.563148
...,...,...,...,...,...,...
195,-0.736097,-0.389078,-0.303659,-0.895188,-0.929636,-0.891478
196,-0.291594,0.246411,-0.140752,-0.248621,0.265170,0.389979
197,-0.958349,-0.865695,-0.987870,-1.018344,-1.315058,-0.233433
198,-0.513846,-1.024568,-0.759800,-0.864399,-0.274420,0.389979


### Get the covariance matrix

In [87]:
# Covariance of the standardized features (one row/column per feature).
cov_matrix = covariance_matrix(std_X)
cov_matrix

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
HP,1.005025,0.555958,0.407711,0.423108,0.414206,0.250041
Attack,0.555958,1.005025,0.492852,0.450232,0.28881,0.392548
Defense,0.407711,0.492852,1.005025,0.265119,0.592316,-0.052182
Sp. Atk,0.423108,0.450232,0.265119,1.005025,0.527509,0.506228
Sp. Def,0.414206,0.28881,0.592316,0.527509,1.005025,0.193899
Speed,0.250041,0.392548,-0.052182,0.506228,0.193899,1.005025


In [88]:
# Single Householder QR factorization of the covariance matrix, shown for inspection.
Q, R = qr_decomposition_householder(cov_matrix)
print(Q)
print(R)

[[ 0.72940987 -0.54500747 -0.24840545 -0.30687205 -0.12216702  0.01129624]
 [ 0.40349392  0.75994714 -0.16466594 -0.32524921  0.14040878 -0.32719482]
 [ 0.29590123  0.18570912  0.78317796 -0.10122822 -0.36807509  0.34476142]
 [ 0.3070758   0.09953746 -0.10013326  0.78853923 -0.41568206 -0.30196751]
 [ 0.30061505 -0.13379389  0.38231174  0.32291456  0.79276163 -0.11323736]
 [ 0.18147012  0.25129826 -0.37632858  0.25244288  0.16917076  0.81850378]]
[[ 1.37786061e+00  1.25319031e+00  1.04364078e+00  1.12779520e+00
   1.09322398e+00  7.21453556e-01]
 [ 6.88979836e-17  6.57112083e-01  2.73006452e-01  3.17464948e-01
   7.05012515e-02  4.29358093e-01]
 [-2.21615860e-17  1.46132234e-18  8.24219748e-01 -6.10760271e-02
   5.71882787e-01 -5.22398205e-01]
 [ 9.00769797e-18  2.21031363e-17  1.46288764e-18  7.87520880e-01
   5.08444916e-01  5.16381194e-01]
 [-5.30781861e-17 -1.59200093e-17  8.03060225e-17 -1.13869465e-16
   3.82203678e-01  1.57084194e-01]
 [-7.44354643e-18  7.97591808e-18 -8.7140215

In [89]:
# Eigenvalues and eigenvectors of the covariance matrix via the QR iteration.
eigenvalues, eigenvectors = eigen_decomp(cov_matrix)

# Column i of `eigenvectors` pairs with eigenvalues[i].
print(eigenvalues)
print(eigenvectors)

[2.94914287 1.20231921 0.77529324 0.51470719 0.37558598 0.21310226]
[[ 0.43354602 -0.06079581  0.39576925 -0.77440082  0.17656095  0.14435383]
 [ 0.44686717  0.06341416  0.57662133  0.39600225 -0.21287504 -0.51152972]
 [ 0.3863902  -0.57758994  0.03447059  0.41114815  0.10449577  0.57960512]
 [ 0.44243289  0.30046001 -0.38034996 -0.09211424 -0.71540397  0.22141129]
 [ 0.42577579 -0.27598307 -0.6008831  -0.07390754  0.31323149 -0.52717316]
 [ 0.29328878  0.7015907  -0.06292642  0.2459582   0.55015842  0.23371506]]


### Get the Z dataset, transformation matrix, and optimal k for PCA

In [90]:
# Run PCA on the raw features (pca standardizes internally), keeping enough
# components to reach a 0.90 cumulative explained-variance ratio.
Z, V_k, n_components = pca(X_df, 0.90)

# Wrap the projected data in a DataFrame with one PC<k> column per retained component.
principal_components = pd.DataFrame(data=Z, columns=[f"PC{i+1}" for i in range(n_components)])
principal_components

Unnamed: 0,PC1,PC2,PC3,PC4
0,-1.535627,-0.154778,1.611419,-0.254967
1,-2.275202,0.056187,-0.462071,0.245586
2,-0.189388,0.231696,0.275896,-0.565281
3,1.349380,-0.343314,0.508127,-0.342828
4,0.844757,0.760486,0.784812,-0.193403
...,...,...,...,...
195,-1.663667,-0.442388,0.429042,0.223010
196,0.046590,0.240373,-0.067483,0.364742
197,-2.262974,0.467138,0.279696,0.126750
198,-1.359104,0.494743,-0.351214,-0.124375


In [91]:
# Transformation matrix returned by pca: one column per retained component.
V_k

array([[ 0.43354602, -0.06079581,  0.39576925, -0.77440082],
       [ 0.44686717,  0.06341416,  0.57662133,  0.39600225],
       [ 0.3863902 , -0.57758994,  0.03447059,  0.41114815],
       [ 0.44243289,  0.30046001, -0.38034996, -0.09211424],
       [ 0.42577579, -0.27598307, -0.6008831 , -0.07390754],
       [ 0.29328878,  0.7015907 , -0.06292642,  0.2459582 ]])

In [92]:
# Number of components pca kept for the 0.90 threshold.
n_components

np.int64(4)