# PCA - Principal Component Analysis
    Importing all needed modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA as PCA_sk
from sklearn.preprocessing import StandardScaler

    Load iris Data Set.

In [None]:
# Load Iris as an (X, y) tuple: iris[0] holds the features, iris[1] the class labels.
iris = load_iris(return_X_y=True)

#print(load_iris()['data'])
#print(load_iris()['DESCR'])
#print(load_iris()['target_names'])
#iris
# Bare expression: when run as a notebook cell this displays the full dataset object.
load_iris()


    Plot the first three columns of the Data Set

In [None]:
class PlotDataSet:
    __data_type = (np.ndarray, list, tuple)
    __style = ('.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X')

    def __init__(self,
                 data: np.ndarray|list|tuple,
                 target: np.ndarray|list|tuple
                 ):
        self.verify_dataset(data=data, obj='data')
        self.verify_dataset(data=target, obj='target')   

        self.__all_data = pd.DataFrame(np.hstack((data, target.reshape(target.size, 1))))

    @classmethod
    def verify_dataset(cls,
                       data: np.ndarray,
                       obj: str
                       ) -> None:
        if type(data) not in cls.__data_type:
            raise TypeError(f'\'{obj}\' isn\'t of the required type')
    
    def create_figure(self
                      ) -> None:
        self.__figure = plt.figure()
        self.__ax = self.__figure.add_subplot(projection='3d')

        for i, t in enumerate(list(self.__all_data.groupby([3]))):
            #x = np.array(t[1])[:, 0]
            #y = np.array(t[1])[:, 1]
            #z = np.array(t[1])[:, 2]

            #self.__ax.scatter(x, y, z, marker=self.__style[i], c=np.array(t[1])[:, 3])
            self.__ax.scatter(self.__all_data.values[:, 0], self.__all_data.values[:, 1], self.__all_data.values[:, 2], marker=self.__style[i], c=self.__all_data.values[:, 3])
        
        self.__ax.set_xlabel('X Label')
        self.__ax.set_ylabel('Y Label')
        self.__ax.set_zlabel('Z Label')

    def plot(self
             ) -> None:
        plt.show()

# Plot only the first three feature columns (one per 3-D axis) against the class labels.
pdt = PlotDataSet(data=iris[0][:, :3],
                  target=iris[1])
pdt.create_figure()
pdt.plot()

    Implementation of PCA from scratch

In [None]:
class PCA:
    """Principal Component Analysis via SVD or eigen-decomposition.

    ``method='svd'`` projects mean-centred data onto the leading right
    singular vectors (PCA on the covariance matrix).
    ``method='eigen'`` decomposes the correlation matrix and therefore
    projects standardised data (zero mean, unit variance per feature).

    Args:
        n_componente: Number of principal components to keep.
        method: ``'svd'`` or ``'eigen'``.

    Raises:
        ValueError: If ``method`` is not one of ``PCA.methods``.

    Attributes set by ``fit`` when ``method='eigen'``:
        eig_vals, eig_vecs: Eigen-decomposition of the correlation matrix.
        eig_pairs: ``(|eigenvalue|, eigenvector)`` tuples, largest first.
        explained_variance_ratio: Per-component share of variance, in percent.
        cumulative_variance_ratio: Running sum of the above.
        matrix_w: Projection matrix of shape (n_features, n_componente).
    """
    methods: list[str] = ['svd', 'eigen']

    def __init__(self,
                 n_componente: int=2,
                 method: str='svd'
                 ) -> None:
        self.verify_method(method=method)

        self.__n_cut = n_componente
        self.__method = method

    @classmethod
    def verify_method(cls,
                      method
                      ) -> None:
        """Raise ``ValueError`` unless ``method`` is listed in ``methods``."""
        if method not in cls.methods:
            raise ValueError(f"'{method}' is not a method implemented in this model")

    def fit(self,
            X: np.ndarray
            ) -> 'PCA':
        """Learn the projection from ``X`` (n_samples, n_features); return ``self``."""
        X = np.asarray(X, dtype=float)
        # Principal components are directions of maximal variance, so the
        # decomposition must see centred data; the mean is kept for transform().
        self.__mean = X.mean(axis=0)

        if self.__method == self.methods[0]:
            # Only the right singular vectors are needed; the reduced form
            # avoids building the n_samples x n_samples U matrix.
            _, _, Vt = np.linalg.svd(X - self.__mean, full_matrices=False)
            self.__V = Vt[:self.__n_cut, :]

        elif self.__method == self.methods[1]:
            # The correlation matrix is the covariance of standardised data,
            # so the per-feature scale is stored to standardise in transform().
            self.__scale = X.std(axis=0)
            corr_mat = np.corrcoef(X.T)

            # eigh is the routine for symmetric matrices: real eigenvalues
            # and orthonormal eigenvectors.
            self.eig_vals, self.eig_vecs = np.linalg.eigh(corr_mat)
            self.eig_pairs = [
                (np.abs(val), self.eig_vecs[:, i])
                for i, val in enumerate(self.eig_vals)
            ]
            self.eig_pairs.sort(key=lambda x: x[0], reverse=True)

            total = sum(self.eig_vals)
            self.explained_variance_ratio = [
                (i/total) * 100
                for i in sorted(self.eig_vals, reverse=True)
            ]
            self.cumulative_variance_ratio = np.cumsum(self.explained_variance_ratio)

            self.matrix_w = np.hstack(
                [
                    self.eig_pairs[i][1].reshape(np.size(X, axis=1), 1)
                    for i in range(self.__n_cut)
                ]
            )

        return self

    def transform(self,
                  X: np.ndarray
                  ) -> np.ndarray:
        """Project ``X`` onto the components learned by ``fit``.

        The centring (and, for ``'eigen'``, scaling) statistics learned in
        ``fit`` are applied to ``X`` before projecting.
        """
        X = np.asarray(X, dtype=float)

        if self.__method == self.methods[0]:
            return (X - self.__mean).dot(self.__V.T)

        elif self.__method == self.methods[1]:
            return ((X - self.__mean) / self.__scale).dot(self.matrix_w)
        
# One from-scratch model per decomposition method, both keeping two components.
eigen, svd = (
    PCA(n_componente=2, method=method_name)
    for method_name in ('eigen', 'svd')
)

    PCA from sklearn
    

In [None]:
# scikit-learn's PCA, kept to two components like the from-scratch models.
pca = PCA_sk(n_components=2)

     StandardScaler from sklearn

In [None]:
# Standardise the Iris features with scikit-learn's StandardScaler.
scaler = StandardScaler()
# NOTE(review): iris_scaled is not referenced by the other visible cells, which
# all use the raw iris[0] — confirm whether the scaled data was meant to be used.
iris_scaled = scaler.fit_transform(iris[0])

    Train all PCA instances

In [None]:
# Fit all three models on the same raw (unscaled) feature matrix.
eigen.fit(X=iris[0])
svd.fit(X=iris[0])
# Last expression of the cell: a notebook displays the fitted estimator.
pca.fit(X=iris[0])

    Use all PCA instances

In [None]:
# Project the same feature matrix with each fitted model, in a fixed order.
eigen_iris, svd_iris, pca_iris = (
    fitted.transform(X=iris[0])
    for fitted in (eigen, svd, pca)
)

#eigen_iris[:, :]

In [None]:
x = svd_iris
y = iris[1]

fig, axs = plt.subplots(4, 1, figsize=(5, 20))

# (points, title) for each stacked panel, top to bottom; every panel shows
# the first two columns of its points, coloured by class label.
panels = (
    (iris[0], 'Iris'),
    (eigen_iris, 'Eigen'),
    (svd_iris, 'SVD'),
    (pca_iris, 'PCA from Sklearn'),
)
for ax, (points, title) in zip(axs, panels):
    ax.scatter(points[:, 0], points[:, 1], c=iris[1])
    ax.set_title(title)

###   Concluzia în baza graficelor
     După părerea mea, după utilizarea tuturor algoritmilor de PCA și amplasarea rezultatelor pe grafic, acesta arată cât de asemănătoare sunt datele între ele, adică ce corelație au. 
     Și aici, după cum se vede, clasa Setosa are cea mai mică corelație cu celelalte, pe când Versicolour și Virginica sunt practic aproape una de alta, ceea ce arată că au o anumită corelație. 
     Astfel, Iris-Versicolour și Iris-Virginica sunt mai folositoare în acest dataset.

    Reantrenarea modelului PCA din sklearn cu n_components=1

In [None]:
# Refit scikit-learn's PCA keeping a single component.
pca2 = PCA_sk(n_components=1)
pca2.fit(X=iris[0])
pca2_iris = pca2.transform(X=iris[0])

# Read the ratio from the one-component model fitted above (pca2), not from the
# earlier two-component `pca`, so the number of bars matches the model.
explained_variance_ratio = pca2.explained_variance_ratio_
# 1-based component numbers, derived from the data instead of hard-coded.
component_ids = range(1, len(explained_variance_ratio) + 1)

plt.bar(component_ids, explained_variance_ratio, align='center')
plt.title('Raportul de Varianță')
plt.xlabel('Componenta Principală')
plt.ylabel('Raportul de Varianță')
plt.xticks(component_ids)
plt.grid(True)
plt.show()

###   Concluzia în baza acestui grafic
    ...