# Principal Component Analysis

PCA(주성분 분석)란 데이터 간의 차이(분산)를 가장 잘 나타내는 요소를 찾는 방법이다.\
주성분 찾기, 데이터 압축(차원 축소), 노이즈 제거 등 다양한 분야에 사용된다.\
PCA는 데이터의 분산을 최대한 보존하면서 서로 직교하는 새 기저를 찾아 \
고차원 공간의 표본들을 선형 연관성이 없는 저차원 공간으로 변환하는 기법이다.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset and place its measurements in a DataFrame, adding
# the target array as a 'species' column.
iris = load_iris()
iris_pd = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(species=iris.target)
iris_pd.head()

In [None]:
# Scatter plots for a chosen subset of the raw features, coloured by species.
sns.pairplot(
    data=iris_pd,
    x_vars=['sepal length (cm)', 'petal width (cm)'],
    y_vars=['petal length (cm)', 'sepal width (cm)'],
    hue='species',
    palette='Accent',
    height=3,
)

In [None]:
# Standardize the iris features with StandardScaler before running PCA.
iris_ss = StandardScaler().fit_transform(iris.data)
iris_ss[:3]  # peek at the first three scaled rows

In [None]:
def get_pca_data(ss_data, n_components=2):
    """Fit a PCA model on ``ss_data`` and project that same data onto it.

    Args:
        ss_data: Feature matrix to fit on (callers in this file pass
            StandardScaler output).
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A tuple ``(projected, model)``: the transformed data and the fitted
        PCA object.
    """
    model = PCA(n_components=n_components).fit(ss_data)
    projected = model.transform(ss_data)
    return projected, model

In [None]:
# Project the scaled iris data onto two principal components; keep the
# fitted PCA object in `pca` for inspection in the following cells.
iris_pca, pca = get_pca_data(iris_ss, n_components=2)
iris_pca.shape  # display the shape of the projected array

In [None]:
# Per-feature mean that the PCA estimated from the data it was fitted on.
pca.mean_

In [None]:
# Principal axes found by the fit (one row per retained component).
pca.components_

In [None]:
def get_pd_from_pca(pca_data, cols=None):
    """Wrap a PCA-projected array in a DataFrame with one named column per component.

    Args:
        pca_data: 2-D array-like of shape (n_samples, n_components).
        cols: Column labels, one per component. When omitted, labels are
            generated as ``pca_component_1`` ... ``pca_component_k`` to match
            the width of ``pca_data`` (for two components this equals the
            previous fixed default).

    Returns:
        A pandas DataFrame holding ``pca_data`` under the chosen labels.
    """
    # A None sentinel replaces the former list default: a list literal in the
    # signature is a single object shared by every call.
    if cols is None:
        n_components = np.shape(pca_data)[1]
        cols = [f'pca_component_{i}' for i in range(1, n_components + 1)]
    return pd.DataFrame(pca_data, columns=cols)

In [None]:
# Wrap the two-component projection in a DataFrame and attach the class labels.
iris_pd_pca = get_pd_from_pca(iris_pca).assign(species=iris.target)
iris_pd_pca.head(3)

In [None]:
# The four features have been condensed into two; plot one component
# against the other, coloured by species.
sns.pairplot(
    data=iris_pd_pca,
    x_vars=['pca_component_1'],
    y_vars=['pca_component_2'],
    hue='species',
    palette='Accent',
    height=5,
)

In [None]:
# Original note: even when reduced to two axes, 95.8% of the total is
# still represented.

def print_variance_ratio(pca):
    """Print the explained-variance ratios of ``pca`` and their sum.

    Args:
        pca: Object exposing ``explained_variance_ratio_`` (callers in this
            file pass a fitted PCA).

    Returns:
        None. The two lines are written to standard output.
    """
    ratios = pca.explained_variance_ratio_
    print('variance_ratio : ', ratios)
    print('sum of variance_ratio : ', np.sum(ratios))

# Report the explained-variance ratios of the PCA object currently bound to `pca`.
print_variance_ratio(pca)

In [None]:
def rf_scores(x, y, cv=5):
    """Cross-validate a random forest on ``(x, y)`` and print the mean accuracy.

    Args:
        x: Feature matrix passed to ``cross_val_score``.
        y: Target labels passed to ``cross_val_score``.
        cv: Cross-validation setting forwarded as ``cv``.

    Returns:
        None. The mean of the per-fold accuracy scores is printed.
    """
    # Fixed random_state, so repeated calls build the forest from the same seed.
    forest = RandomForestClassifier(n_estimators=100, random_state=5)
    fold_accuracy = cross_val_score(forest, x, y, cv=cv, scoring='accuracy')

    print('Score : ', np.mean(fold_accuracy))

# Baseline: score the random forest on all standardized iris features.
rf_scores(iris_ss, iris.target)

In [None]:
# Same evaluation, using only the two PCA component columns as features.
pca_x = iris_pd_pca.loc[:, ['pca_component_1', 'pca_component_2']]
rf_scores(pca_x, iris.target)

# wine

In [None]:
# Source CSVs for the red and white wine-quality tables (semicolon-separated).
red_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-white.csv'

# Tag each table with a numeric colour label: 1.0 for red, 0.0 for white.
red_wine = pd.read_csv(red_url, sep=';').assign(color=1.)
white_wine = pd.read_csv(white_url, sep=';').assign(color=0.)

# Stack both tables (the row index is not reset), then separate the label
# from the features and standardize the features.
wine = pd.concat([red_wine, white_wine])
wine_y = wine['color']
wine_x = wine.drop(columns=['color'])
wine_ss = StandardScaler().fit_transform(wine_x)

In [None]:
# Original note: with only two principal components, less than 50% of the
# wine data is represented.
pca_wine, pca = get_pca_data(wine_ss, n_components=2)
print_variance_ratio(pca)

In [None]:
# Put the two-component wine projection in a DataFrame with the colour label.
# `.values` attaches the labels by position rather than by index.
pca_columns = ['pca_component_1', 'pca_component_2']
pca_wine_pd = pd.DataFrame(data=pca_wine, columns=pca_columns).assign(color=wine_y.values)

# The trailing semicolon is kept from the original statement.
sns.pairplot(
    data=pca_wine_pd,
    x_vars=['pca_component_1'],
    y_vars=['pca_component_2'],
    hue='color',
    height=5,
);

In [None]:
# Baseline: score the random forest on all standardized wine features.
rf_scores(wine_ss, wine_y)

In [None]:
# Same evaluation, using only the two PCA component columns as features.
pca_x = pca_wine_pd.loc[:, ['pca_component_1', 'pca_component_2']]
rf_scores(pca_x, wine_y)

In [None]:
# Repeat the experiment with three principal components.
# NOTE(review): the original note said three components "express 98% or
# more"; confirm against the printed variance-ratio sum. The figure may
# refer to the classifier score instead.
cols = ['pca_1', 'pca_2', 'pca_3']
pca_wine, pca = get_pca_data(wine_ss, n_components=len(cols))
print_variance_ratio(pca)

pca_wine_pd = get_pd_from_pca(pca_wine, cols=cols)
pca_x = pca_wine_pd[cols]
rf_scores(pca_x, wine_y)

In [None]:
# Build a separate table for plotting. Copy the component columns first:
# plain assignment would make pca_wine_plot the same object as pca_x, so
# adding 'color' here would also add the label column to the feature
# matrix that is passed to rf_scores.
pca_wine_plot = pca_x.copy()
pca_wine_plot['color'] = wine_y.values  # positional attach; wine_y's index is not unique
pca_wine_plot.head()

In [None]:
# Import kept from the original; presumably needed to register the '3d'
# projection on older Matplotlib releases (confirm before removing).
from mpl_toolkits.mplot3d import Axes3D

# One marker per colour label: index 0 -> '^', index 1 -> 'o'.
markers = ['^', 'o']

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

for i, marker in enumerate(markers):
    # Filter the rows for this label once and reuse them for all three axes.
    class_points = pca_wine_plot[pca_wine_plot['color'] == i]
    x_axis_data = class_points['pca_1']
    y_axis_data = class_points['pca_2']
    z_axis_data = class_points['pca_3']

    ax.scatter(x_axis_data, y_axis_data, z_axis_data, marker=marker, s=20, alpha=0.5)

# Set the camera elevation and azimuth.
ax.view_init(elev=30, azim=80)
plt.show()

In [None]:
import plotly.express as px

# Interactive 3-D scatter of the three components; the 'color' column
# drives both the marker colour and the marker symbol.
fig = px.scatter_3d(
    data_frame=pca_wine_plot,
    x='pca_1',
    y='pca_2',
    z='pca_3',
    color='color',
    symbol='color',
    opacity=0.4,
)
# Remove the outer margins so the plot fills the figure.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()