In [None]:
# Data-handling libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
# Preprocessing and modelling
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
# Warnings: the 'ignore' filter below silences every warning for the rest of
# the session, including deprecation notices from the libraries above
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the iris dataset through statsmodels' Rdatasets helper and keep the
# table it carries in `.data` for the rest of the notebook
iris = sm.datasets.get_rdataset(dataname = 'iris')
datos = iris.data
datos

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [None]:
# Mean of every numeric column (non-numeric columns such as Species are dropped)
solo_numericas = datos.select_dtypes(include = np.number)
print('Media de cada variable')
solo_numericas.mean()

Media de cada variable


Unnamed: 0,0
Sepal.Length,5.843333
Sepal.Width,3.057333
Petal.Length,3.758
Petal.Width,1.199333


In [None]:
# Standard deviation of every numeric column (pandas' default .std())
solo_numericas = datos.select_dtypes(include = np.number)
print('Desviación estándar de cada variable')
solo_numericas.std()

Desviación estándar de cada variable


Unnamed: 0,0
Sepal.Length,0.828066
Sepal.Width,0.435866
Petal.Length,1.765298
Petal.Width,0.762238


In [None]:
# Predictors: the four sepal/petal measurements
features = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
x = datos[features]
# Target: the species label, kept as a one-column DataFrame
y = datos[['Species']]

In [None]:
# Preview the first five rows of the feature matrix
x.head(n = 5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Preview the first five rows of the target column
y.head(n = 5)

Unnamed: 0,Species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


In [None]:
# Standardize each feature to zero mean and unit variance so that no variable
# dominates the PCA merely because of its scale.
# fit_transform() learns the per-column mean/std and applies them in a single
# call, so `datos_escalados` only ever holds the scaled array (never the
# scaler object itself).
escalar = StandardScaler()
datos_escalados = escalar.fit_transform(x)
# Preview only the first rows instead of printing the whole array
datos_escalados[:5]

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

In [None]:
# PCA estimator configured to keep three principal components
pca_model = PCA(n_components=3)

In [None]:
# Fit the PCA on the standardized feature matrix
pca_model.fit(X = datos_escalados)

In [None]:
# Principal axes of the fitted model: one row per component, one column per
# input feature
pca_model.components_

array([[ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654],
       [ 0.37741762,  0.92329566,  0.02449161,  0.06694199],
       [ 0.71956635, -0.24438178, -0.14212637, -0.63427274]])

In [None]:
# Loadings table: one row per principal component, one column per original
# feature. Row labels are derived from the fitted model so the table stays
# valid if the number of components requested from PCA is changed.
etiquetas_pc = [f'PC{i + 1}' for i in range(pca_model.n_components_)]
pd.DataFrame(
    data = pca_model.components_,
    columns = x.columns,
    index = etiquetas_pc
)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
PC1,0.521066,-0.269347,0.580413,0.564857
PC2,0.377418,0.923296,0.024492,0.066942
PC3,0.719566,-0.244382,-0.142126,-0.634273


In [None]:
# Project the standardized observations onto the principal components.
# pca_model.transform() applies the fitted components to the same scaled
# matrix the model was trained on, rather than re-scaling `x` a second time
# and multiplying by hand. transform() also subtracts the fitted mean, which
# is numerically zero for standardized data, so the scores are unchanged.
proyecciones = pd.DataFrame(
    pca_model.transform(datos_escalados),
    # Column labels follow the number of components actually fitted
    columns = [f'PC{i + 1}' for i in range(pca_model.n_components_)],
    # Keep the original row labels so the scores align with `datos`
    index = x.index
)
proyecciones

Unnamed: 0,PC1,PC2,PC3
0,-2.264703,0.480027,0.127706
1,-2.080961,-0.674134,0.234609
2,-2.364229,-0.341908,-0.044201
3,-2.299384,-0.597395,-0.091290
4,-2.389842,0.646835,-0.015738
...,...,...,...
145,1.870503,0.386966,-0.256274
146,1.564580,-0.896687,0.026371
147,1.521170,0.269069,-0.180178
148,1.372788,1.011254,-0.933395


In [None]:
# First feature set: scores on the first two principal components
conjunto_1 = proyecciones.loc[:, ['PC1', 'PC2']]
conjunto_1.head()

Unnamed: 0,PC1,PC2
0,-2.264703,0.480027
1,-2.080961,-0.674134
2,-2.364229,-0.341908
3,-2.299384,-0.597395
4,-2.389842,0.646835


In [None]:
# Second feature set: scores on the first three principal components
conjunto_2 = proyecciones.loc[:, ['PC1', 'PC2', 'PC3']]
conjunto_2.head()

Unnamed: 0,PC1,PC2,PC3
0,-2.264703,0.480027,0.127706
1,-2.080961,-0.674134,0.234609
2,-2.364229,-0.341908,-0.044201
3,-2.299384,-0.597395,-0.09129
4,-2.389842,0.646835,-0.015738


In [None]:
# Attach the species label to the first two principal-component scores.
# The frame is built from `proyecciones` rather than from `conjunto_1` itself,
# so re-running this cell does not append a second `Species` column.
conjunto_1 = pd.concat([proyecciones[['PC1', 'PC2']], datos[['Species']]], axis = 1)
conjunto_1

Unnamed: 0,PC1,PC2,Species
0,-2.264703,0.480027,setosa
1,-2.080961,-0.674134,setosa
2,-2.364229,-0.341908,setosa
3,-2.299384,-0.597395,setosa
4,-2.389842,0.646835,setosa
...,...,...,...
145,1.870503,0.386966,virginica
146,1.564580,-0.896687,virginica
147,1.521170,0.269069,virginica
148,1.372788,1.011254,virginica


In [None]:
# Attach the species label to the first three principal-component scores.
# The frame is built from `proyecciones` rather than from `conjunto_2` itself,
# so re-running this cell does not append a second `Species` column.
conjunto_2 = pd.concat([proyecciones[['PC1', 'PC2', 'PC3']], datos[['Species']]], axis = 1)
conjunto_2

Unnamed: 0,PC1,PC2,PC3,Species
0,-2.264703,0.480027,0.127706,setosa
1,-2.080961,-0.674134,0.234609,setosa
2,-2.364229,-0.341908,-0.044201,setosa
3,-2.299384,-0.597395,-0.091290,setosa
4,-2.389842,0.646835,-0.015738,setosa
...,...,...,...,...
145,1.870503,0.386966,-0.256274,virginica
146,1.564580,-0.896687,0.026371,virginica
147,1.521170,0.269069,-0.180178,virginica
148,1.372788,1.011254,-0.933395,virginica
