<a href="https://colab.research.google.com/github/RafaelCaballero/Julio24/blob/main/31PCA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introducción a la ciencia de datos con Python
Rafa Caballero


##### PCA como mejora del rendimiento

Vamos a ver la potencia de PCA para mejorar el rendimiento de modelos entrenados sobre datos con muchas columnas. En este caso se trata de imágenes de 10 tipos de objetos (conjunto CIFAR-10).

In [None]:
from keras.datasets import cifar10

# load_data() yields two (images, labels) pairs: the training split first,
# then the test split.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [None]:
# Disabled experiment: would append the test split to the training arrays.
# Left commented out; later cells use x_test / y_test as a separate split.
#import numpy as np
#x_train = np.concatenate( [x_train,x_test],axis=0)
#y_train = np.concatenate( [y_train,y_test],axis=0)

In [None]:
import numpy as np

# Distinct label values found in the training labels, and how many there are.
classes = np.unique(y_train)
nClasses = classes.shape[0]
print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)

Para relacionar etiquetas con objetos

In [None]:
# Maps each integer class label (its position in the list) to a readable name.
label_dict = dict(enumerate([
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]))

Una imagen al azar...

In [None]:
import random
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(figsize=(10, 5))

i = random.randint(0,len(x_train))
curr_img = np.reshape(x_train[i], (32,32,3))
ax.imshow(curr_img)
plt.title("(Label: " + str(label_dict[y_train[i][0]]) + ")")
plt.show()



Convertimos en Dataframe

In [None]:
import pandas as pd

# Flatten every image into a single row of 3072 values, one column per value.
x_train_flat = x_train.reshape((-1, 3072))
feat_cols = [f'pixel{idx}' for idx in range(x_train_flat.shape[1])]
df_cifar = pd.DataFrame(data=x_train_flat, columns=feat_cols)
# The label is appended last; the PCA cell drops it with iloc[:, :-1].
df_cifar['label'] = y_train
df_cifar

Tenemos 3073 columnas (3072 píxeles más la etiqueta). Vamos a dejarlo en solo unas pocas usando PCA.

In [None]:
from sklearn.decomposition import PCA

k = 15  # number of principal components to keep
pca_cifar = PCA(n_components=k)
# Fit on every column except the last one (the label) and project the data.
principalComponents_cifar = pca_cifar.fit_transform(df_cifar.iloc[:, :-1])
pc_names = [f'pc_{idx}' for idx in range(1, k + 1)]
principal_cifar_Df = pd.DataFrame(data=principalComponents_cifar, columns=pc_names)
principal_cifar_Df['y'] = y_train
principal_cifar_Df

In [None]:
# Print a summary of the raw-pixel DataFrame (columns, dtypes, memory usage).
df_cifar.info()

In [None]:
# Same summary for the PCA-reduced DataFrame (the pc_* columns plus 'y').
principal_cifar_Df.info()

Una representación gráfica sencilla de las dos primeras componentes principales

In [None]:
import seaborn as sns

# Scatter of the first two principal components, one hue per class label.
plt.figure(figsize=(16, 10))
hls_palette = sns.color_palette("hls", 10)
sns.scatterplot(
    data=principal_cifar_Df,
    x="pc_1",
    y="pc_2",
    hue="y",
    palette=hls_palette,
    legend="full",
    alpha=0.8,
)

In [None]:
# Install the ipython-autotime package and load its extension so that the
# execution time of each cell run from here on is reported.
!pip install ipython-autotime
%load_ext autotime

Prueba sin PCA

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer, mean_absolute_error
from sklearn.naive_bayes import GaussianNB

# Baseline: Gaussian Naive Bayes on all raw pixel columns, without PCA.
df_aux = df_cifar
yColumn = "label"
XColumns = [col for col in df_aux.columns if col != yColumn]
X = df_aux.loc[:, XColumns]
y = df_aux.loc[:, yColumn]

steps = [("GaussianNB", GaussianNB())]
pipeline = Pipeline(steps=steps)
# Each fold is scored with Cohen's kappa.
scorer = make_scorer(cohen_kappa_score)

# 10-fold cross-validation, one repetition; the cell displays the mean score.
cv = RepeatedKFold(n_splits=10, n_repeats=1)
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring=scorer, cv=cv)
scores.mean()

Ahora con PCA

In [None]:
# Same evaluation, but on the components computed beforehand by PCA over the
# whole training frame (the PCA step is outside the cross-validated pipeline).
df_aux = principal_cifar_Df
yColumn = "y"
XColumns = [col for col in df_aux.columns if col != yColumn]
X = df_aux.loc[:, XColumns]
y = df_aux.loc[:, yColumn]

steps = [("GaussianNB", GaussianNB())]
pipeline = Pipeline(steps=steps)
# Each fold is scored with Cohen's kappa.
scorer = make_scorer(cohen_kappa_score)

# 10-fold cross-validation, one repetition; the cell displays the mean score.
cv = RepeatedKFold(n_splits=10, n_repeats=1)
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring=scorer, cv=cv)
scores.mean()

Es un poco de trampa: habría que meter PCA dentro del pipeline. ¿Por qué?

In [None]:
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer, mean_absolute_error
from sklearn.naive_bayes import GaussianNB

# PCA is now a pipeline step, so it is re-fitted inside every fold on that
# fold's training rows only.
df_aux = df_cifar
yColumn = "label"
XColumns = [col for col in df_aux.columns if col != yColumn]
X = df_aux.loc[:, XColumns]
y = df_aux.loc[:, yColumn]

# NOTE(review): 10 components here, whereas the standalone PCA cell used
# k=15 -- confirm the mismatch is intended before comparing the scores.
steps = [("PCA", PCA(n_components=10)), ("GaussianNB", GaussianNB())]
pipeline = Pipeline(steps=steps)
# Each fold is scored with Cohen's kappa.
scorer = make_scorer(cohen_kappa_score)

# 10-fold cross-validation, one repetition; the cell displays the mean score.
cv = RepeatedKFold(n_splits=10, n_repeats=1)
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring=scorer, cv=cv)
scores.mean()

In [None]:
# Build the test DataFrame with the same layout as df_cifar: 3072 pixel
# columns followed by the label.
x_test_flat = x_test.reshape((-1, 3072))
feat_cols = [f'pixel{idx}' for idx in range(x_test_flat.shape[1])]
df_cifar_test = pd.DataFrame(data=x_test_flat, columns=feat_cols)
df_cifar_test['label'] = y_test
df_cifar_test

In [None]:
# Fit the pipeline on all of X, y (as left by the preceding cross-validation
# cell) so the resulting model can be applied to the test images.
modelo = pipeline.fit(X,y)

In [None]:
# Show one randomly chosen test image with its true label, then print the
# label the fitted model predicts for that same image.
fig, ax = plt.subplots(figsize=(10, 5))

# randrange excludes the upper bound. random.randint(0, len(x_test)) is
# inclusive, so it could return len(x_test) and index past the end.
i = random.randrange(len(x_test))
curr_img = np.reshape(x_test[i], (32,32,3))
ax.imshow(curr_img)
plt.title("(Label: " + str(label_dict[y_test[i][0]]) + ")")
# plt.show must be called (with parentheses); a bare `plt.show` does nothing.
plt.show()


# iloc[i:i+1] keeps a one-row DataFrame rather than a Series.
p = modelo.predict(df_cifar_test[XColumns].iloc[i:i+1,:])
print(label_dict[p[0]])

In [None]:
from tqdm import trange

# Estimate accuracy on N test images drawn at random (with replacement),
# keeping the true and predicted labels for the evaluation cell that follows.
aciertos=0
N=1000
y_real=[]
y_pred=[]
# Select the feature columns once instead of rebuilding them every iteration.
test_features = df_cifar_test[XColumns]
n_test = len(x_test)
for _ in trange(N):
  # randrange excludes the upper bound; randint(0, len(x_test)) is inclusive
  # and could index one past the end.
  i = random.randrange(n_test)
  real = y_test[i][0]
  predicho =  modelo.predict(test_features.iloc[i:i+1,:])[0]
  y_real.append(real)
  y_pred.append(predicho)
  if real==predicho:
    aciertos +=1

print()
# Second value is the uniform-guess rate: one over the number of classes the
# model knows (the original hard-coded 1/6, which does not match 10 classes).
print(aciertos/N, 1/len(modelo.classes_))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Confusion matrix over the sampled predictions, with rows and columns in the
# order of the model's classes.
cm = confusion_matrix(y_real, y_pred, labels=modelo.classes_)
# Tick labels are the (index, name) pairs of label_dict.
tick_labels = label_dict.items()
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tick_labels)

fig, ax = plt.subplots(figsize=(20, 5))
disp.plot(ax=ax)
plt.xticks(rotation=90)
plt.show()

# Per-class precision, recall and F1 for the same sample.
print(classification_report(y_real, y_pred))