<a href="https://colab.research.google.com/github/RafaelCaballero/Julio24/blob/main/code/30PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introducción a la ciencia de datos con Python
Rafa Caballero



### Índice
[Carga de datos](#carga)<br>
[PCA para representación gráfica](#pca-graf)<br>
[TSNE](#tsne)<br>
[PCA para reducción de dimensiones](#reduc)<br>



<a name="carga"></a>
### Carga de datos

In [None]:
import pandas as pd
import numpy as np
# URL of the CSV file with the data set used in the rest of the notebook.
fich = "https://raw.githubusercontent.com/RafaelCaballero/tdm/master/datos/datoscajas.csv"

# Load the CSV into a DataFrame; the bare `df` shows it as the cell output.
df = pd.read_csv(fich)
df

In [None]:
# Count how many rows there are for each value of the "label" column.
df.label.value_counts()

Un poco de ruido

In [None]:
# Keep a copy of the data as it was before the noise columns are added.
dfo = df.copy()
# Add 20 noise columns (ruido_0 ... ruido_19) filled with random integers;
# the upper bound passed to randint grows with the column number.
for i in range(20):
    df[f"ruido_{i}"] = np.random.randint(0, (i + 1) * 200, size=len(df))
df

Estandarizar

In [None]:
from sklearn.preprocessing import StandardScaler

def escalar(df):
    """Return a copy of *df* with every non-label column standardized.

    Args:
        df: DataFrame with a "label" column plus the columns to scale.

    Returns:
        A new DataFrame with the same index as *df*, the scaled columns
        (output of ``StandardScaler.fit_transform``) and the original
        "label" column appended unchanged at the end.
    """
    scaler = StandardScaler()
    etiqs = ["label"]
    XColumns = [c for c in df.columns if c not in etiqs]

    # The scaler is fitted on the feature columns only, never on the label.
    S = scaler.fit_transform(df[XColumns])

    # Reuse the input index: column assignment below aligns on the index, so
    # a fresh 0..n-1 index would misalign (or turn into NaN) the labels of
    # any frame that was filtered, shuffled or otherwise re-indexed.
    df_s = pd.DataFrame(S, columns=XColumns, index=df.index)
    df_s["label"] = df["label"]
    return df_s

# Standardize both frames: the copy taken before adding noise (dfo) and the
# frame that carries the noise columns (df).
dfo_s = escalar(dfo)
df_s = escalar(df)

<a name="pca-graf"></a>
### PCA para representación gráfica

Reducimos los datos con PCA a 3 componentes y los representamos en 2D (con los 2 primeros ejes) y en 3D (con los 3 ejes)

In [None]:
from sklearn.decomposition  import PCA
from sklearn.manifold  import TSNE
import numpy as np
from mpl_toolkits.mplot3d.axes3d import Axes3D
from matplotlib import pyplot as plt, colors
import matplotlib
import seaborn as sns

def get_pca(df2, feat_cols, label, n_label):
    """Fit a 3-component PCA and plot the projected data in 2D and 3D.

    Prints the explained variance ratio and the (rounded) component matrix,
    then shows a 2D scatter of the first two components and a 3D scatter of
    all three.

    Args:
        df2: DataFrame with the feature columns and the label column. It is
            copied, so the caller's frame is not modified.
        feat_cols: Names of the columns used as PCA input.
        label: Name of the column used to color the points in both plots.
        n_label: Number of colors in the palette of the 2D plot; presumably
            the number of distinct values of ``label``.

    Returns:
        The fitted ``PCA`` object.
    """
    df = df2.copy()
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(df[feat_cols].values)
    df['pca-one'] = pca_result[:, 0]
    df['pca-two'] = pca_result[:, 1]
    df['pca-three'] = pca_result[:, 2]
    print('Explained variation per principal component:',pca.explained_variance_ratio_)

    print('Components\n',np.round(pca.components_,3))

    # 2D view: first two principal components, colored by label.
    plt.figure(figsize=(10, 10))
    sns.scatterplot(
        x="pca-one", y="pca-two",
        hue=label,
        palette=sns.color_palette("hls", n_label),
        data=df,
        legend="full",
        alpha=0.5
    )
    plt.show()

    # 3D view of the three components.
    fig = plt.figure(figsize=(14, 6))

    # projection='3d' makes this subplot a 3D axes.
    ax = fig.add_subplot(1, 2, 1, projection='3d')

    # Color by the column named in `label` (not a hard-coded "label"), so the
    # 3D plot agrees with the 2D one for any label column.
    # NOTE(review): `c=` is given the raw column, so this assumes the label
    # values are numeric (or valid colors) - confirm against the data.
    ax.scatter(df['pca-one'], df['pca-two'], df['pca-three'], c=df[label], marker='o')
    ax.set_xlabel('pca-one')
    ax.set_ylabel('pca-two')
    ax.set_zlabel('pca-three')
    plt.show()

    return pca






In [None]:
# PCA on the standardized data without noise columns (dfo_s).
etiqs = ["label"]
# Every column except the label is used as a feature.
XColumnso = [c for c in dfo_s.columns if c not in etiqs]
# The last argument (5) is the palette size passed as n_label.
pca_data = get_pca(dfo_s,XColumnso,"label",5)

In [None]:

# PCA on the standardized data that includes the noise columns (df_s).
etiqs = ["label"]
# Every column except the label is used as a feature.
XColumns = [c for c in df_s.columns if c not in etiqs]
# pca_data is overwritten here with the PCA fitted on the noisy data.
pca_data = get_pca(df_s,XColumns,"label",5)

<a name="reduc"></a>
### PCA para reducción de dimensiones

Vamos a representar el peso de cada columna para tener idea de cómo influyen los ejes generados

In [None]:
# Names given to the three principal components.
component_labels = ["pca-one","pca-two","pca-three"]
etiqs = ["label"]
XColumns = [c for c in df_s.columns if c not in etiqs]

# Transposed component matrix: one row per input column, one column per
# principal component.
df_components = pd.DataFrame(
    pca_data.components_.T,
    index=XColumns,
    columns=component_labels,
)
# One bar chart per component showing the weight of every input column.
for c in component_labels:
    print(c)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(XColumns, df_components[c].values, label=c)
    # Rotate the column names so they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(c)
    plt.show()

<a name="tsne"></a>
### TSNE

Los parámetros de TSNE vienen explicados en la [documentación](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)

In [None]:
from sklearn.manifold  import TSNE

def tsne(df,feat_cols,label,n_etiq):
    """Project the data to 2D with t-SNE and draw a scatter plot.

    Args:
        df: DataFrame with the feature columns and the label column. It is
            copied, so the caller's frame is not modified.
        feat_cols: Columns of *df* used as t-SNE input.
        label: Name of the column used to color the points.
        n_etiq: Number of colors in the palette; presumably the number of
            distinct values of ``label``.

    Returns:
        A copy of *df* with two extra columns, 'tsne-2d-one' and
        'tsne-2d-two', holding the embedding.
    """
    df_subset = df.copy()
    data_subset = df_subset[feat_cols].values
    # The iteration count is left at the library default (1000) instead of
    # passing n_iter=1000: scikit-learn 1.5 renamed that argument to
    # max_iter, so naming it ties the code to one side of the rename.
    model = TSNE(n_components=2, perplexity=10)
    tsne_results = model.fit_transform(data_subset)
    df_subset['tsne-2d-one'] = tsne_results[:, 0]
    df_subset['tsne-2d-two'] = tsne_results[:, 1]
    plt.figure(figsize=(8, 5))
    sns.scatterplot(
        x="tsne-2d-one", y="tsne-2d-two",
        hue=label,
        palette=sns.color_palette("hls", n_etiq),  # one color per label value
        data=df_subset,
        legend="full",
        alpha=0.3
    )
    return df_subset
# Run the 2D t-SNE on the first 12 columns of df (the frame that was not
# standardized), coloring by "label" with a 5-color palette.
# NOTE(review): confirm that df.columns[:12] contains only feature columns
# (not "label"); that depends on the column layout of the CSV.
df_subset = tsne(df,df.columns[:12],"label",5)

In [None]:
from mpl_toolkits.mplot3d.axes3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib
%matplotlib notebook

def tsne_3d(df,feat_cols,label="label"):
    """Project the data to 3D with t-SNE and draw a 3D scatter plot.

    Args:
        df: DataFrame with the feature columns and the label column. It is
            copied, so the caller's frame is not modified.
        feat_cols: Columns of *df* used as t-SNE input.
        label: Name of the column used to color the points. Defaults to
            "label", the column the function always used before.

    Returns:
        A copy of *df* with three extra columns, 'tsne-3d-one',
        'tsne-3d-two' and 'tsne-3d-three', holding the embedding.
    """
    df_subset = df.copy()
    data_subset = df_subset[feat_cols].values
    # The iteration count is left at the library default (1000) instead of
    # passing n_iter=1000: scikit-learn 1.5 renamed that argument to
    # max_iter, so naming it ties the code to one side of the rename.
    model = TSNE(n_components=3, perplexity=10)
    tsne_results = model.fit_transform(data_subset)
    df_subset['tsne-3d-one'] = tsne_results[:, 0]
    df_subset['tsne-3d-two'] = tsne_results[:, 1]
    df_subset['tsne-3d-three'] = tsne_results[:, 2]

    # 3D plot of the embedding.
    fig = plt.figure(figsize=(14, 6))

    # projection='3d' makes this subplot a 3D axes.
    ax = fig.add_subplot(1, 2, 1, projection='3d')

    # NOTE(review): `c=` is given the raw column, so this assumes the label
    # values are numeric (or valid colors) - confirm against the data.
    ax.scatter(
        df_subset['tsne-3d-one'],
        df_subset['tsne-3d-two'],
        df_subset['tsne-3d-three'],
        c=df_subset[label],
        marker='o',
    )
    ax.set_xlabel('tsne-3d-one')
    ax.set_ylabel('tsne-3d-two')
    ax.set_zlabel('tsne-3d-three')
    plt.show()
    return df_subset
# Run the 3D t-SNE on the first 12 columns of df (the frame that was not
# standardized).
# NOTE(review): confirm that df.columns[:12] contains only feature columns
# (not "label"); that depends on the column layout of the CSV.
df_subset = tsne_3d(df,df.columns[:12])