<a href="https://colab.research.google.com/github/RafaelCaballero/Julio25/blob/main/code/28kmeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introducción a la ciencia de datos con Python
Rafa Caballero


## 1 K-means (clustering de filas)

Ejemplo de partida

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Download the bus dataset (CSV) straight from the course repository.
DATA_URL = "https://raw.githubusercontent.com/RafaelCaballero/tdm/master/datos/bus.csv"
df = pd.read_csv(DATA_URL)

# Names of the eight columns I1..I8 that the cells below use as clustering features.
feat_cols = [f"I{i}" for i in range(1, 9)]

# Last expression of the cell: render the loaded frame.
df

### En busca de la K

¿Cuál es el número óptimo de grupos?

Primero lo hacemos a mano

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# Elbow method by hand: fit K-means once per candidate k and record the
# inertia (sum of squared distances from each sample to its closest centroid).
suma_de_cuadrados = []
K = range(1,15)
for k in K:
    # A fixed random_state makes the centroid initialisation, and therefore the
    # whole curve, identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)
    # fit() is enough here: only inertia_ is read, so the distance matrix that
    # fit_transform() would additionally return is not needed.
    kmeans.fit(df[feat_cols])
    suma_de_cuadrados.append(kmeans.inertia_)

# Plot inertia against k; the bend ("elbow") of the curve suggests a value for k.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(K, suma_de_cuadrados, color="blue")
ax.set_xlabel('Valores de k')
# inertia_ is a sum of *squared* distances, so the axis label says so.
ax.set_ylabel('Suma de distancias al cuadrado al centroide')
ax.grid(which='major', color='gray', linestyle='-')
ax.set_title('Método del codo')
plt.show()

Ahora con una librería

In [None]:
# Uncomment the next line if yellowbrick is not installed in the current
# environment (it is imported by the following cell).
#!pip install yellowbrick --upgrade

In [None]:
import warnings
# NOTE: this filter is global. It silences every warning for the rest of the
# kernel session, not only the ones raised by this cell.
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn import preprocessing
from yellowbrick.cluster import KElbowVisualizer


df2 =  df[feat_cols]  # df2 is df restricted to the feature columns of interest


# Model and visualizer.
# n_init and random_state are set explicitly so this cell matches the other
# KMeans calls in the notebook and produces the same plot on every run.
model = KMeans(n_init='auto', random_state=42)
# The visualizer refits the model for each k in the given range.
# NOTE(review): per the yellowbrick docs a 2-tuple is a half-open range, so
# (1, 12) should evaluate k = 1..11 - confirm against the installed version.
visualizer = KElbowVisualizer(model, k=(1,12))

visualizer.fit(df2)      # Fit the data to the visualizer
visualizer.show()

Utilizamos k=3, 3 grupos

In [None]:
# Final K-means model with the number of groups chosen above.
k=3
# random_state pins the centroid initialisation so the cluster labels (0, 1, 2)
# refer to the same groups every time the notebook is re-run.
model = KMeans(init='k-means++', n_clusters=k, n_init='auto', random_state=42)
# Only the fitted labels are used below, so fit() replaces fit_transform().
model.fit(df[feat_cols])

# New frame = every original column of df (including "Hora") plus the cluster
# label "C". assign() returns a copy, so df itself is left untouched.
df2 = df.assign(C=model.labels_)

# Show the first 40 labelled rows.
df2.head(40)

**Ejercicio** Queremos saber cuántos elementos hay en cada cluster

In [None]:
# Number of rows assigned to each cluster label.
cluster_sizes = df2["C"].value_counts()
cluster_sizes

Mostramos ahora la relación con las horas

In [None]:
import seaborn as sns

# Count plot with "Hora" on the vertical axis and one coloured bar per cluster "C".
sns.catplot(
    data=df2,
    y="Hora",
    hue="C",
    kind="count",
    palette="pastel",
)

In [None]:
# Histogram of "Hora" split by cluster "C"; multiple="fill" with
# stat="proportion" stacks the clusters so each bar is normalised to 1.
hist_options = dict(
    x="Hora",
    hue="C",
    multiple="fill",
    stat="proportion",
    discrete=True,
    palette="pastel",
)
sns.histplot(data=df2, **hist_options)


Centros

In [None]:
# Centroid coordinates of the fitted K-means model: one row per cluster,
# one column per feature the model was fitted on.
model.cluster_centers_

**Ejercicio** Repetir con k=2

## 2 Clustering Aglomerativo - Dendrogramas

Datos de prueba

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Ten hand-written 2-D points used as toy data in this section.
x = [4, 5, 10, 4, 3, 11, 14 , 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

plt.scatter(x, y)
# Write each point's index next to it, nudged slightly right and down so the
# text does not sit on top of the marker.
for idx, (px, py) in enumerate(zip(x, y)):
    plt.text(px + 0.3, py - 0.1, str(idx))
plt.show()

Clustering bottom-up con dendrogramas

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

x = [4, 5, 10, 4, 3, 11, 14 , 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

# Pair the coordinates into (x, y) observations, one tuple per point.
data = [(px, py) for px, py in zip(x, y)]
print(data)

# Build the merge hierarchy with Ward linkage on Euclidean distances and
# draw it as a dendrogram.
linkage_options = {"method": 'ward', "metric": 'euclidean'}
linkage_data = linkage(data, **linkage_options)
r = dendrogram(linkage_data)

plt.show()

Clustering aglomerativo

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap

# Two-colour map, one colour per cluster; later cells reuse it as well.
cmap2 = ListedColormap(['steelblue', 'darkorange'])

x = [4, 5, 10, 4, 3, 11, 14 , 6, 10, 12]
y = [21, 19, 24, 17, 16, 25, 24, 22, 21, 21]

# One (x, y) tuple per point.
data = [(px, py) for px, py in zip(x, y)]

# Cut the Ward / Euclidean hierarchy into two groups.
hierarchical_cluster = AgglomerativeClustering(
    n_clusters=2,
    metric='euclidean',
    linkage='ward',
)
labels = hierarchical_cluster.fit_predict(data)
print(labels)

# Colour every point according to its assigned cluster.
plt.scatter(x, y, c=labels, cmap=cmap2)
plt.show()

Un ejemplo diferente

In [None]:
import numpy as np

# Point cloud shaped as two nested rectangles: an outer one spanning
# x in [4, 12], y in [16, 24] and an inner one spanning x in [6, 10], y in [18, 22].
x, y = [], []

# Horizontal edges: bottom/top of the outer rectangle, then of the inner one.
for v in np.linspace(4, 12, 100):
    x.extend([v, v, 4 + v/2, 4 + v/2])
    y.extend([16, 24, 10 + 16/2, 10 + 24/2])

# Vertical edges: left/right of the outer rectangle, then of the inner one.
for v in np.linspace(16, 24, 100):
    x.extend([4, 12, 6, 10])
    y.extend([v, v, 10 + v/2, 10 + v/2])

# Draw the generated points (uncoloured) to show the shape before clustering.
plt.scatter(x, y)
plt.show()

In [None]:



# K-means with two groups on the (x, y) points built in the previous cell.
# KMeans and cmap2 are expected to be in scope from earlier cells.
data = list(zip(x, y))
k=2
# random_state fixes the centroid initialisation so the resulting split (and
# which group gets which colour) is the same on every run.
model = KMeans(init='k-means++', n_clusters=k, n_init='auto', random_state=42)
# Only labels_ is used below, so fit() replaces fit_transform().
model.fit(data)

# Colour each point by the cluster K-means assigned to it.
plt.scatter(x, y, c=model.labels_, cmap=cmap2)
plt.show()

Con clustering jerárquico

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Same points as the K-means cell above, now split into two groups with
# single-linkage agglomerative clustering on Manhattan distances.
hierarchical_cluster = AgglomerativeClustering(
    n_clusters=2,
    metric='manhattan',
    linkage='single',
)
labels = hierarchical_cluster.fit_predict(data)

# Colour each point by its hierarchical-cluster label.
plt.scatter(x, y, c=labels, cmap=cmap2)
plt.show()