# Ejercicio - KMedias

**Vamos a usar el Movies dataset**

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from category_encoders import OneHotEncoder

# Load the movies and discard the rows that have no genre label.
pelis = pd.read_csv("data/movies.csv").dropna(subset=["genero"])

# Keep the genre of each movie aside as a separate array (used later as ground truth).
genero_peliculas = pelis["genero"].values
n_generos = pelis["genero"].nunique()

# The genre is not a feature: remove it, then one-hot encode what remains.
pelis = OneHotEncoder().fit_transform(pelis.drop(columns=["genero"]))

pelis.head()

Unnamed: 0,lenguaje_1,lenguaje_2,lenguaje_3,lenguaje_4,lenguaje_5,lenguaje_6,lenguaje_7,lenguaje_8,lenguaje_9,lenguaje_10,...,titulo_1276,titulo_1277,titulo_1278,titulo_-1,presupuesto,popularidad,ventas,duracion,puntuacion,n_votos
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,8.387519,76578911.0,106.0,5.7,173.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,0.894647,676525.0,106.0,6.7,13.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3500000.0,14.56965,28215918.0,91.0,7.0,513.0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,8.963037,32.0,87.0,6.0,124.0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,12000000.0,9.592265,41205099.0,92.0,6.5,767.0


In [5]:
# Genre labels that were set aside before the column was dropped from `pelis`.
genero_peliculas

array(['Comedy', 'Drama', 'Comedy', ..., 'Documentary', 'Horror',
       'Horror'], dtype=object)

In [4]:
# Number of distinct genres present in the data.
n_generos

17

Vemos que hay 17 géneros de películas

Imputamos los valores faltantes (NaN) usando la media como estrategia.

In [31]:
# Fill the missing values using the "mean" strategy and keep the result as X.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# on newer versions this needs sklearn.impute.SimpleImputer — confirm the target version.
X = Imputer(strategy="mean").fit_transform(pelis)

### Hacer un estimador KMedias con número de clusters igual al número de géneros. Entrenar con el dataset pelis y calcular el cluster asignado a cada película

In [24]:
from sklearn.cluster import KMeans

In [18]:
# One cluster per genre: take the count from the data (`n_generos`) instead of a
# hard-coded 17, so the estimator stays correct if the dataset changes.
# random_state is fixed so repeated runs give the same clustering.
estimador_kmedias = KMeans(random_state=42, n_clusters=n_generos)

In [40]:
# Fit the clustering on the imputed feature matrix.
# NOTE(review): X is not rescaled before fitting; the centroid distances printed later
# are on the order of 1e8, so large-magnitude columns presumably dominate — consider
# standardizing the features first.
estimador_kmedias.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=17, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

In [43]:
# Cluster label assigned to each row of X by the fitted estimator.
clusters = estimador_kmedias.labels_

In [45]:
# Predict the cluster of every row of X (the same data the estimator was fitted on).
estimador_kmedias.predict(X)

array([ 2,  9, 12, ..., 12, 12, 12], dtype=int32)

In [47]:
# Coordinates of the fitted cluster centers, one row per cluster.
centroides = estimador_kmedias.cluster_centers_

In [48]:
# Sanity check: expected to be (number of clusters, number of feature columns).
centroides.shape

(17, 1955)

In [50]:
from sklearn.metrics import euclidean_distances

In [51]:
# Pairwise Euclidean distance between every pair of centroids
# (square matrix with zeros on the diagonal).
distancias_centroides = euclidean_distances(centroides)
distancias_centroides

array([[0.00000000e+00, 2.71771071e+08, 2.25865889e+08, 1.50323510e+08,
        8.29526900e+07, 6.18704423e+07, 2.87001051e+08, 1.95993234e+08,
        2.93782303e+08, 2.88032814e+08, 1.71335603e+08, 2.14663953e+08,
        2.53191611e+08, 8.28308518e+07, 2.57114368e+08, 3.22347374e+08,
        1.16750897e+08],
       [2.71771071e+08, 0.00000000e+00, 4.59267825e+07, 1.21635524e+08,
        3.51801794e+08, 2.09905997e+08, 1.86986459e+07, 7.58663936e+07,
        5.50388996e+08, 1.66867862e+07, 1.03873608e+08, 8.67906732e+07,
        1.85901799e+07, 2.30296778e+08, 3.43741746e+07, 5.93830010e+08,
        1.59129785e+08],
       [2.25865889e+08, 4.59267825e+07, 0.00000000e+00, 7.58856244e+07,
        3.05923000e+08, 1.64008142e+08, 6.21774213e+07, 2.99495492e+07,
        5.05598832e+08, 6.21978687e+07, 5.92000751e+07, 5.94063879e+07,
        2.73941367e+07, 1.86282685e+08, 4.12828347e+07, 5.47998546e+08,
        1.13664918e+08],
       [1.50323510e+08, 1.21635524e+08, 7.58856244e+07, 0.000

In [53]:
# For each centroid: (index of the centroid farthest from it, distance to that centroid).
list(zip(distancias_centroides.argmax(axis=1), distancias_centroides.max(axis=1)))

[(15, 322347373.7138512),
 (15, 593830009.5284467),
 (15, 547998546.0179787),
 (15, 472198671.14147854),
 (9, 367759438.7087655),
 (15, 384075240.5217519),
 (15, 608568119.5677117),
 (15, 518207865.31450385),
 (6, 567678392.8619801),
 (15, 610196281.2748877),
 (15, 493479269.3852214),
 (15, 534203521.5364116),
 (15, 575239842.4776151),
 (15, 382325484.59848356),
 (15, 579244309.6822143),
 (9, 610196281.2748877),
 (15, 438264788.0133574)]

In [59]:
def resumen_cluster(cluster_id):
    """Build a one-row summary of a cluster as a dictionary.

    Selects the rows of the module-level `pelis` frame whose entry in the
    module-level `clusters` array equals `cluster_id`. Every column starts
    with its most frequent value (first row of `mode()`); columns for which
    `mean()` yields a value are then overwritten with that mean.

    Args:
        cluster_id: label of the cluster to summarise.

    Returns:
        dict mapping column name -> summary value, plus a "cluster_id" key.
    """
    miembros = pelis[clusters == cluster_id]

    # First row of mode() holds the most frequent value of each column.
    modas = miembros.mode().to_dict(orient="records")
    resumen = dict(modas[0])

    # The mean takes precedence over the mode wherever it is available.
    for columna, media in miembros.mean().to_dict().items():
        resumen[columna] = media

    resumen["cluster_id"] = cluster_id
    return resumen

def comparar_clusters(*cluster_ids):
    """Put the summaries of several clusters side by side.

    Args:
        *cluster_ids: labels of the clusters to compare.

    Returns:
        DataFrame with one column per requested cluster id and one row per
        summary field produced by `resumen_cluster`.
    """
    tabla = pd.DataFrame([resumen_cluster(un_id) for un_id in cluster_ids])
    # Transpose so each cluster becomes a column, which is easier to compare by eye.
    return tabla.set_index("cluster_id").T

### Calcular el coeficiente de silueta

In [66]:
from sklearn.metrics import silhouette_score

In [67]:
# Silhouette coefficient of the KMeans labelling, computed on the imputed features.
silhouette_score(X,clusters)

0.5381929960809795

### Calcular el Índice de Rand ajustado (`adjusted_rand_score`) usando los géneros como clusters naturales

In [68]:
from sklearn.metrics import adjusted_rand_score

In [71]:
# Compare the KMeans clusters against the true genres; the genres act as the reference labelling.
adjusted_rand_score(genero_peliculas,clusters)

0.005636397211379519