In [None]:
import pandas as pd
import os
import re
import numpy as np
from numpy import log 
from math import sqrt 

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.cm as cm

#para visualizacion
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Set the working directory so the relative paths used below resolve.
# os.chdir() returns None, so its result is deliberately not stored: the
# previous `dir = os.chdir(...)` only rebound the builtin `dir` to None.
# NOTE(review): this absolute, machine-specific path makes the notebook
# non-portable — consider a configurable data directory instead.
os.chdir('C:/Users/rfern/Desktop/Modulo 9/Tarea')

In [None]:
# Load the pickled pages DataFrame from the working directory.
# NOTE(review): the original comment called this the tf-idf matrix, but the
# tf-idf matrix is built later from its `contenido_limpio` column — presumably
# this is the pre-processed pages table; confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
df_paginas=pd.read_pickle('df_paginas.pkl')

In [None]:
# Print a summary of the loaded frame as a quick sanity check
df_paginas.info()

In [None]:
# Pass-through tokenizer handed to the TfidfVectorizer built below
def identity_tokenizer(text):
    """Return the input unchanged.

    Supplied as the vectorizer's ``tokenizer`` so that no additional splitting
    is applied. Presumably each document in ``contenido_limpio`` is already a
    list of tokens — TODO confirm.
    """
    return text

# Build the tf-idf vectorizer: tokenization is delegated to identity_tokenizer,
# lowercasing is disabled, and both unigrams and bigrams are extracted.
tfidf_vect = TfidfVectorizer(tokenizer=identity_tokenizer,
                             token_pattern=None,  # unused when a tokenizer is supplied; None avoids the warning
                             lowercase=False,
                             use_idf=True,
                             ngram_range=(1,2))

# Fit the vectorizer and build the tf-idf matrix in a single pass
tfidf_data = tfidf_vect.fit_transform(df_paginas.contenido_limpio)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vect.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vect.get_feature_names()

# Dense DataFrame view of the matrix, one column per term.
# NOTE(review): densifying the whole matrix can be memory-hungry for a large
# vocabulary.
tfidf_df=pd.DataFrame(tfidf_data.toarray(), columns=tfidf_feature_names)

In [None]:
# Helper to pick the number of clusters from an SSE ("elbow") plot
def find_optimal_clusters(data, max_k):
    """Fit MiniBatchKMeans for k = 2, 4, ... up to ``max_k`` and plot the SSE curve.

    Parameters
    ----------
    data
        Feature matrix passed to ``MiniBatchKMeans.fit``.
    max_k : int
        Inclusive upper bound of the cluster counts tried (in steps of 2).

    Prints one progress line per fitted model and draws a single line plot of
    inertia against the number of clusters. Returns nothing.
    """
    k_values = range(2, max_k+1, 2)
    inertias = []

    for n_clusters in k_values:
        model = MiniBatchKMeans(n_clusters=n_clusters, init_size=1024,
                                batch_size=2048, random_state=20)
        inertias.append(model.fit(data).inertia_)
        print('Fit {} clusters'.format(n_clusters))

    _, ax = plt.subplots(1, 1)
    ax.plot(k_values, inertias, marker='o')
    ax.set_xlabel('Centros de los clusters')
    ax.set_ylabel('SSE')
    ax.set_title('SSE según centro de Cluster')
    # One tick (and tick label) per evaluated cluster count
    ax.set_xticks(k_values)
    ax.set_xticklabels(k_values)
    

# Evaluate the even cluster counts from 2 up to 20 (max_k=20), using "tfidf_data" as input.
# This does not fix the number of clusters; it only draws the SSE curve.
find_optimal_clusters(tfidf_data, 20)

In [None]:
# Fit the mini-batch k-means model and assign every document to a cluster.
# NOTE(review): n_clusters=12 is presumably read off the SSE plot above — confirm.
clusters = MiniBatchKMeans(
    n_clusters=12,
    init_size=1024,
    batch_size=2048,
    random_state=20,
).fit_predict(tfidf_data)

In [None]:
# Plot the clustering results projected with PCA and t-SNE
def plot_tsne_pca(data, labels, random_state=None):
    """Scatter-plot a random sample of rows in 2-D PCA and t-SNE space, coloured by cluster.

    Parameters
    ----------
    data
        Sparse matrix with one row per document; it must support row selection
        via ``data[rows, :]`` and ``.toarray()`` (e.g. the tf-idf matrix).
    labels : array-like of int
        Cluster label of each row of ``data``, in the same row order.
    random_state : int or None, default None
        Seed used for the row sampling, both PCA fits and t-SNE. ``None``
        keeps the previous non-reproducible behaviour.

    Returns
    -------
    None
        Draws a figure with two panels (PCA on the left, t-SNE on the right).

    Notes
    -----
    Up to 3000 rows are sampled for the projections and up to 300 of those are
    drawn. Compared with the earlier version of this function:

    * rows are sampled without replacement and the sample sizes are capped at
      the number of rows available, so small inputs no longer raise and no
      document is projected twice;
    * the sample is densified once with ``.toarray()`` (an ndarray) instead of
      twice with ``.todense()`` (an ``np.matrix``);
    * the intermediate PCA keeps at most ``min(50, n_samples, n_features)``
      components;
    * colours are spread over ``max(labels) + 1`` steps, so the highest
      cluster no longer lands on the cyclic end of ``hsv`` (almost the same
      red as cluster 0) and a single cluster no longer divides by zero.
    """
    labels = np.asarray(labels)
    rng = np.random.RandomState(random_state)
    n_rows = data.shape[0]

    # Sample rows without replacement, capped at what is actually available
    max_items = rng.choice(n_rows, size=min(3000, n_rows), replace=False)

    # Densify the sampled rows once and reuse them for both projections
    sample = data[max_items, :].toarray()

    pca = PCA(n_components=2, random_state=random_state).fit_transform(sample)

    # PCA cannot keep more components than min(n_samples, n_features)
    n_reduced = min(50, sample.shape[0], sample.shape[1])
    reduced = PCA(n_components=n_reduced, random_state=random_state).fit_transform(sample)
    tsne = TSNE(random_state=random_state).fit_transform(reduced)

    # Draw only a subset of the projected points to keep the scatter readable
    idx = rng.choice(pca.shape[0], size=min(300, pca.shape[0]), replace=False)

    # Divide by the number of clusters, not the largest label (see Notes)
    n_colors = labels.max() + 1
    label_subset = [cm.hsv(i / n_colors) for i in labels[max_items][idx]]

    _, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('Grafico PCA')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('Grafico TSNE CLustering')
    
# Visualise the cluster assignments on the tf-idf matrix
plot_tsne_pca(tfidf_data, clusters)

In [None]:
### HELPER THAT REPORTS THE HIGHEST-WEIGHTED TERMS OF EACH CLUSTER

# Print the n_terms terms with the highest mean weight in every cluster
def get_top_keywords(data, clusters, labels, n_terms):
    """Print, for each cluster, the ``n_terms`` columns with the largest mean value.

    Parameters
    ----------
    data
        Matrix exposing ``.todense()``; rows are documents, columns are terms.
    clusters
        Cluster label of each row of ``data``; used as the group-by key.
    labels
        Sequence mapping a column position to its term.
    n_terms : int
        Number of terms printed per cluster.

    Output is written with ``print``; nothing is returned. Clusters are shown
    1-based (label + 1) and terms are listed in ascending order of mean
    weight, so the strongest term comes last.
    """
    dense_scores = pd.DataFrame(data.todense())
    cluster_means = dense_scores.groupby(clusters).mean()

    for cluster_id, mean_scores in cluster_means.iterrows():
        print('\nCluster {}'.format(cluster_id+1))
        # argsort is ascending, so the tail holds the largest mean weights
        top_positions = np.argsort(mean_scores)[-n_terms:]
        keywords = [labels[pos] for pos in top_positions]
        print(','.join(keywords))
            
# Print the 10 terms with the highest mean tf-idf weight in every cluster.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    keyword_labels = tfidf_vect.get_feature_names_out()
else:
    keyword_labels = tfidf_vect.get_feature_names()
get_top_keywords(tfidf_data, clusters, keyword_labels, 10)