# Flat Clustering (K-Means)
---

## Env Preparation

In [2]:
import os
import sys
os.chdir('/app/')
print(os.getcwd())
from os.path import join as JP
sys.path.append(os.getcwd())
sys.path.append(JP(os.getcwd(),'utils'))
sys.path.append(JP(os.getcwd(),'scripts'))

/app


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

%matplotlib notebook
np.set_printoptions(precision=3)
pd.options.display.float_format = '{:,.3f}'.format

In [4]:
import pickle
from pprint import pprint
from collections import defaultdict

from utils.nlp_utils import preproces
from utils.general import parse_yaml, ensure_directories

from scripts.catalog import (
    Catalog, Document, Corpus,
    load_catalog, load_corpus)

config = parse_yaml('config.yaml')
paths = config['paths']
ensure_directories(paths)

## Data

In [5]:
data = pd.read_csv(JP(paths['data'],'bbc-text-processed.csv')).iloc[:,1:]
data.head()

Unnamed: 0,category,text,lenght,processed
0,tech,tv future in the hands of viewers with home th...,806,tv future hand viewer system plasma high defin...
1,business,worldcom boss left books alone former worldc...,332,left book ebber accuse oversee 11bn fraud acco...
2,sport,tigers wary of farrell gamble leicester say ...,270,wary farrell gamble rush make bid decide switc...
3,sport,yeading face newcastle in fa cup premiership s...,390,yeade face premiership face trip leader yeade ...
4,entertainment,ocean s twelve raids box office ocean s twelve...,287,raid ocean crime go number office chart take w...


## TFIDF

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
EMBED_SIZE = 10000        # TODO: Increase
NUM_CLUSTERS = data['category'].nunique()
WORDS_PER_CLUSTER = None
print(NUM_CLUSTERS)

5


In [8]:
vectorizer = TfidfVectorizer(
    min_df=.05,
    max_df=.9,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    max_features=EMBED_SIZE,
    ngram_range=(1,3),
    lowercase=True,
    stop_words=stopwords.words('english'))

## Moving to a Catalog Format

In [9]:
documents = [Document() for i in range(data.shape[0])]
for d in range(len(documents)):
    documents[d].processed_text = data['processed'][d]

In [10]:
catalog = Catalog()
catalog.documents = documents

In [11]:
_ = catalog.collect_corpus(attr='processed_text', form=list)
tfidf = catalog.to_matrix(
    vectorizer=vectorizer,
    modelname='TFIDF',
    max_docs=None)
print(tfidf.representation.shape)
tfidf.representation.head()

(2225, 461)


Unnamed: 0,able,accept,access,accord,account,accuse,act,action,add,admit,...,week,weekend,win,winner,woman,work,world,write,year,young
0,0.0,0.0,0.0,0.037,0.0,0.0,0.0,0.0,0.027,0.0,...,0.0,0.0,0.0,0.0,0.0,0.029,0.0,0.0,0.038,0.0
1,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.239,0.066,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.115,0.0,0.0,0.0,0.0,0.0,0.0,...,0.093,0.315,0.087,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Clustering

In [12]:
from sklearn.cluster import KMeans

def kmean_clustering(
    data:pd.DataFrame,
    num_clusters:int=4,
    njobs=-1,
    random_state=46):
    '''
    Perform K-Means Algorithm
    Args:
        - model: Trained instance of class Model
        - num_clusters: Number of Clusters to look for

    Returns: 
        - Clustering model instance
    '''
    km = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=20,
        max_iter=1000,
        n_jobs=njobs,
        random_state=random_state)
    return km.fit(data)

In [13]:
def words_per_cluster(
    model,
    clusters:KMeans,
    words_per_cluster:int=None):
    '''
    Return the j most important words per cluster
        The k closest to the centroid of that cluster
        Equivalently: the words are the ones most present in the 'fake'
        document represented by the centroid of the cluster

    Args:
        - Clustering model instance
    
    Returns:
        - Dict key='cluster id', value=k_words_closest_to_centroid
    '''
    
    cluster_words = defaultdict(list)
    centroids = clusters.cluster_centers_
    
    # For each centroid
    for i,centroid in enumerate(centroids):
        # Bring K most similar words to centroid i
        closests_words_to_centroid = centroid.argsort()[::-1] 
        for idx in closests_words_to_centroid:
            cluster_words[i].append(model.id2token[idx])
    return cluster_words

In [26]:
def get_centroids(
    model, #:Model
    clusters:KMeans):
    ''' Return the TFIDF representation of the centroids of a KMeans object '''
    centroids = pd.DataFrame(clusters.cluster_centers_, columns=model.representation.columns)
    centroids.index = range(1,clusters.cluster_centers_.shape[0]+1)
    return centroids

In [15]:
clusters = kmean_clustering(data=tfidf.representation, njobs=1, num_clusters=NUM_CLUSTERS)
BEST_WORDS_PER_CLUSTER = words_per_cluster(tfidf,clusters)
centroids = get_centroids(tfidf, clusters)

In [16]:
centroids.head()

Unnamed: 0,able,accept,access,accord,account,accuse,act,action,add,admit,...,week,weekend,win,winner,woman,work,world,write,year,young
1,0.01,0.002,0.0,0.003,0.002,0.003,0.003,0.014,0.031,0.019,...,0.036,0.017,0.109,0.016,0.014,0.019,0.041,0.003,0.042,0.01
2,0.006,0.005,0.001,0.011,0.003,0.004,0.021,0.013,0.015,0.003,...,0.018,0.02,0.087,0.047,0.021,0.029,0.02,0.025,0.068,0.016
3,0.005,0.006,0.005,0.019,0.018,0.008,0.005,0.011,0.021,0.004,...,0.018,0.003,0.005,0.002,0.001,0.013,0.021,0.003,0.076,0.001
4,0.014,0.008,0.02,0.016,0.008,0.007,0.015,0.014,0.022,0.006,...,0.019,0.003,0.012,0.002,0.018,0.035,0.023,0.015,0.044,0.009
5,0.012,0.009,0.004,0.006,0.004,0.02,0.011,0.013,0.027,0.004,...,0.025,0.006,0.022,0.001,0.013,0.027,0.011,0.01,0.031,0.008


### Format to wordcloud

In [17]:
scores = pd.melt(centroids.rename_axis('cluster').reset_index(), 
                 id_vars=['cluster'], var_name='word', value_name='score')

In [19]:
from sklearn.decomposition import PCA

In [20]:
MIN_K = 2
MAX_K = 4
words_results = defaultdict(lambda: defaultdict())
cluster_results = defaultdict(lambda: defaultdict())

In [27]:
pca = PCA(n_components=50)
data = tfidf.representation
data_low_dim = pca.fit_transform(tfidf.representation.values)

In [32]:
for k in range(MIN_K,MAX_K):
    
    # Compute
    clusters = kmean_clustering(data_low_dim, njobs=-1, num_clusters=k)
    words = words_per_cluster(tfidf,clusters)
    
    data['cluster'] = clusters.labels_
    centroids = data.groupby('cluster').mean().reset_index()
    
    scores = pd.melt(
        centroids, id_vars=['cluster'], var_name='word', value_name='score')
    
    # Store
    words_results[k] = scores
    cluster_results[k] = clusters

In [63]:
list(zip(words_results[3].word, words_results[3].score))

[('able', 0.010451146544103777),
 ('able', 0.005615265393387091),
 ('able', 0.00997712893981503),
 ('accept', 0.007795581546453131),
 ('accept', 0.004716108412330521),
 ('accept', 0.0028093179136741454),
 ('access', 0.013126298192487399),
 ('access', 0.0005034260420869515),
 ('access', 0.0009560588471033211),
 ('accord', 0.015707969548190957),
 ('accord', 0.010807286288521192),
 ('accord', 0.005542174826635738),
 ('account', 0.011276522529219596),
 ('account', 0.002803002086843473),
 ('account', 0.002277995073819684),
 ('accuse', 0.010098846843655036),
 ('accuse', 0.003989093538385637),
 ('accuse', 0.0031990150360230694),
 ('act', 0.009240728905473344),
 ('act', 0.021725449412713344),
 ('act', 0.007836429099662005),
 ('action', 0.013696514613000825),
 ('action', 0.011499932409930561),
 ('action', 0.012934436921361366),
 ('add', 0.021849199098450352),
 ('add', 0.01581804915871252),
 ('add', 0.02940342175411655),
 ('admit', 0.0050213318415649735),
 ('admit', 0.0033734125233273343),
 ('ad

In [47]:
2//2

1

---

# Plotting the WordClouds


In [37]:
%matplotlib inline
from wordcloud import WordCloud

#### Helper Functions for Plotting

In [56]:
def define_subplots(n_cols,n_plots,figsize=None):
    '''Return the axes given a total 
    of plots and desired number of columns'''
    j = 1 if n_plots%n_cols != 0 else 0
    n_rows = (n_plots // n_cols) + j 
    
    print(n_rows, n_cols)
    
    if not figsize: 
        figsize=(n_cols*5,n_rows*5)
    
    fig, axs = plt.subplots(
        nrows=n_rows, ncols=n_cols, sharex=False, sharey=False,
        figsize=figsize)
    
    return fig,axs # .reshape(n_plots,-1)

In [57]:
def cluster_to_wordcloud(
    df, max_words=200, use_mask=False, bgcolor='black'):
    ''' Convert 1 cluster into a WordCloud given:
        - The TFIDF for the cluster
        - The Score Method that give imporance to the word '''
    # Create the wordcloud attending to the inverse of idf
    wordcloud = WordCloud(
        max_words=max_words, 
        mask=mask_ if use_mask else None,
        background_color=bgcolor).generate_from_frequencies(
            frequencies=dict(zip(df.word, df.score)))
    return wordcloud


In [85]:
def plot_centroids_as_wordclouds(
    word_scores,
    NUM_CLUSTERS = None,
    max_words_per_cloud=100, 
    use_mask=False, n_cols=2, figsize=(15,15)):

    if not NUM_CLUSTERS:
        NUM_CLUSTERS = word_scores.cluster.nunique()

    n_plots = NUM_CLUSTERS
    _, axs = define_subplots(n_cols,n_plots, figsize)
    
    for c in range(NUM_CLUSTERS):
        wordcloud = cluster_to_wordcloud(
            df=word_scores[word_scores.cluster == c],
            max_words=max_words_per_cloud,
            use_mask=use_mask)
        
        # Plot the resulting wordcloud
        if len(axs.shape) == 1:
            axs[c].imshow(wordcloud)
            axs[c].axis('off')            
        else:
            axs[c // n_cols, c % n_cols].imshow(wordcloud)
            axs[c // n_cols, c % n_cols].axis('off')
    plt.tight_layout()
    plt.show()
    return


In [None]:
plot_centroids_as_wordclouds(words_results[3], n_cols=2)

2 2
(2, 2)
