# Features de TextMining:

---------------------------------

In [1]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pyvis.network import Network
import nltk

import random

!pip install markupsafe==2.0.1

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

### Definindo variáveis e caminhos

In [3]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Funções para execução em batch

In [4]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [5]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors

In [6]:
input_folder_path = r"""C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [7]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 24.5 s


---------------------------------

### Trabalhando no tratamento do texto

In [8]:
import nltk
#import spacy
#import corenlp
#import textblob
#import gensim
#import transformers

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

--------------------------------

Tratando texto:

In [9]:
def text_tokenize(text, language='english', preserve_line=False):
    return nltk.tokenize.word_tokenize(text, language=language, preserve_line=preserve_line)

def clean_text_regex(words_list, regex="[^a-zA-Z]+", replace='', min_word_len=1):
    """Testado em https://regex101.com/"""
    new_words = []
    for word in words_list:
        word = re.sub(regex, replace, word)
        if len(word) > min_word_len:
            new_words.append(word)
    return new_words

def remove_stopwords(words_list, stopwords_list):
    """"""
    new_words = []
    for word in words_list:
        if word not in stopwords_list:
            new_words.append(word)
    return new_words

def lemmatizer(words_list):
    """"""
    obj_lemmatizer = nltk.stem.WordNetLemmatizer()
    words_lemma = []
    for word in words_list:
        words_lemma.append(obj_lemmatizer.lemmatize(word,pos=nltk.corpus.wordnet.VERB))
    return words_lemma

def stem_text(words_list):
    """"""
    p_stem = nltk.stem.PorterStemmer()
    words_stem = []
    for word in words_list:
        words_stem.append(p_stem.stem(word))
    return words_stem

In [14]:
def text_prep(text, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, text_lower=False, stopwords_list=[], language='english',
              preserve_line=False, regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1):
    
    """Text preparation."""
    
    text_preparation = text_tokenize(text, language=language, preserve_line=preserve_line)
    if clean_text:
        text_preparation = clean_text_regex(words_list=text_preparation,
                                            regex=regex_chars_clean,
                                            replace=replace_chars_clean,
                                            min_word_len=min_word_len)
    if stopwords_remove:
        text_preparation = remove_stopwords(words_list=text_preparation,
                                            stopwords_list=stopwords_list)
    if exec_lemmatizer:
        text_preparation = lemmatizer(words_list=text_preparation)
    if exec_stem:
        text_preparation = stem_text(words_list=text_preparation)
    text_preparation = ' '.join(text_preparation)
    if text_lower:
        text_preparation = text_preparation.lower()
    return text_preparation

In [12]:
def text_prep_column(colum_df):
    """"""
    f_prep_text = lambda text_data: text_prep(text=text_data, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, 
                                            stopwords_list=nltk.corpus.stopwords.words('english'), language='english', preserve_line=False,
                                            regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1, text_lower=True)
    colum_df = colum_df.apply(lambda e: e if pd.isna(e) else f_prep_text(e))
    return colum_df

In [15]:
dict_dfs['df_doc_info']['acknowledgement_prep'] = text_prep_column(dict_dfs['df_doc_info']['acknowledgement'])
dict_dfs['df_doc_info']['abstract_prep'] = text_prep_column(dict_dfs['df_doc_info']['abstract'])
dict_dfs['df_doc_info']['body_prep'] = text_prep_column(dict_dfs['df_doc_info']['body'])

--------------------------------

Criando BOW e TFIDF:

In [16]:
from text import text_prep, text_mining, text_viz

In [17]:
tmining = text_mining()

In [18]:
documents_abs = dict_dfs['df_doc_info']['abstract_prep'].fillna(' ').tolist()
documents_body = dict_dfs['df_doc_info']['body_prep'].fillna(' ').tolist()

In [19]:
df_tfidf_abstract_abs = tmining.get_df_tfidf(documents_abs)
df_tfidf_abstract_body = tmining.get_df_tfidf(documents_body)

In [20]:
df_bow_abstract_abs = tmining.get_df_bow(documents_abs)
df_bow_abstract_body = tmining.get_df_bow(documents_body)

In [21]:
df_tfidf_abstract_abs.head()

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df_tfidf_abstract_body.head()

Unnamed: 0,aa,aa aa,aa ab,aa alloy,aa aluminium,aa aluminum,aa bb,aa sample,aa solution,aa vector,...,zunger,zunger pseudopotential,zuo,zuo et,zwick,zwickroell,zx,zy,zz,zz xx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047131,0.03554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_bow_abstract_abs.head()

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df_bow_abstract_body.head()

Unnamed: 0,aa,aa aa,aa ab,aa alloy,aa aluminium,aa aluminum,aa bb,aa sample,aa solution,aa vector,...,zunger,zunger pseudopotential,zuo,zuo et,zwick,zwickroell,zx,zy,zz,zz xx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,3,2,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df_bow_abstract_body.shape

(581, 176907)

--------------------------------

Gerando agrupamento simples

In [36]:
import plotly.express as px # for data visualization

def chart(X, y, title_text='UMAP'):
    #--------------------------------------------------------------------------#
    # This section is not mandatory as its purpose is to sort the data by label 
    # so, we can maintain consistent colors for digits across multiple graphs
    
    # Concatenate X and y arrays
    arr_concat=np.concatenate((X, y.reshape(y.shape[0],1)), axis=1)
    # Create a Pandas dataframe using the above array
    df=pd.DataFrame(arr_concat, columns=['x', 'y', 'z', 'label'])
    # Convert label data type from float to integer
    df['label'] = df['label'].astype(int)
    # Finally, sort the dataframe by label
    df.sort_values(by='label', axis=0, ascending=True, inplace=True)
    #--------------------------------------------------------------------------#
    
    # Create a 3D graph
    fig = px.scatter_3d(df, x='x', y='y', z='z', color=df['label'].astype(str), height=500, width=750)

    # Update chart looks
    fig.update_layout(title_text=title_text,
                      showlegend=True,
                      legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5))
                    #   scene_camera=dict(up=dict(x=0, y=0, z=1), 
                    #                         center=dict(x=0, y=0, z=-0.1),
                    #                         eye=dict(x=1.5, y=-1.4, z=0.5)),
                    #                         margin=dict(l=0, r=0, b=0, t=0),
                    #   scene = dict(xaxis=dict(backgroundcolor='white',
                    #                           color='black',
                    #                           gridcolor='#f0f0f0',
                    #                           title_font=dict(size=10),
                    #                           tickfont=dict(size=10),
                    #                          ),
                    #                yaxis=dict(backgroundcolor='white',
                    #                           color='black',
                    #                           gridcolor='#f0f0f0',
                    #                           title_font=dict(size=10),
                    #                           tickfont=dict(size=10),
                    #                           ),
                    #                zaxis=dict(backgroundcolor='lightgrey',
                    #                           color='black', 
                    #                           gridcolor='#f0f0f0',
                    #                           title_font=dict(size=10),
                    #                           tickfont=dict(size=10),
                    #                          )))
    # Update marker size
    fig.update_traces(marker=dict(size=3, line=dict(color='black', width=0.1)))
    
    fig.show()

!pip install umap-learn

In [27]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [28]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

lim_sup = range(2,int(df_tfidf_abstract_abs.shape[0]**0.5))
list_result = []

for c in lim_sup:
    
    objGroup = KMeans(n_clusters=c,
                      init='k-means++',
                      n_init=10,
                      max_iter=30,
                      tol=1e-4, 
                      random_state=0)
    
    objGroup = objGroup.fit(df_tfidf_abstract_abs.values)
    
    inertia = objGroup.inertia_
    s = silhouette_score(df_tfidf_abstract_abs.values, objGroup.labels_, metric='euclidean', random_state=0)
    ch = calinski_harabasz_score(df_tfidf_abstract_abs.values, objGroup.labels_)
    db = davies_bouldin_score(df_tfidf_abstract_abs.values, objGroup.labels_)
    
    list_result.append({'cluster':c,
                        'inertia':inertia,
                        'silhouette':s,
                        'calinski_harabasz':ch,
                        'davies_bouldin':db})

df_metrics = pd.DataFrame(list_result)

ss = df_metrics.nlargest(1,'silhouette')['cluster'].iat[0]
ch = df_metrics.nlargest(1,'calinski_harabasz')['cluster'].iat[0]
db = df_metrics.nsmallest(1,'davies_bouldin')['cluster'].iat[0]

final_cluster_value = int(np.mean([ss, ch, db]))

objGroup = KMeans(n_clusters=final_cluster_value,
                  init='k-means++',
                  n_init=10,
                  max_iter=30,
                  tol=1e-4, 
                  random_state=0)

objGroup = objGroup.fit(df_tfidf_abstract_abs.values)

In [27]:
df_metrics = pd.DataFrame(list_result)
df_metrics

Unnamed: 0,cluster,inertia,silhouette,calinski_harabasz,davies_bouldin
0,2,515.726312,0.06837,29.963738,1.03856
1,3,509.551774,0.059306,18.639207,5.833064
2,4,506.375996,0.048008,13.68867,7.51629
3,5,502.810795,0.052183,11.342417,7.052966
4,6,500.580494,0.05262,9.610912,7.096748
5,7,498.450801,0.051005,8.438073,6.806612
6,8,496.271681,0.028542,7.61117,7.0906
7,9,495.505937,0.01207,6.768919,7.240909
8,10,494.647721,0.021312,6.126796,7.372849
9,11,492.984519,0.000826,5.715333,7.100429


In [30]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


objTSNE = TSNE(n_components=3, init='random')
X_TSNE = objTSNE.fit_transform(df_tfidf_abstract_abs.values)

objPCA = PCA(n_components=3, random_state =0)
X_PCA = objPCA.fit_transform(df_tfidf_abstract_abs.values)

chart(X_PCA, objGroup.labels_)

chart(X_TSNE, objGroup.labels_)

In [32]:
from umap import UMAP

# Configure UMAP hyperparameters
reducer = UMAP(n_neighbors=100, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=3, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. 
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=42, # default: None, If int, random_state is the seed used by the random number generator;
               metric_kwds=None, # default None) Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplcial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array is using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different. 
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded. 
              )

X_trans = reducer.fit_transform(df_tfidf_abstract_abs.values, objGroup.labels_)

In [37]:
chart(X_trans, objGroup.labels_)

Clutsring Function

In [38]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [39]:
dict_dfs['df_doc_head'].head()

Unnamed: 0_level_0,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,publisher_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,Artificial Neural Network modeling and sensiti...,,,,,,...,,,,,,,,,,
2,,,,,Modelling correlation between alloy compositio...,,,,,,...,,,,10.1179/174328405X36557,,,,,,
3,,,,,"Better, Faster, and Less Biased Machine Learni...",,,,,,...,,,,10.1002/adma.202002425,,,,,,
4,,,,2021-06-17,The utility of composition-based machine learn...,,,,,,...,,,,10.1016/j.commatsci.2021.110637,,,,,,
5,,,,2021-05-12,Machine learning and symbolic regression inves...,,,,,,...,,,,10.1016/j.commatsci.2021.110578,,,,,,


In [51]:
def make_clustering(X,
                    metric_func=np.mean,
                    lim_sup=None, 
                    init='k-means++', 
                    n_init=10, 
                    max_iter=30, 
                    tol=1e-4, 
                    random_state=0):
    
    from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS
    from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
    
    lim_sup = range(2,int(X.shape[0]**0.5)) if lim_sup == None else lim_sup
    list_result = []

    for c in lim_sup:
        
        objGroup = KMeans(n_clusters=c,
                          init=init,
                          n_init=n_init,
                          max_iter=max_iter,
                          tol=tol, 
                          random_state=random_state)
        
        objGroup = objGroup.fit(X)
        
        inertia = objGroup.inertia_
        s = silhouette_score(X, objGroup.labels_, metric='euclidean', random_state=random_state)
        ch = calinski_harabasz_score(X, objGroup.labels_)
        db = davies_bouldin_score(X, objGroup.labels_)
        
        list_result.append({'cluster':c,
                            'inertia':inertia,
                            'silhouette':s,
                            'calinski_harabasz':ch,
                            'davies_bouldin':db})

    df_metrics = pd.DataFrame(list_result)

    ss = df_metrics.nlargest(1,'silhouette')['cluster'].iat[0]
    ch = df_metrics.nlargest(1,'calinski_harabasz')['cluster'].iat[0]
    db = df_metrics.nsmallest(1,'davies_bouldin')['cluster'].iat[0]

    final_cluster_value = int(metric_func([ss, ch, db]))

    objGroup = KMeans(n_clusters=final_cluster_value,
                      init='k-means++',
                      n_init=10,
                      max_iter=30,
                      tol=1e-4, 
                      random_state=0)

    objGroup = objGroup.fit(X)
    
    return objGroup.labels_


def reduce_dimensionality(X, y=None, n_components=3):
    
    from umap import UMAP
        
    # from sklearn.manifold import TSNE
    # from sklearn.decomposition import PCA


    # objTSNE = TSNE(n_components=3, init='random')
    # X_TSNE = objTSNE.fit_transform(df_tfidf_abstract_abs.values)

    # objPCA = PCA(n_components=3, random_state =0)
    # X_PCA = objPCA.fit_transform(df_tfidf_abstract_abs.values)
    
    
    # Configure UMAP hyperparameters
    reducer = UMAP(n_neighbors=100, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
                n_components=n_components, # default 2, The dimension of the space to embed into.
                metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
                n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. 
                learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
                init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
                min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
                spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
                low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
                set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
                local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
                repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
                negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
                transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
                a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
                b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
                random_state=42, # default: None, If int, random_state is the seed used by the random number generator;
                metric_kwds=None, # default None) Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
                angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
                target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplcial set. If set to -1 use the ``n_neighbors`` value.
                #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array is using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different. 
                #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
                #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
                transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
                verbose=False, # default False, Controls verbosity of logging.
                unique=False # default False, Controls if the rows of your data should be uniqued before being embedded. 
                )
    
    X_reduce = reducer.fit_transform(X, y)
    
    return X_reduce

In [65]:
cluster_labels = make_clustering(df_tfidf_abstract_abs.values)
X, y = reduce_dimensionality(df_tfidf_abstract_abs.values, y=cluster_labels, n_components=3), cluster_labels
title_text = "Clustering"

dict_dfs['df_doc_info']['file_name'] = dict_dfs['df_doc_info']['file'].apply(lambda e: os.path.split(e)[-1])
    
# Concatenate X and y arrays
article_title = dict_dfs['df_doc_head']['title_head'].apply(lambda e: ''.join([str(e)[0:20],'...']) if len(str(e)) >= 10 else str(e)).values.reshape(dict_dfs['df_doc_info']['file'].shape[0],1)
file_name = dict_dfs['df_doc_info']['file_name'].values.reshape(dict_dfs['df_doc_info']['file_name'].shape[0],1)

arr_concat=np.concatenate((X,
                           y.reshape(y.shape[0],1),
                           file_name,
                           article_title), axis=1)

# Create a Pandas dataframe using the above array
df=pd.DataFrame(arr_concat, columns=['x', 'y', 'z', 'label', 'file_name', 'title_head'])
# Convert label data type from float to integer
df['label'] = df['label'].astype(int)
# Finally, sort the dataframe by label
df.sort_values(by='label', axis=0, ascending=True, inplace=True)
#--------------------------------------------------------------------------#

# Create a 3D graph
fig = px.scatter_3d(df, 
                    x='x',
                    y='y',
                    z='z',
                    color='label',
                    height=600,
                    width=750,
                    custom_data=['file_name','title_head','label','x','y','z'])
px.scatter_3d()

# Update chart looks
fig.update_layout(title_text=title_text,
                  showlegend=True,
                  legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5))

labels = ["Article File Name: %{customdata[0]}",
          "Article Title: %{customdata[1]}",
          "Grupo: %{customdata[2]}",
          "X: %{x}",
          "Y: %{y}",
          "Z: %{z}"]
            
fig.update_traces(hovertemplate="<br>".join(labels))

fig.update_coloraxes(showscale=False)

# Update marker size
# fig.update_traces(marker=dict(size=3, line=dict(color='black', width=0.1)))

fig.show()

In [66]:
cluster_labels = make_clustering(df_tfidf_abstract_abs.values)
X, y = reduce_dimensionality(df_tfidf_abstract_abs.values, y=cluster_labels, n_components=2), cluster_labels
title_text = "Clustering"

dict_dfs['df_doc_info']['file_name'] = dict_dfs['df_doc_info']['file'].apply(lambda e: os.path.split(e)[-1])
    
# Concatenate X and y arrays
article_title = dict_dfs['df_doc_head']['title_head'].apply(lambda e: ''.join([str(e)[0:20],'...']) if len(str(e)) >= 10 else str(e)).values.reshape(dict_dfs['df_doc_info']['file'].shape[0],1)
file_name = dict_dfs['df_doc_info']['file_name'].values.reshape(dict_dfs['df_doc_info']['file_name'].shape[0],1)

arr_concat=np.concatenate((X,
                           y.reshape(y.shape[0],1),
                           file_name,
                           article_title), axis=1)

# Create a Pandas dataframe using the above array
df=pd.DataFrame(arr_concat, columns=['x', 'y', 'label', 'file_name', 'title_head'])
# Convert label data type from float to integer
df['label'] = df['label'].astype(int)
# Finally, sort the dataframe by label
df.sort_values(by='label', axis=0, ascending=True, inplace=True)
#--------------------------------------------------------------------------#

# Create a 3D graph
fig = px.scatter(df, 
                 x='x',
                 y='y',
                 color='label',
                 height=600,
                 width=750,
                 custom_data=['file_name','title_head','label','x','y'])

# Update chart looks
fig.update_layout(title_text=title_text,
                  showlegend=True,
                  legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5))

labels = ["Article File Name: %{customdata[0]}",
          "Article Title: %{customdata[1]}",
          "Grupo: %{customdata[2]}",
          "X: %{x}",
          "Y: %{y}"]
            
fig.update_traces(hovertemplate="<br>".join(labels))

fig.update_coloraxes(showscale=False)

# Update marker size
fig.update_traces(marker=dict(size=10, line=dict(color='black', width=1)))

fig.show()

In [67]:
px.scatter?

[1;31mSignature:[0m
[0mpx[0m[1;33m.[0m[0mscatter[0m[1;33m([0m[1;33m
[0m    [0mdata_frame[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mx[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0my[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolor[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msymbol[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mhover_name[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mhover_data[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcustom_data[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtext[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_row[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_col[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_col_wrap[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0