# Features de TextMining:

---------------------------------

In [1]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pyvis.network import Network
import nltk

import random

import plotly.graph_objects as go

import networkx as nx

!pip install markupsafe==2.0.1

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

### Definindo variáveis e caminhos

In [3]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Funções para execução em batch

In [4]:
def get_path(path_input_path):
    """Return ``path_input_path`` if it exists on disk, otherwise the current working directory."""
    return path_input_path if os.path.exists(path_input_path) else os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    """Send the content of ``path_input_path`` through GROBID's full-text service.

    A ``grobid_cli`` is created from ``config_path`` and its ``process_pdfs``
    method is called with the caller's path, cache settings and worker count,
    plus the fixed extraction options listed below.

    Returns whatever ``process_pdfs`` returns.
    """
    # Extraction options that are the same for every run of this notebook.
    fixed_options = dict(service="processFulltextDocument",
                         generateIDs=True,
                         include_raw_citations=True,
                         include_raw_affiliations=True,
                         consolidate_header=False,
                         consolidate_citations=False,
                         tei_coordinates=False,
                         segment_sentences=True,
                         verbose=True)

    client = grobid_cli(config_path=config_path)
    return client.process_pdfs(input_path=path_input_path,
                               check_cache=check_cache,
                               cache_folder_name=cache_folder_name,
                               n_workers=n_workers,
                               **fixed_options)


def get_dataframes(result_batch):
    """Turn a batch result into dataframes via ``xmltei_to_dataframe``.

    Returns the pair produced by ``get_dataframe_articles``:
    ``(dict_dfs, dic_errors)``.
    """
    converter = xmltei_to_dataframe()
    frames, errors = converter.get_dataframe_articles(result_batch)
    return frames, errors


def files_path(path):
    """Return the full paths of the regular files located directly inside ``path``.

    Sub-directories are skipped; entries keep the order given by ``os.listdir``.
    """
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

In [5]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):
    """Process a folder of articles with GROBID and collect run metadata.

    Parameters
    ----------
    path_input : str
        Folder holding the input articles. ``get_path`` replaces it with the
        current working directory when it does not exist.
    n_workers : int
        Worker count forwarded to ``batch_process_path``.
    check_cache : bool
        Forwarded to ``batch_process_path``.
    cache_folder_name : str
        Cache folder name, forwarded to ``batch_process_path`` and to
        ``save_xmltei_files``.
    config_path : str
        Path of the GROBID client configuration file. It is resolved to an
        absolute path; the default resolves to ``<cwd>/grobid/config.json``.

    Returns
    -------
    tuple
        ``(dict_dfs, dict_exec, dic_errors)``: the two results of
        ``get_dataframes`` plus ``dict_exec``, a dict describing the run
        (input path, config file, input files, worker count, start/end
        timestamps and elapsed time in seconds and minutes).
    """
    dict_exec = {'path': path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()

    # Honour the caller's config_path instead of overwriting it with a
    # hard-coded location (the default still points to <cwd>/grobid/config.json).
    config_path = os.path.abspath(config_path)
    dict_exec['grobid_config'] = config_path

    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)

    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers

    path_input_path = get_path(path_input)
    # Forward every option the caller supplied; cache_folder_name and
    # config_path were previously dropped here.
    result_batch = batch_process_path(path_input_path,
                                      n_workers=dict_exec['n_workers'],
                                      check_cache=check_cache,
                                      cache_folder_name=cache_folder_name,
                                      config_path=config_path)
    dict_dfs, dic_errors = get_dataframes(result_batch)

    # Save next to the folder that was actually processed rather than relying
    # on a notebook-level global defined in another cell.
    gcli.save_xmltei_files(result_batch, path_input_path, cache_folder_name=cache_folder_name)

    dict_exec['end_datetime'] = datetime.datetime.now()
    elapsed = dict_exec['end_datetime'] - dict_exec['start_datetime']
    # total_seconds() also counts whole days, which the .seconds attribute drops.
    dict_exec['time_exec_sec'] = int(elapsed.total_seconds())
    dict_exec['time_exec_min'] = elapsed.total_seconds() / 60

    return dict_dfs, dict_exec, dic_errors

In [6]:
# Articles folder, resolved from the project root (the parent of the working
# directory) so the notebook does not depend on one user's absolute path.
input_folder_path = os.path.join(os.path.dirname(os.getcwd()), 'artifacts', 'articles', 'ml_material')

In [7]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 26.1 s


---------------------------------

### Trabalhando no tratamento do texto

In [8]:
import nltk
#import spacy
#import corenlp
#import textblob
#import gensim
#import transformers

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

--------------------------------

In [9]:
from text import text_prep, text_mining, text_viz

In [10]:
tprep = text_prep()

In [11]:
dict_dfs['df_doc_info']['acknowledgement_prep'] = tprep.text_preparation_column(dict_dfs['df_doc_info']['acknowledgement'])
dict_dfs['df_doc_info']['abstract_prep'] = tprep.text_preparation_column(dict_dfs['df_doc_info']['abstract'])
dict_dfs['df_doc_info']['body_prep'] = tprep.text_preparation_column(dict_dfs['df_doc_info']['body'])

--------------------------------

Criando BOW e TFIDF:

In [12]:
tmining = text_mining()

In [13]:
documents_abs = dict_dfs['df_doc_info']['abstract_prep'].fillna(' ').tolist()
documents_body = dict_dfs['df_doc_info']['body_prep'].fillna(' ').tolist()

In [14]:
df_tfidf_abstract_abs = tmining.get_df_tfidf(documents_abs)
df_tfidf_abstract_body = tmining.get_df_tfidf(documents_body)

In [15]:
df_bow_abstract_abs = tmining.get_df_bow(documents_abs)
df_bow_abstract_body = tmining.get_df_bow(documents_body)

In [16]:
df_tfidf_abstract_abs.head()

Unnamed: 0,aa,aa aluminum,aare,ab,ab initio,abaqus,abilities,ability,ability feasible,ability gfa,...,zirconia,zn,zn alloy,zn coat,zncl,zone,zone fz,zr,zr hf,zro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.064436,0.064436,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df_bow_abstract_body.head()

Unnamed: 0,aa,aa aa,aa ab,aa alloy,aa aluminium,aa aluminum,aa bb,aa sample,aa solution,aa vector,...,zunger,zunger pseudopotential,zuo,zuo et,zwick,zwickroell,zx,zy,zz,zz xx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_bow_abstract_body.shape

(581, 176907)

--------------------------------

Gerando agrupamento simples

In [19]:
import plotly.express as px # for data visualization

def chart(X, y, title_text='UMAP'):
    """Display an interactive 3D scatter plot of ``X`` coloured by the labels in ``y``.

    ``X`` supplies the three plotted columns (named x, y and z) and ``y`` one
    label per row. Labels are cast to int for sorting and to str for colouring.
    ``title_text`` is used as the figure title.
    """
    # Put coordinates and labels side by side in a single table.
    labels_column = y.reshape(len(y), 1)
    points = np.hstack((X, labels_column))
    df = pd.DataFrame(points, columns=['x', 'y', 'z', 'label'])

    # Sorting by label is optional: it only keeps the colour assigned to each
    # label consistent across several charts.
    df = df.astype({'label': int}).sort_values(by='label', ascending=True)

    fig = px.scatter_3d(df, x='x', y='y', z='z',
                        color=df['label'].astype(str),
                        height=500, width=750)

    # Horizontal legend centred below the plot.
    legend_style = dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5)
    fig.update_layout(title_text=title_text, showlegend=True, legend=legend_style)

    # Small markers with a thin black outline.
    marker_style = dict(size=3, line=dict(color='black', width=0.1))
    fig.update_traces(marker=marker_style)

    fig.show()

!pip install umap-learn

In [20]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [21]:
# Model-selection sweep: fit KMeans on the abstract TF-IDF matrix for every
# candidate number of clusters and record internal validation metrics.
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Candidate cluster counts: 2 up to (but not including) the integer part of
# the square root of the number of documents.
lim_sup = range(2,int(df_tfidf_abstract_abs.shape[0]**0.5))
list_result = []

for c in lim_sup:
    
    objGroup = KMeans(n_clusters=c,
                      init='k-means++',
                      n_init=10,
                      max_iter=30,
                      tol=1e-4, 
                      random_state=0)
    
    objGroup = objGroup.fit(df_tfidf_abstract_abs.values)
    
    # One row of metrics per candidate cluster count.
    inertia = objGroup.inertia_
    s = silhouette_score(df_tfidf_abstract_abs.values, objGroup.labels_, metric='euclidean', random_state=0)
    ch = calinski_harabasz_score(df_tfidf_abstract_abs.values, objGroup.labels_)
    db = davies_bouldin_score(df_tfidf_abstract_abs.values, objGroup.labels_)
    
    list_result.append({'cluster':c,
                        'inertia':inertia,
                        'silhouette':s,
                        'calinski_harabasz':ch,
                        'davies_bouldin':db})

df_metrics = pd.DataFrame(list_result)

# Cluster count preferred by each metric: the largest silhouette, the largest
# Calinski-Harabasz and the smallest Davies-Bouldin value.
# Note: `ch` and `db` are rebound here from metric scores to cluster counts.
ss = df_metrics.nlargest(1,'silhouette')['cluster'].iat[0]
ch = df_metrics.nlargest(1,'calinski_harabasz')['cluster'].iat[0]
db = df_metrics.nsmallest(1,'davies_bouldin')['cluster'].iat[0]

# Final cluster count: mean of the three preferred counts, truncated by int().
final_cluster_value = int(np.mean([ss, ch, db]))

# Refit with the selected count; later cells read `objGroup.labels_`.
objGroup = KMeans(n_clusters=final_cluster_value,
                  init='k-means++',
                  n_init=10,
                  max_iter=30,
                  tol=1e-4, 
                  random_state=0)

objGroup = objGroup.fit(df_tfidf_abstract_abs.values)

In [22]:
df_metrics = pd.DataFrame(list_result)
df_metrics

Unnamed: 0,cluster,inertia,silhouette,calinski_harabasz,davies_bouldin
0,2,515.726312,0.06837,29.963738,1.03856
1,3,509.570233,0.059454,18.628063,5.790792
2,4,505.69226,0.052113,13.967228,6.57566
3,5,502.827076,0.049956,11.337387,7.033313
4,6,500.280877,0.037313,9.685542,6.829482
5,7,498.766352,0.051391,8.37221,7.0999
6,8,496.419241,0.044946,7.584575,6.925871
7,9,495.555623,0.029973,6.761072,6.992614
8,10,493.913904,0.032074,6.230159,6.628511
9,11,492.927005,0.023742,5.72265,6.616513


In [23]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Project the abstract TF-IDF matrix onto 3 dimensions with two techniques so
# the KMeans labels can be inspected visually.

# t-SNE is stochastic: seed it so re-running the cell gives the same embedding.
objTSNE = TSNE(n_components=3, init='random', random_state=0)
X_TSNE = objTSNE.fit_transform(df_tfidf_abstract_abs.values)

objPCA = PCA(n_components=3, random_state=0)
X_PCA = objPCA.fit_transform(df_tfidf_abstract_abs.values)

# Name each chart after its projection; the default title of chart() is
# 'UMAP', which mislabelled both figures.
chart(X_PCA, objGroup.labels_, title_text='PCA')

chart(X_TSNE, objGroup.labels_, title_text='t-SNE')

In [24]:
from umap import UMAP

# Configure UMAP hyperparameters.
# Every argument is spelled out; per the inline notes most are the library
# defaults, while n_neighbors, n_components, n_epochs and random_state are
# set to non-default values.
reducer = UMAP(n_neighbors=100, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=3, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. 
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=42, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different. 
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded. 
              )

# The KMeans labels are passed as the second argument of fit_transform.
# NOTE(review): presumably this makes the embedding label-supervised, so part
# of the cluster separation seen in the plot may be induced by the labels
# themselves - confirm this is intended.
X_trans = reducer.fit_transform(df_tfidf_abstract_abs.values, objGroup.labels_)

In [25]:
chart(X_trans, objGroup.labels_)

Clustering Function

In [26]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [27]:
dict_dfs['df_doc_head'].head()

Unnamed: 0_level_0,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,publisher_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,Determination of deformation and failure prope...,,,,,,...,,,,10.1016/j.commatsci.2003.08.031,,,,,,
2,,,,,Journal Pre-proof Accelerate design of the mag...,,,,,,...,,,,10.1016/j.jmst.2021.03.082,,,,,,
3,,,,2020-11-09,Toward design of cation transport in solid-sta...,,,,,,...,,,,10.1016/j.cossms.2020.100875,,,,,,
4,,,,2021-03-02,Generalized stacking fault energies and Peierl...,,,,,,...,,,,10.1016/j.commatsci.2021.110364,,,,,,
5,,,,2021-09-22,Unsupervised segmentation of microstructural i...,,,,,,...,,,,10.1016/j.commatsci.2021.110855,,,,,,


In [28]:
def make_clustering(X,
                    metric_func=np.mean,
                    lim_sup=None, 
                    init='k-means++', 
                    n_init=10, 
                    max_iter=30, 
                    tol=1e-4, 
                    random_state=0):
    """Cluster ``X`` with KMeans, choosing the number of clusters automatically.

    KMeans is fitted once per candidate cluster count. For each fit the
    silhouette, Calinski-Harabasz and Davies-Bouldin scores are computed. The
    count preferred by each metric (largest silhouette, largest
    Calinski-Harabasz, smallest Davies-Bouldin) is combined with
    ``metric_func`` and truncated to an int; KMeans is then refitted with
    that count.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature matrix, one row per sample.
    metric_func : callable
        Combines the three preferred cluster counts into one number.
    lim_sup : iterable of int, optional
        Candidate cluster counts. Defaults to
        ``range(2, int(X.shape[0] ** 0.5))``.
    init, n_init, max_iter, tol, random_state
        Forwarded to every ``KMeans`` fit, including the final one.

    Returns
    -------
    The ``labels_`` attribute of the final KMeans fit.

    Raises
    ------
    ValueError
        If there is no candidate cluster count to evaluate.
    """
    # `is None` instead of `== None`: an array passed as lim_sup would be
    # compared element-wise by `==`.
    if lim_sup is None:
        lim_sup = range(2, int(X.shape[0] ** 0.5))
    candidates = list(lim_sup)
    if not candidates:
        raise ValueError("make_clustering: no candidate cluster counts to evaluate; "
                         "pass lim_sup explicitly or provide more samples")

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

    def _fit_kmeans(n_clusters):
        # Single place where KMeans is configured, so the sweep and the final
        # fit always share the caller's settings.
        return KMeans(n_clusters=n_clusters,
                      init=init,
                      n_init=n_init,
                      max_iter=max_iter,
                      tol=tol,
                      random_state=random_state).fit(X)

    list_result = []
    for c in candidates:
        model = _fit_kmeans(c)
        labels = model.labels_
        list_result.append({'cluster': c,
                            'inertia': model.inertia_,
                            'silhouette': silhouette_score(X, labels, metric='euclidean',
                                                           random_state=random_state),
                            'calinski_harabasz': calinski_harabasz_score(X, labels),
                            'davies_bouldin': davies_bouldin_score(X, labels)})

    df_metrics = pd.DataFrame(list_result)

    # Cluster count preferred by each metric.
    best_silhouette = df_metrics.nlargest(1, 'silhouette')['cluster'].iat[0]
    best_calinski = df_metrics.nlargest(1, 'calinski_harabasz')['cluster'].iat[0]
    best_davies = df_metrics.nsmallest(1, 'davies_bouldin')['cluster'].iat[0]

    final_cluster_value = int(metric_func([best_silhouette, best_calinski, best_davies]))

    return _fit_kmeans(final_cluster_value).labels_


def reduce_dimensionality(X, y=None, n_components=3, algorithm='UMAP'):
    """Project ``X`` onto ``n_components`` dimensions.

    Parameters
    ----------
    X : array-like
        Data to reduce, one row per sample.
    y : array-like, optional
        Only used by UMAP, where it is passed as the second argument of
        ``fit_transform``.
    n_components : int
        Number of output dimensions, honoured by every algorithm.
    algorithm : str
        ``"TSNE"`` or ``"PCA"``; any other value (including the default
        ``"UMAP"``) selects UMAP.

    Returns
    -------
    The reduced data returned by the chosen estimator's ``fit_transform``.
    """
    # Each backend is imported only when selected, so PCA / t-SNE do not
    # require umap-learn to be installed.
    if algorithm == "TSNE":
        from sklearn.manifold import TSNE
        # Seeded so repeated calls give the same embedding.
        return TSNE(n_components=n_components, init='random', random_state=0).fit_transform(X)

    if algorithm == "PCA":
        from sklearn.decomposition import PCA
        return PCA(n_components=n_components, random_state=0).fit_transform(X)

    from umap import UMAP

    # UMAP hyperparameters, all spelled out so the embedding does not depend
    # on the installed library's defaults.
    reducer = UMAP(n_neighbors=100,            # size of the local neighbourhood
                   n_components=n_components,  # dimension of the embedding space
                   metric='euclidean',         # distance in the input space
                   n_epochs=1000,              # optimisation epochs
                   learning_rate=1.0,
                   init='spectral',            # initialisation of the embedding
                   min_dist=0.1,               # minimum distance between embedded points
                   spread=1.0,                 # scale of embedded points
                   low_memory=False,
                   set_op_mix_ratio=1.0,
                   local_connectivity=1,
                   repulsion_strength=1.0,
                   negative_sample_rate=5,
                   transform_queue_size=4.0,
                   a=None,                     # derived from min_dist / spread when None
                   b=None,                     # derived from min_dist / spread when None
                   random_state=42,            # seed
                   metric_kwds=None,
                   angular_rp_forest=False,
                   target_n_neighbors=-1,      # -1 means: use n_neighbors
                   transform_seed=42,
                   verbose=False,
                   unique=False)

    return reducer.fit_transform(X, y)

In [29]:
cluster_labels = make_clustering(df_tfidf_abstract_abs.values)
X, y = reduce_dimensionality(df_tfidf_abstract_abs.values, y=cluster_labels, n_components=3), cluster_labels
title_text = "Group Articles"

dict_dfs['df_doc_info']['file_name'] = dict_dfs['df_doc_info']['file'].apply(lambda e: os.path.split(e)[-1])

# Hover text: only titles longer than 20 characters are cut and get an
# ellipsis (shorter titles used to receive a spurious '...').
# NOTE(review): titles come from df_doc_head and file names from df_doc_info;
# this assumes both frames list the same articles in the same order - confirm.
article_title = dict_dfs['df_doc_head']['title_head'].apply(lambda e: str(e) if len(str(e)) <= 20 else str(e)[0:20] + '...').values.reshape(dict_dfs['df_doc_info']['file'].shape[0],1)
file_name = dict_dfs['df_doc_info']['file_name'].values.reshape(dict_dfs['df_doc_info']['file_name'].shape[0],1)

# Put coordinates, labels and hover text side by side.
arr_concat=np.concatenate((X,
                           y.reshape(y.shape[0],1),
                           file_name,
                           article_title), axis=1)

# Create a Pandas dataframe using the above array
df=pd.DataFrame(arr_concat, columns=['x', 'y', 'z', 'label', 'file_name', 'title_head'])
# Mixing numbers and strings in one array yields generic object columns:
# restore numeric dtypes for the coordinates and the label.
df = df.astype({'x': float, 'y': float, 'z': float, 'label': int})
# Finally, sort the dataframe by label
df = df.sort_values(by='label', axis=0, ascending=True)

# Create a 3D graph
fig = px.scatter_3d(df, 
                    x='x',
                    y='y',
                    z='z',
                    color='label',
                    height=600,
                    width=750,
                    custom_data=['file_name','title_head','label','x','y','z'])

# Update chart looks
fig.update_layout(title_text=title_text,
                  showlegend=True,
                  legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5))

# One hover line per entry, joined with HTML line breaks.
labels = ["Article File Name: %{customdata[0]}",
          "Article Title: %{customdata[1]}",
          "Grupo: %{customdata[2]}",
          "X: %{x}",
          "Y: %{y}",
          "Z: %{z}"]

fig.update_traces(hovertemplate="<br>".join(labels))

# Hide the continuous colour bar; the group is shown in the hover box.
fig.update_coloraxes(showscale=False)

fig.show()

In [30]:
cluster_labels = make_clustering(df_tfidf_abstract_abs.values)
X, y = reduce_dimensionality(df_tfidf_abstract_abs.values, y=cluster_labels, n_components=2), cluster_labels
title_text = "Group Articles"

dict_dfs['df_doc_info']['file_name'] = dict_dfs['df_doc_info']['file'].apply(lambda e: os.path.split(e)[-1])

# Hover text: only titles longer than 20 characters are cut and get an
# ellipsis (shorter titles used to receive a spurious '...').
# NOTE(review): titles come from df_doc_head and file names from df_doc_info;
# this assumes both frames list the same articles in the same order - confirm.
article_title = dict_dfs['df_doc_head']['title_head'].apply(lambda e: str(e) if len(str(e)) <= 20 else str(e)[0:20] + '...').values.reshape(dict_dfs['df_doc_info']['file'].shape[0],1)
file_name = dict_dfs['df_doc_info']['file_name'].values.reshape(dict_dfs['df_doc_info']['file_name'].shape[0],1)

# Put coordinates, labels and hover text side by side.
arr_concat=np.concatenate((X,
                           y.reshape(y.shape[0],1),
                           file_name,
                           article_title), axis=1)

# Create a Pandas dataframe using the above array
df=pd.DataFrame(arr_concat, columns=['x', 'y', 'label', 'file_name', 'title_head'])
# Mixing numbers and strings in one array yields generic object columns:
# restore numeric dtypes for the coordinates and the label.
df = df.astype({'x': float, 'y': float, 'label': int})
# Finally, sort the dataframe by label
df = df.sort_values(by='label', axis=0, ascending=True)

# Create a 2D scatter plot
fig = px.scatter(df, 
                 x='x',
                 y='y',
                 color='label',
                 height=600,
                 width=750,
                 custom_data=['file_name','title_head','label','x','y'])

# Update chart looks
fig.update_layout(title_text=title_text,
                  showlegend=True,
                  legend=dict(orientation="h", yanchor="top", y=0, xanchor="center", x=0.5))

# One hover line per entry, joined with HTML line breaks.
labels = ["Article File Name: %{customdata[0]}",
          "Article Title: %{customdata[1]}",
          "Grupo: %{customdata[2]}",
          "X: %{x}",
          "Y: %{y}"]

fig.update_traces(hovertemplate="<br>".join(labels))

# Hide the continuous colour bar; the group is shown in the hover box.
fig.update_coloraxes(showscale=False)

# Update marker size
fig.update_traces(marker=dict(size=10, line=dict(color='black', width=1)))

fig.show()

In [31]:
px.scatter?

[1;31mSignature:[0m
[0mpx[0m[1;33m.[0m[0mscatter[0m[1;33m([0m[1;33m
[0m    [0mdata_frame[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mx[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0my[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolor[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msymbol[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mhover_name[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mhover_data[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcustom_data[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtext[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_row[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_col[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfacet_col_wrap[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0

In [32]:
dict_dfs['df_doc_head'].head()

Unnamed: 0_level_0,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,publisher_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,Determination of deformation and failure prope...,,,,,,...,,,,10.1016/j.commatsci.2003.08.031,,,,,,
2,,,,,Journal Pre-proof Accelerate design of the mag...,,,,,,...,,,,10.1016/j.jmst.2021.03.082,,,,,,
3,,,,2020-11-09,Toward design of cation transport in solid-sta...,,,,,,...,,,,10.1016/j.cossms.2020.100875,,,,,,
4,,,,2021-03-02,Generalized stacking fault energies and Peierl...,,,,,,...,,,,10.1016/j.commatsci.2021.110364,,,,,,
5,,,,2021-09-22,Unsupervised segmentation of microstructural i...,,,,,,...,,,,10.1016/j.commatsci.2021.110855,,,,,,


In [33]:
# Percentage of missing values per column of the document-header table.
(
    dict_dfs['df_doc_head']
    .isnull()
    .sum()
    .mul(100)
    .div(len(dict_dfs['df_doc_head']))
)

index_head             100.000000
id_head                100.000000
unstructured_head      100.000000
date_head               53.700516
title_head               1.721170
book_title_head        100.000000
series_title_head      100.000000
journal_head           100.000000
journal_abbrev_head    100.000000
publisher_head         100.000000
institution_head       100.000000
issn_head               99.311532
eissn_head             100.000000
volume_head            100.000000
issue_head             100.000000
pages_head             100.000000
first_page_head        100.000000
last_page_head         100.000000
note_head              100.000000
doi_head                 2.581756
pmid_head              100.000000
pmcid_head             100.000000
arxiv_id_head           99.827883
ark_head               100.000000
istex_id_head          100.000000
url_head               100.000000
dtype: float64

In [34]:
# List the names of the tables stored in dict_dfs.
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [35]:
# Percentage of missing values per column of the document-info table.
(
    dict_dfs['df_doc_info']
    .isnull()
    .sum()
    .mul(100)
    .div(len(dict_dfs['df_doc_info']))
)

grobid_version            0.000000
grobid_timestamp          0.000000
pdf_md5                   0.000000
language_code             0.000000
acknowledgement          21.686747
abstract                  2.925990
body                      0.000000
annex                   100.000000
file                      0.000000
status                    0.000000
raw_data                  0.000000
acknowledgement_prep     21.686747
abstract_prep             2.925990
body_prep                 0.000000
file_name                 0.000000
dtype: float64

In [36]:
# Percentage of missing values per column of the authors table.
(
    dict_dfs['df_doc_authors']
    .isnull()
    .sum()
    .mul(100)
    .div(len(dict_dfs['df_doc_authors']))
)

full_name_author       0.298285
given_name_author      1.677852
middle_name_author    75.913497
surname_author         0.298285
email_author          78.560776
orcid_author          96.905295
institution_author    17.337808
department_author     27.926920
laboratory_author     86.465324
addr_line_author      79.791201
post_code_author      33.146905
settlement_author     21.774795
country_author        14.839672
dtype: float64

In [37]:
# Percentage of missing values per column of the cited-authors table.
(
    dict_dfs['df_doc_authors_citations']
    .isnull()
    .sum()
    .mul(100)
    .div(len(dict_dfs['df_doc_authors_citations']))
)

id                        0.000000
index                     0.000000
full_name_citation        0.628332
given_name_citation       1.660241
middle_name_citation     69.106718
surname_citation          1.340333
email_citation          100.000000
orcid_citation          100.000000
institution_citation    100.000000
department_citation     100.000000
laboratory_citation     100.000000
addr_line_citation      100.000000
post_code_citation      100.000000
settlement_citation     100.000000
country_citation        100.000000
dtype: float64

In [38]:
# Display the document-header table (the notebook front-end truncates the middle rows/columns).
dict_dfs['df_doc_head']

Unnamed: 0_level_0,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,publisher_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,Determination of deformation and failure prope...,,,,,,...,,,,10.1016/j.commatsci.2003.08.031,,,,,,
2,,,,,Journal Pre-proof Accelerate design of the mag...,,,,,,...,,,,10.1016/j.jmst.2021.03.082,,,,,,
3,,,,2020-11-09,Toward design of cation transport in solid-sta...,,,,,,...,,,,10.1016/j.cossms.2020.100875,,,,,,
4,,,,2021-03-02,Generalized stacking fault energies and Peierl...,,,,,,...,,,,10.1016/j.commatsci.2021.110364,,,,,,
5,,,,2021-09-22,Unsupervised segmentation of microstructural i...,,,,,,...,,,,10.1016/j.commatsci.2021.110855,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,,,,,Machine learning as a contributor to physics: ...,,,,,,...,,,,10.1016/j.matdes.2019.107759,,,,,,
584,,,,,Journal Pre-proof Alloy design and properties ...,,,,,,...,,,,10.1016/j.msea.2020.140576,,,,,,
585,,,,,The yield and ultimate tensile strength of ste...,,,,,,...,,,,,,,,,,
586,,,,,Artificial Neural Network modeling and sensiti...,,,,,,...,,,,,,,,,,


In [39]:
# Percentage of missing values per column of the authors table
# (same computation as the earlier authors cell).
(
    dict_dfs['df_doc_authors']
    .isnull()
    .sum()
    .mul(100)
    .div(len(dict_dfs['df_doc_authors']))
)

full_name_author       0.298285
given_name_author      1.677852
middle_name_author    75.913497
surname_author         0.298285
email_author          78.560776
orcid_author          96.905295
institution_author    17.337808
department_author     27.926920
laboratory_author     86.465324
addr_line_author      79.791201
post_code_author      33.146905
settlement_author     21.774795
country_author        14.839672
dtype: float64

---------------------------------------------

Network graph with Plotly and NetworkX

In [40]:
class graph_plotly_networkx(object):
    """Build Plotly scatter traces (edges, nodes, labels) to draw a graph/network.

    Every method takes a graph ``G`` exposing ``G.nodes()``, ``G.edges()`` and
    ``G.nodes[n]['pos']`` (an ``(x, y)`` pair per node) and returns one or more
    ``go.Scatter`` traces that can be passed to ``go.Figure``.

    ``go`` (``plotly.graph_objects``) is resolved from the module-level
    imports of the notebook. Names bound in a class body are not visible from
    inside its methods, so importing the library in the class body (as was
    done before) had no effect on the methods and also prevented this text
    from being registered as the class docstring.
    """

    def __init__(self):
        pass


    @staticmethod
    def _node_xy(G):
        """Return two parallel lists with the x and y position of every node."""
        node_x = []
        node_y = []
        for node in G.nodes():
            x, y = G.nodes[node]['pos']
            node_x.append(x)
            node_y.append(y)
        return node_x, node_y


    @staticmethod
    def _edge_segment(G, edge):
        """Return the x and y lists of one edge, each closed by a ``None`` gap marker."""
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        return [x0, x1, None], [y0, y1, None]


    @staticmethod
    def _colorbar(title):
        """Return the colorbar settings shared by the node traces.

        Uses the nested ``title=dict(text=..., side=...)`` form instead of the
        deprecated flat ``titleside`` attribute.
        """
        return dict(thickness=15,
                    title=dict(text=title, side='right'),
                    xanchor='left')


    def plot_add_edges(self, G, color='#888', width=0.5):
        """Add edges one by one.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            color: line color applied to every edge trace.
            width: line width applied to every edge trace.

        Returns:
            list with one ``go.Scatter`` line trace per edge of ``G``.
        """
        list_edge_traces = []
        for edge in G.edges():
            edge_x, edge_y = self._edge_segment(G, edge)
            list_edge_traces.append(go.Scatter(x=edge_x, y=edge_y,
                                               line=dict(width=width,
                                                         color=color),
                                               hoverinfo='none',
                                               mode='lines',
                                               line_shape='spline'))
        return list_edge_traces


    def plot_add_all_edges(self, G, color='#888', width=0.5):
        """Add all edges.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            color: line color of the trace.
            width: line width of the trace.

        Returns:
            a single ``go.Scatter`` line trace containing every edge; the
            ``None`` entries separate one edge segment from the next.
        """
        edge_x = []
        edge_y = []
        for edge in G.edges():
            segment_x, segment_y = self._edge_segment(G, edge)
            edge_x.extend(segment_x)
            edge_y.extend(segment_y)

        return go.Scatter(x=edge_x, y=edge_y,
                          line=dict(width=width,
                                    color=color),
                          hoverinfo='none',
                          mode='lines',
                          line_shape='spline')


    def plot_add_nodes(self, G):
        """Add nodes one by one.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.

        Returns:
            list with one ``go.Scatter`` marker trace per node of ``G``. Each
            trace is created with an empty ``marker.color`` list and a fixed
            marker size of 10.
        """
        list_nodes_traces = []
        for node in G.nodes():
            x, y = G.nodes[node]['pos']
            list_nodes_traces.append(go.Scatter(x=[x],
                                                y=[y],
                                                mode='markers',
                                                hoverinfo='text',
                                                marker=dict(showscale=True,
                                                            colorscale='YlGnBu',
                                                            reversescale=True,
                                                            color=[],
                                                            size=10,
                                                            colorbar=self._colorbar('Node Connections'),
                                                            line=dict(width=2))))
        return list_nodes_traces


    def plot_add_all_nodes(self, G, size_list, color_list, text, opacity, colorbar_title="Node Connections"):
        """Add all nodes as a single marker trace.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            size_list: marker size(s), passed to ``marker.size``.
            color_list: marker color value(s), passed to ``marker.color``.
            text: hover text, passed to the trace ``text``.
            opacity: trace opacity.
            colorbar_title: title shown next to the colorbar.

        Returns:
            a single ``go.Scatter`` marker trace.
        """
        node_x, node_y = self._node_xy(G)

        # https://plotly.com/python-api-reference/generated/plotly.graph_objects.scatter.html#plotly.graph_objects.scatter.Marker
        return go.Scatter(x=node_x,
                          y=node_y,
                          mode='markers',
                          hoverinfo='text',
                          text=text,
                          opacity=opacity,
                          marker=dict(showscale=True,
                                      colorscale='Bluered',
                                      reversescale=True,
                                      color=color_list,
                                      size=size_list,
                                      colorbar=self._colorbar(colorbar_title),
                                      line=dict(width=2)))


    def plot_add_all_text(self, G, size_list, color_list, text, opacity):
        """Add one text label per node as a single text-mode trace.

        Args:
            G: graph whose nodes carry a ``'pos'`` attribute.
            size_list: value(s) passed to ``marker.size``.
            color_list: value(s) passed to ``marker.color``.
            text: label(s) drawn at the node positions (also used on hover).
            opacity: trace opacity.

        Returns:
            a single ``go.Scatter`` trace with ``mode='text'``.
        """
        node_x, node_y = self._node_xy(G)

        return go.Scatter(x=node_x,
                          y=node_y,
                          mode='text',
                          hoverinfo='text',
                          text=text,
                          opacity=opacity,
                          marker=dict(showscale=True,
                                      colorscale='Rainbow',
                                      reversescale=False,
                                      color=color_list,
                                      size=size_list))

Create random graph

In [43]:
# Seeded so the demo graph, and every figure derived from it, is identical
# on each Restart & Run All.
G = nx.random_geometric_graph(200, 0.125, seed=42)

gpn = graph_plotly_networkx()

node_adjacencies = []
node_text = []
node_size = []
opacity_list = []  # not used by this cell
# G.adjacency() yields (node, neighbors) pairs; only the neighbor count is needed.
for _, neighbors in G.adjacency():
    n_connections = len(neighbors)
    node_adjacencies.append(n_connections)
    node_text.append('# of connections: '+str(n_connections))
    # Marker size grows with the number of connections.
    node_size.append(2*n_connections)

# Despite the plural name, this is a single trace holding every edge.
edge_traces = gpn.plot_add_all_edges(G)
node_trace = gpn.plot_add_all_nodes(G, node_size, node_adjacencies, node_text, 1)
text_trace = gpn.plot_add_all_text(G, node_size, node_adjacencies, len(node_size)*['Testing Words'], 1)

Create Network Graph

In [44]:
# Assemble the figure: edges are drawn first so nodes and labels sit on top.
fig = go.Figure(data=[edge_traces, node_trace, text_trace],
                layout=go.Layout(
                    # Nested title/font form replaces the deprecated
                    # `titlefont_size` layout attribute.
                    title=dict(text='<br>Network Graph',
                               font=dict(size=12)),
                    height=750,
                    width=None,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=1,l=1,r=1,t=1), # b=20,l=5,r=5,t=40
                    annotations=[dict(text="Graph <a href=''>Link</a>",
                                      showarrow=False,
                                      xref="paper", yref="paper",
                                      x=0.005,
                                      y=-0.002)],
                    # Hide grid, zero line and tick labels on both axes.
                    xaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),
                    yaxis=dict(showgrid=False,
                               zeroline=False,
                               showticklabels=False),)
                )
fig.show()