# Usando processo bath em um conjunto maior de artigos - Cliente do GROBID:

---------------------------------

### Importando depend√™ncias

In [2]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

!pip install nltk --upgrade

!pip install gensim --upgrade

!pip install spacy --upgrade

!pip install stanford-corenlp --upgrade

!pip install corenlp --upgrade

!pip install textblob --upgrade

!pip install transformers --upgrade

In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

pip install grobid-tei-xml

### Definindo vari√°veis e caminhos

In [4]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Fun√ß√µes para execu√ß√£o em batch

In [5]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [6]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors


def tokenize_data(df_colum):
    """"""
        

In [7]:
input_folder_path = r"""C:\Users\vierb\OneDrive\√Årea de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [8]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 26.7 s


In [9]:
print(dict_exec.keys())

dict_keys(['path', 'start_datetime', 'grobid_config', 'files', 'num_files', 'n_workers', 'end_datetime', 'time_exec_sec', 'time_exec_min'])


In [10]:
dic_errors

{'number_article_error': 6,
 'list_article_error': [{'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\advs.201903667.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\S1006-706X(14)60038-8.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\j.commatsci.2020.109782.pdf',
   'error': xml.etree.ElementTree.ParseError,
   'error_text': 'syntax error: line 1, column 0',
   'ke

In [11]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,8A005E0AC489DFE9B5527FC54743A71B,39ED4EE32409D1D19F117A05FE91A6C0,1F79E1760A42B6D9E6F31886E30D5086
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:10:00,2022-05-15 02:11:00
language_code,en,en,en
acknowledgement,Acknowledgements The build used as a case stud...,Acknowledgement The authors would like to appr...,
abstract,In-situ detection of processing defects is a c...,"At the temperature raging from 700 to 950 ‚Ä¢ C,...",Alloy design and properties optimization of mu...
body,Introduction Many of the applications best sui...,Introduction Nitrogen alloyed stainless steels...,"Introduction Multi-component alloy, or high en..."
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [12]:
dict_dfs['df_doc_head'].head(3).T

pdf_md5,8A005E0AC489DFE9B5527FC54743A71B,39ED4EE32409D1D19F117A05FE91A6C0,1F79E1760A42B6D9E6F31886E30D5086
index_head,,,
id_head,,,
unstructured_head,,,
date_head,,,
title_head,A Multi-scale Convolutional Neural Network for...,Numerical simulation of Cr 2 N age-precipitati...,Journal Pre-proof Alloy design and properties ...
book_title_head,,,
series_title_head,,,
journal_head,,,
journal_abbrev_head,,,
publisher_head,,,


In [13]:
dict_dfs['df_doc_authors'].head(3).T

pdf_md5,8A005E0AC489DFE9B5527FC54743A71B,8A005E0AC489DFE9B5527FC54743A71B.1,39ED4EE32409D1D19F117A05FE91A6C0
full_name_author,Luke Scime,Jack Beuth,Q X Dai
given_name_author,Luke,Jack,Q
middle_name_author,,,X
surname_author,Scime,Beuth,Dai
email_author,lscime@alumni.cmu.edu,,qxdai@ujs.edu.cn
orcid_author,,,
institution_author,Carnegie Mellon University,Carnegie Mellon University,Jiangsu University
department_author,Department of a Mechanical Engineering,Department of a Mechanical Engineering,School of Materials Science and Engineering
laboratory_author,NextManufacturing Center,NextManufacturing Center,
addr_line_author,5000 Forbes Ave,5000 Forbes Ave,


In [14]:
dict_dfs['df_doc_citations'].head(5)

Unnamed: 0_level_0,index_citation,id_citation,unstructured_citation,date_citation,title_citation,book_title_citation,series_title_citation,journal_citation,journal_abbrev_citation,publisher_citation,...,first_page_citation,last_page_citation,note_citation,doi_citation,pmid_citation,pmcid_citation,arxiv_id_citation,ark_citation,istex_id_citation,url_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8A005E0AC489DFE9B5527FC54743A71B,0,b0,"K.M. Taminger, R.A. Hafley, Electron beam free...",2006,Electron beam freeform fabrication for cost ef...,,,,,,...,,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,1,b1,"T. Wohlers, T. Caffrey, Wohlers Report, Fort C...",2011,,,,,,,...,,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,2,b2,"C.A. Giffi, B. Gangula, P. Illinda, 3D opportu...",2014,3D opportunity for the automotive industry: Ad...,,,,,,...,,,,,,,,,,http://dupress.com/articles/additive-manufactu...
8A005E0AC489DFE9B5527FC54743A71B,3,b3,"D. Bourell, M. Leu, D. Rosen, Roadmap for addi...",2009-05-23,Roadmap for additive manufacturing: Identifyin...,,,,,,...,,,Solid Free. Fabr. Proc.,,,,,,,http://wohlersassociates.com/roadmap2009A.pdf
8A005E0AC489DFE9B5527FC54743A71B,4,b4,"M. Grasso, B.M. Colosimo, Process defects and ...",2017,Process defects and in situ monitoring methods...,,,Meas. Sci. Technol,,,...,,,,10.1088/1361-6501/aa5c4f,,,,,,


In [15]:
dict_dfs['df_doc_authors_citations'].head(5)

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8A005E0AC489DFE9B5527FC54743A71B,b0,0,K M Taminger,K,M,Taminger,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,b0,0,R A Hafley,R,A,Hafley,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,b1,1,T Wohlers,T,,Wohlers,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,b1,1,T Caffrey,T,,Caffrey,,,,,,,,,
8A005E0AC489DFE9B5527FC54743A71B,b1,1,Wohlers Report,Wohlers,,Report,,,,,,,,,


---------------------------------

### Trabalhando no tratamento do texto

!pip install pytorch --upgrade

!pip install tensorflow --upgrade --user

In [16]:
import nltk
import spacy
import corenlp
import textblob
import gensim
#import transformers

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [55]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [57]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

--------------------------------

Estudando a distribui√ß√£o de caracteres especiais:

In [17]:
list_chars = []
for id,row in dict_dfs['df_doc_info'].iterrows():
    for c in row['raw_data']:
        list_chars.append(c)
        
df_counts = pd.DataFrame({'chars':pd.value_counts(list_chars).index.tolist(),'counts':pd.value_counts(list_chars).tolist()})
df_counts = df_counts.sort_values(by='counts',ascending=False)

import plotly.express as px
fig = px.bar(df_counts.head(20), x='chars', y='counts')
fig.show()

In [18]:
df_counts.sort_values(by='counts',ascending=False).tail(30)

Unnamed: 0,chars,counts
741,ùüï,1
740,ùë∞,1
739,ùê®,1
742,–∏,1
749,‚î¨,1
760,‚áë,1
726,‚Ö•,1
750,‚ë•,1
752,·âÄ,1
753,·âÅ,1


--------------------------------

Tratando texto:

In [27]:
def text_tokenize(text, language='english', preserve_line=False):
    return nltk.tokenize.word_tokenize(text, language=language, preserve_line=preserve_line)

def clean_text_regex(words_list, regex="[^a-zA-Z]+", replace='', min_word_len=1):
    """Testado em https://regex101.com/"""
    new_words = []
    for word in words_list:
        word = re.sub(regex, replace, word)
        if len(word) > min_word_len:
            new_words.append(word)
    return new_words

def remove_stopwords(words_list, stopwords_list):
    """"""
    new_words = []
    for word in words_list:
        if word not in stopwords_list:
            new_words.append(word)
    return new_words

def lemmatizer(words_list):
    """"""
    obj_lemmatizer = nltk.stem.WordNetLemmatizer()
    words_lemma = []
    for word in words_list:
        words_lemma.append(obj_lemmatizer.lemmatize(word,pos=nltk.corpus.wordnet.VERB))
    return words_lemma

def stem_text(words_list):
    """"""
    p_stem = nltk.stem.PorterStemmer()
    words_stem = []
    for word in words_list:
        words_stem.append(p_stem.stem(word))
    return words_stem

In [44]:
def text_prep(text, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, text_lower=False, stopwords_list=[], language='english',
              preserve_line=False, regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1):
    
    """Text preparation."""
    
    text_preparation = text_tokenize(text, language=language, preserve_line=preserve_line)
    if clean_text:
        text_preparation = clean_text_regex(words_list=text_preparation,
                                            regex=regex_chars_clean,
                                            replace=replace_chars_clean,
                                            min_word_len=min_word_len)
    if stopwords_remove:
        text_preparation = remove_stopwords(words_list=text_preparation,
                                            stopwords_list=stopwords_list)
    if exec_lemmatizer:
        text_preparation = lemmatizer(words_list=text_preparation)
    if exec_stem:
        text_preparation = stem_text(words_list=text_preparation)
    text_preparation = ' '.join(text_preparation)
    if text_lower:
        text_preparation = text_preparation.lower()
    return text_preparation

In [50]:
def text_prep_column(colum_df):
    """"""
    f_prep_text = lambda text_data: text_prep(text=text_data, clean_text=True, stopwords_remove=True, exec_lemmatizer=True, exec_stem=False, 
                                            stopwords_list=nltk.corpus.stopwords.words('english'), language='english', preserve_line=False,
                                            regex_chars_clean="[^a-zA-Z]+", replace_chars_clean='', min_word_len=1, text_lower=True)
    colum_df = colum_df.apply(lambda e: e if pd.isna(e) else f_prep_text(e))
    return colum_df

In [51]:
dict_dfs['df_doc_info'].abstract.iat[0]

'In-situ detection of processing defects is a critical challenge for Laser Powder Bed Fusion Additive Manufacturing. Many of these defects are related to interactions between the recoater blade, which spreads the powder, and the powder bed. This work leverages Deep Learning, specifically a Convolutional Neural Network (CNN), for autonomous detection and classification of many of these spreading anomalies. Importantly, the input layer of the CNN is modified to enable the algorithm to learn both the appearance of the powder bed anomalies as well as key contextual information at multiple size scales. These modifications to the CNN architecture are shown to improve the flexibility and overall classification accuracy of the algorithm while mitigating many human biases. A case study is used to demonstrate the utility of the presented methodology and the overall performance is shown to be superior to that of methodologies previously reported by the authors.'

In [52]:
text_prep(text=dict_dfs['df_doc_info'].abstract.iat[0],
          clean_text=True,
          stopwords_remove=True,
          exec_lemmatizer=True,
          exec_stem=False,
          text_lower=True,
          stopwords_list=nltk.corpus.stopwords.words('english'),
          language='english',
          preserve_line=False,
          regex_chars_clean="[^a-zA-Z]+",
          replace_chars_clean='',
          min_word_len=1)

'insitu detection process defect critical challenge laser powder bed fusion additive manufacturing many defect relate interactions recoater blade spread powder powder bed this work leverage deep learning specifically convolutional neural network cnn autonomous detection classification many spread anomalies importantly input layer cnn modify enable algorithm learn appearance powder bed anomalies well key contextual information multiple size scale these modifications cnn architecture show improve flexibility overall classification accuracy algorithm mitigate many human bias case study use demonstrate utility present methodology overall performance show superior methodologies previously report author'

In [53]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,8A005E0AC489DFE9B5527FC54743A71B,39ED4EE32409D1D19F117A05FE91A6C0,1F79E1760A42B6D9E6F31886E30D5086
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:10:00,2022-05-15 02:11:00
language_code,en,en,en
acknowledgement,Acknowledgements The build used as a case stud...,Acknowledgement The authors would like to appr...,
abstract,In-situ detection of processing defects is a c...,"At the temperature raging from 700 to 950 ‚Ä¢ C,...",Alloy design and properties optimization of mu...
body,Introduction Many of the applications best sui...,Introduction Nitrogen alloyed stainless steels...,"Introduction Multi-component alloy, or high en..."
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [54]:
dict_dfs['df_doc_info']['acknowledgement_prep'] = text_prep_column(dict_dfs['df_doc_info']['acknowledgement'])
dict_dfs['df_doc_info']['abstract_prep'] = text_prep_column(dict_dfs['df_doc_info']['abstract'])
dict_dfs['df_doc_info']['body_prep'] = text_prep_column(dict_dfs['df_doc_info']['body'])

In [55]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,8A005E0AC489DFE9B5527FC54743A71B,39ED4EE32409D1D19F117A05FE91A6C0,1F79E1760A42B6D9E6F31886E30D5086
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:08:00,2022-05-15 02:10:00,2022-05-15 02:11:00
language_code,en,en,en
acknowledgement,Acknowledgements The build used as a case stud...,Acknowledgement The authors would like to appr...,
abstract,In-situ detection of processing defects is a c...,"At the temperature raging from 700 to 950 ‚Ä¢ C,...",Alloy design and properties optimization of mu...
body,Introduction Many of the applications best sui...,Introduction Nitrogen alloyed stainless steels...,"Introduction Multi-component alloy, or high en..."
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
