# Usando processo bath em um conjunto maior de artigos - Cliente do GROBID:

---------------------------------

### Importando depend√™ncias

In [1]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

!pip install nltk --upgrade

!pip install gensim --upgrade

!pip install spacy --upgrade

!pip install stanford-corenlp --upgrade

!pip install corenlp --upgrade

!pip install textblob --upgrade

!pip install transformers --upgrade

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

pip install grobid-tei-xml

### Definindo vari√°veis e caminhos

In [3]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Fun√ß√µes para execu√ß√£o em batch

In [4]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2, check_cache=True, cache_folder_name='summarticles_cache', config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [5]:
def run_batch_process(path_input, n_workers=6, check_cache=True, cache_folder_name='summarticles_cache', config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors


def tokenize_data(df_colum):
    """"""
        

In [6]:
input_folder_path = r"""C:\Users\vierb\OneDrive\√Årea de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [7]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 25.7 s


In [11]:
print(dict_exec.keys())

dict_keys(['path', 'start_datetime', 'grobid_config', 'files', 'num_files', 'n_workers', 'end_datetime', 'time_exec_sec', 'time_exec_min'])


In [12]:
dic_errors

{'number_article_error': 6,
 'list_article_error': [{'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\s41578-021-00351-7.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\advs.201903667.pdf',
   'error': ValueError,
   'error_text': 'If using all scalar values, you must pass an index',
   'keys_dict': dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'body'])},
  {'file': 'C:\\Users\\vierb\\OneDrive\\√Årea de Trabalho\\Projetos\\PGC\\artifacts\\articles\\ml_material\\j.commatsci.2020.109782.pdf',
   'error': xml.etree.ElementTree.ParseError,
   'error_text': 'syntax error: line 1, column 0',
   'keys_

In [None]:
dict_dfs['df_doc_info'].head(3).T

pdf_md5,F93B8E7FE9948A2FDBD600CE985ADCF8,4B56666A82B4FB26D98E4EF7A711800A,5C496F0824C8E623FD20401DE368953C
grobid_version,0.7.0,0.7.0,0.7.0
grobid_timestamp,2022-05-15 02:09:00,2022-05-15 02:08:00,2022-05-15 02:08:00
language_code,en,en,en
acknowledgement,Acknowledgement This work was partially funded...,Acknowledgements The authors would like to tha...,
abstract,It is an open question how the particle micros...,Figure 12. Schematic drawing showing preparati...,The ORCID identification number(s) for the aut...
body,Introduction and motivation Lithium-ion batter...,Introduction In response to the impending ener...,Introduction Pearlite transformation has recen...
annex,,,
file,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...,C:\Users\vierb\OneDrive\√Årea de Trabalho\Proje...
status,status 200,status 200,status 200
raw_data,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [None]:
dict_dfs['df_doc_head'].head(3).T

pdf_md5,F93B8E7FE9948A2FDBD600CE985ADCF8,4B56666A82B4FB26D98E4EF7A711800A,5C496F0824C8E623FD20401DE368953C
index_head,,,
id_head,,,
unstructured_head,,,
date_head,,,
title_head,Crack detection in lithium-ion cells using mac...,Improved Thermoelectric Properties of Hot-Extr...,Modeling of Alloying Effect on Isothermal Tran...
book_title_head,,,
series_title_head,,,
journal_head,,,
journal_abbrev_head,,,
publisher_head,,,


In [None]:
dict_dfs['df_doc_authors'].head(3).T

pdf_md5,F93B8E7FE9948A2FDBD600CE985ADCF8,F93B8E7FE9948A2FDBD600CE985ADCF8.1,F93B8E7FE9948A2FDBD600CE985ADCF8.2
full_name_author,Lukas Petrich,Daniel Westhoff,Julian Feinauer
given_name_author,Lukas,Daniel,Julian
middle_name_author,,,
surname_author,Petrich,Westhoff,Feinauer
email_author,lukas.petrich@uni-ulm.de,,
orcid_author,,,
institution_author,Ulm University,Ulm University,Ulm University
department_author,Institute of Stochastics,Institute of Stochastics,Institute of Stochastics
laboratory_author,,,
addr_line_author,,,


In [None]:
dict_dfs['df_doc_citations'].head(5)

Unnamed: 0_level_0,index_citation,id_citation,unstructured_citation,date_citation,title_citation,book_title_citation,series_title_citation,journal_citation,journal_abbrev_citation,publisher_citation,...,first_page_citation,last_page_citation,note_citation,doi_citation,pmid_citation,pmcid_citation,arxiv_id_citation,ark_citation,istex_id_citation,url_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F93B8E7FE9948A2FDBD600CE985ADCF8,0,b0,"D.P. Finegan, M. Scheel, J.B. Robinson, B. Tja...",2015,Inoperando high-speed tomography of lithium-io...,,,Nat. Commun,,,...,,,,10.1038/ncomms7924,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,1,b1,"D.P. Finegan, M. Scheel, J.B. Robinson, B. Tja...",2016,Investigating lithium-ion battery materials du...,,,PCCP,,,...,30912.0,30919.0,,10.1039/c6cp04251a,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,2,b2,"J. Jiang, J. Dahn, Effects of particle size an...",2004,Effects of particle size and electrolyte salt ...,,,Electrochim. Acta,,,...,2661.0,2666.0,,10.1016/j.electacta.2004.02.017,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,3,b3,"J. Geder, H.E. Hoster, A. Jossen, J. Garche, D...",2014,Impact of active material surface area on ther...,,,J. Power Sources,,,...,286.0,292.0,,10.1016/j.jpowsour.2014.01.116,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,4,b4,"L. Gillibert, D. Jeulin, 3D reconstruction and...",2013,3D reconstruction and analysis of the fragment...,,,Image Anal. Stereol,,,...,107.0,115.0,,10.5566/ias.v32.p107-115,,,,,,


In [None]:
dict_dfs['df_doc_authors_citations'].head(5)

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
F93B8E7FE9948A2FDBD600CE985ADCF8,b0,0,D P Finegan,D,P,Finegan,,,,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,b0,0,M Scheel,M,,Scheel,,,,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,b0,0,J B Robinson,J,B,Robinson,,,,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,b0,0,B Tjaden,B,,Tjaden,,,,,,,,,
F93B8E7FE9948A2FDBD600CE985ADCF8,b0,0,I Hunt,I,,Hunt,,,,,,,,,


---------------------------------

### Trabalhando no tratamento do texto

!pip install pytorch --upgrade

!pip install tensorflow --upgrade --user

In [13]:
import nltk
import spacy
import corenlp
import textblob
import gensim
#import transformers

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [55]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [57]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vierb\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

--------------------------------

Estudando a distribui√ß√£o de caracteres especiais:

In [16]:
list_chars = []
for id,row in dict_dfs['df_doc_info'].iterrows():
    for c in row['raw_data']:
        list_chars.append(c)
        
df_counts = pd.DataFrame({'chars':pd.value_counts(list_chars).index.tolist(),'counts':pd.value_counts(list_chars).tolist()})
df_counts = df_counts.sort_values(by='counts',ascending=False)

import plotly.express as px
fig = px.bar(df_counts.head(20), x='chars', y='counts')
fig.show()

In [17]:
df_counts.sort_values(by='counts',ascending=False).tail(30)

Unnamed: 0,chars,counts
741,–∞,1
740,ùë≠,1
739,ÔÇ≥,1
742,ùê®,1
749,‚áë,1
760,‚ëß,1
726,ƒÄ,1
750,‚ë¢,1
752,Ó¥¶,1
753,ùüï,1


--------------------------------

Tokenizando os dados:

In [26]:
abstract_tokenize = nltk.tokenize.word_tokenize(dict_dfs['df_doc_info'].abstract.iat[0], language='english', preserve_line=False)

Aplicando regex para eleminar qualquer caractere especial, uma simplifica√ß√£o r√°pida para avan√ßar:

In [89]:
regex = "[^a-zA-Z]+" # Testado em https://regex101.com/
replace = ''

words_only_chars = []
for word in abstract_tokenize:
    word = re.sub(regex, replace, word)
    
    if len(word) > 1:
        words_only_chars.append(word)

In [90]:
' '.join(words_only_chars)

'It is an open question how the particle microstructure of lithiumion electrode influences potential thermal runaway In order to investigate this information on the structural changes in particular cracked particles caused by the failure are desirable For reliable analysis of these changes reasonably large amount of data is necessary which necessitates automatic extraction of particle cracks from tomographic image data In this paper classification model is proposed which is able to decide whether pair of particles is the result of breakage of the image segmentation or neither The classifier is developed using simulated data based on stochastic particle model Its validity is tested by applying the methodology to handlabelled data from real electrode For this dataset an overall accuracy of is achieved'

Removendo stopwords:

In [91]:
stopwords = nltk.corpus.stopwords.words('english')

new_words = []
for word in words_only_chars:
    if word not in stopwords:
        new_words.append(word)

In [92]:
dict_dfs['df_doc_info'].abstract.iat[0]

'It is an open question how the particle microstructure of a lithium-ion electrode influences a potential thermal runaway. In order to investigate this, information on the structural changes, in particular cracked particles, caused by the failure are desirable. For a reliable analysis of these changes a reasonably large amount of data is necessary, which necessitates automatic extraction of particle cracks from tomographic 3D image data. In this paper, a classification model is proposed which is able to decide whether a pair of particles is the result of breakage, of the image segmentation, or neither. The classifier is developed using simulated data based on a 3D stochastic particle model. Its validity is tested by applying the methodology to hand-labelled data from a real electrode. For this dataset, an overall accuracy of 73% is achieved.'

In [93]:
' '.join(new_words)

'It open question particle microstructure lithiumion electrode influences potential thermal runaway In order investigate information structural changes particular cracked particles caused failure desirable For reliable analysis changes reasonably large amount data necessary necessitates automatic extraction particle cracks tomographic image data In paper classification model proposed able decide whether pair particles result breakage image segmentation neither The classifier developed using simulated data based stochastic particle model Its validity tested applying methodology handlabelled data real electrode For dataset overall accuracy achieved'

Lematiza√ß√£o:

In [94]:
obj_lemmatizer = nltk.stem.WordNetLemmatizer()

In [95]:
words_lemma = []
for word in new_words:
    words_lemma.append(obj_lemmatizer.lemmatize(word,pos=nltk.corpus.wordnet.VERB))

In [96]:
' '.join(words_lemma)

'It open question particle microstructure lithiumion electrode influence potential thermal runaway In order investigate information structural change particular crack particles cause failure desirable For reliable analysis change reasonably large amount data necessary necessitate automatic extraction particle crack tomographic image data In paper classification model propose able decide whether pair particles result breakage image segmentation neither The classifier develop use simulate data base stochastic particle model Its validity test apply methodology handlabelled data real electrode For dataset overall accuracy achieve'

In [97]:
' '.join(new_words)

'It open question particle microstructure lithiumion electrode influences potential thermal runaway In order investigate information structural changes particular cracked particles caused failure desirable For reliable analysis changes reasonably large amount data necessary necessitates automatic extraction particle cracks tomographic image data In paper classification model proposed able decide whether pair particles result breakage image segmentation neither The classifier developed using simulated data based stochastic particle model Its validity tested applying methodology handlabelled data real electrode For dataset overall accuracy achieved'

Stemiza√ß√£o:

In [98]:
p_stem = nltk.stem.PorterStemmer()

In [99]:
words_stem = []
for word in words_lemma:
    words_stem.append(p_stem.stem(word))

In [100]:
' '.join(words_stem)

'it open question particl microstructur lithiumion electrod influenc potenti thermal runaway in order investig inform structur chang particular crack particl caus failur desir for reliabl analysi chang reason larg amount data necessari necessit automat extract particl crack tomograph imag data in paper classif model propos abl decid whether pair particl result breakag imag segment neither the classifi develop use simul data base stochast particl model it valid test appli methodolog handlabel data real electrod for dataset overal accuraci achiev'

In [101]:
' '.join(new_words)

'It open question particle microstructure lithiumion electrode influences potential thermal runaway In order investigate information structural changes particular cracked particles caused failure desirable For reliable analysis changes reasonably large amount data necessary necessitates automatic extraction particle cracks tomographic image data In paper classification model proposed able decide whether pair particles result breakage image segmentation neither The classifier developed using simulated data based stochastic particle model Its validity tested applying methodology handlabelled data real electrode For dataset overall accuracy achieved'