# Testando Cliente do GROBID:

---------------------------------

### Importando dependências

In [1]:
import os
import sys
import re

# Prepend three directories to the module search path: the parent of the current
# working directory, the ./grobid folder under it, and the working directory itself.
# Every call inserts at index 0, so the LAST insert (cwd) ends up FIRST in sys.path.
# NOTE(review): presumably needed so the project-local imports below resolve — confirm.
sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

pip install grobid-tei-xml

### Definindo variáveis e caminhos

In [2]:
# All locations are resolved relative to the parent of the current working directory.
path = os.path.dirname(os.getcwd())

# Folder handed to the batch processing call as its input directory.
path_input = os.path.join(path, 'artifacts', 'test_article')

# Target folder for XML output.
path_output = os.path.join(path, 'output', 'xml')

# A single sample PDF that lives inside the input folder.
path_article = os.path.join(path_input, 'b617684b.pdf')

---------------------------------

### Inicializando o cliente GROBID e processando os PDFs

In [3]:
# Create the GROBID client wrapper from a JSON config file. The config path is
# relative, so it is resolved against the current working directory.
# NOTE(review): the recorded run prints "GROBID server is up and running", which
# suggests the constructor checks that the server is reachable — confirm.
gcli = grobid_cli(config_path="./grobid/config.json")

GROBID server is up and running


In [4]:
# Options for the batch run, kept in one place so they are easy to tweak.
# They are forwarded unchanged as keyword arguments to process_pdfs.
grobid_options = {
    "n_workers": 2,
    "service": "processFulltextDocument",
    "generateIDs": True,
    "include_raw_citations": True,
    "include_raw_affiliations": True,
    "consolidate_header": False,
    "consolidate_citations": False,
    "tei_coordinates": False,
    "segment_sentences": True,
    "verbose": True,
}

# Process every PDF found in the input folder and keep the batch result.
result_batch = gcli.process_pdfs(input_path=path_input, **grobid_options)

2 files to process in current batch


https://komax.github.io/blog/text/mining/grobid/

https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/

https://github.com/allenai/s2orc-doc2json

https://gitlab.com/internetarchive/grobid_tei_xml

https://github.com/delb-xml/delb-py

---------------------------------

### Transformando o XML TEI em DataFrames

- Há as seguintes entidades: artigo, autores, instituições e referências;
- Compondo uma publicação científica;
    - Dentro do artigo temos várias informações da entidade de dados;
    - E as referências possuem vários artigos, uma lista de artigos;

In [5]:
# Instantiate the TEI-XML-to-DataFrame converter imported from grobid_to_dataframe.
xml_to_df = xmltei_to_dataframe()

In [14]:
# NOTE(review): this cell carries execution count 14 although it sits between cells 5
# and 6, i.e. it was run out of order, and nothing in the visible cells uses
# `concurrent`. Consider moving it to the imports cell or removing it — confirm.
import concurrent

In [6]:
# Convert the batch result into a mapping of table name -> DataFrame.
dict_dfs = xml_to_df.get_dataframe_articles(result_batch)

In [7]:
# Show which tables the converter produced.
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [8]:
# Document-level table: in the recorded run it is indexed by pdf_md5 and holds the
# GROBID version/timestamp, language, text sections, source file, status and raw XML.
dict_dfs['df_doc_info']

Unnamed: 0_level_0,grobid_version,grobid_timestamp,language_code,acknowledgement,abstract,body,annex,file,status,raw_data
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,0.7.0,2022-04-25 22:06:00,en,Acknowledgements This work was carried out wit...,This article surveys and highlights the integr...,Introduction It is foreseeable that artificial...,,c:\Users\vierb\OneDrive\Área de Trabalho\Proje...,200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
EDC05F16295D6502DECA9A504932B81E,0.7.0,2022-04-25 22:06:00,en,Acknowledgements This work was supported by th...,We have performed powder neutron diffraction m...,"Introduction In recent years, the solid-state ...",,c:\Users\vierb\OneDrive\Área de Trabalho\Proje...,200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [9]:
# Header metadata of each processed document (e.g. date, title, DOI), indexed by pdf_md5.
dict_dfs['df_doc_head']

Unnamed: 0_level_0,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,publisher_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,,,,2007-02-08,Nanometer scale carbon structures for charge-t...,,,,,,...,,,,10.1039/b617684b,,,,,,
EDC05F16295D6502DECA9A504932B81E,,,,2000-06-21,Magnetic properties and crystal structures of ...,,,,,,...,,,,10.1039/b000689k,,,,,,


In [10]:
# Authors of each processed document: one row per author, with name parts and
# affiliation/address columns, indexed by pdf_md5.
dict_dfs['df_doc_authors']

Unnamed: 0_level_0,full_name_author,given_name_author,middle_name_author,surname_author,email_author,orcid_author,institution_author,department_author,laboratory_author,addr_line_author,post_code_author,settlement_author,country_author
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,Dirk M Guldi,Dirk,M,Guldi,,,Universita¨t Erlangen,Institute of Physical and Theoretical Chemistry,,,91058,Erlangen,Germany
EDC05F16295D6502DECA9A504932B81E,Yoshihiro Doi,Yoshihiro,,Doi,,,Hokkaido University,Graduate School of Science,,,060-0810,Sapporo,Japan
EDC05F16295D6502DECA9A504932B81E,Yukio Hinatsu,Yukio,,Hinatsu,,,Hokkaido University,Graduate School of Science,,,060-0810,Sapporo,Japan
EDC05F16295D6502DECA9A504932B81E,Ken-Ichi Oikawa,Ken-Ichi,,Oikawa,,,Japan Atomic Energy Research Institute,,,,319-1195,"Tokai-mura, Ibaraki",Japan
EDC05F16295D6502DECA9A504932B81E,Yutaka Shimojo,Yutaka,,Shimojo,,,Japan Atomic Energy Research Institute,,,,319-1195,"Tokai-mura, Ibaraki",Japan
EDC05F16295D6502DECA9A504932B81E,Yukio Morii,Yukio,,Morii,,,Japan Atomic Energy Research Institute,,,,319-1195,"Tokai-mura, Ibaraki",Japan


In [11]:
# References cited by each document: one row per citation (index_citation / id_citation),
# indexed by the pdf_md5 of the citing document.
dict_dfs['df_doc_citations']

Unnamed: 0_level_0,index_citation,id_citation,unstructured_citation,date_citation,title_citation,book_title_citation,series_title_citation,journal_citation,journal_abbrev_citation,publisher_citation,...,first_page_citation,last_page_citation,note_citation,doi_citation,pmid_citation,pmcid_citation,arxiv_id_citation,ark_citation,istex_id_citation,url_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,0,b0,Basic Research Needs for Solar Energy Utilizat...,2005,Basic Research Needs for Solar Energy Utilization,,,Report of the Basic Energy Sciences Workshop o...,,,...,,,,,,,,,,http://www.sc.doe.gov/bes/reports/files/SEU_rp...
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,1,b1,"V. Balzani, Electron Transfer in Chemistry, Wi...",2001,Electron Transfer in Chemistry,,,Weinheim,,Wiley-VCH,...,,,,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,2,b2,"A. Hirsch and M. Brettreich, Fullerenes: Chemi...",2000,Fullerenes: From Synthesis to Optoelectronic P...,Nuclear and Radiation Chemical Approaches to F...,,,,Kluwer Academic,...,,,"Fullerenes: Chemistry, Physics, and Technology",,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,3,b3,"F. Diederich, L. Isaacs and D. Philp, Chem. So...",1994,,,,Compt. Rend. Chim,,,...,,,Chem. Soc. Rev.,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,4,b4,"S. Reich, C. Thomsen and J. Maultzsch, Carbon ...",2004,Carbon Nanotubes: Basic Concepts and Physical ...,,,,,Wiley-VCH,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EDC05F16295D6502DECA9A504932B81E,4,b4,"P. D. Battle, C. W. Jones and F. Studer, J. So...",1991,,,,J. Solid State Chem,,,...,,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,5,b5,"D. Harada, M. Wakeshima and Y. Hinatsu, J. Sol...",1999,,,,J. Solid State Chem,,,...,,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,6,b6,"D. Harada, M. Wakeshima, Y. Hinatsu K. Ohoyama...",2000,,,,J. Phys.: Condens. Matter,,,...,,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,7,b7,"F. Izumi, in The Rietveld Method, ed. R. A. Yo...",1993,The Rietveld Method,,,,,Oxford University Press,...,,,,,,,,,,


In [12]:
# Authors of the cited references: one row per author, with `id`/`index` identifying
# the citation, indexed by pdf_md5.
# NOTE(review): the recorded output has a row with full name ": Fullerenes" for
# citation b2, which looks like a title fragment mis-parsed as an author — confirm.
dict_dfs['df_doc_authors_citations']

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
pdf_md5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,b0,0,,,,,,,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,b1,1,V Balzani,V,,Balzani,,,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,b2,2,A Hirsch,A,,Hirsch,,,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,b2,2,M Brettreich,M,,Brettreich,,,,,,,,,
5B70A65B9D2AEC3DD3E6A64B4BD94CB2,b2,2,: Fullerenes,:,,Fullerenes,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EDC05F16295D6502DECA9A504932B81E,b8,8,K Tezuka,K,,Tezuka,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,b8,8,Y Hinatsu,Y,,Hinatsu,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,b8,8,K Oikawa,K,,Oikawa,,,,,,,,,
EDC05F16295D6502DECA9A504932B81E,b8,8,Y Shimojo,Y,,Shimojo,,,,,,,,,


In [13]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
Collecting blinker
  Downloading blinker-1.4.tar.gz (111 kB)
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.1-py2.py3-none-any.whl (4.3 MB)
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting protobuf!=3.11,>=3.6.0
  Downloading protobuf-3.20.1-cp38-cp38-win_amd64.whl (904 kB)
Collecting pyarrow
  Downloading pyarrow-7.0.0-cp38-cp38-win_amd64.whl (16.1 MB)
Collecting tzlocal
  Downloading tzlocal-4.2-py3-none-any.whl (19 kB)
Collecting pympler>=0.9
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
Collecting altair>=3.2.0
  Downloading altair-4.2.0-py3-none-any.whl (812 kB)
Collecting cachetools>=4.0
  Downloading cachetools-5.0.0-py3-none-any.whl (9.1 kB)
Collecting semver
  Downloading semver-2.13.0-py2.py3-none-any.whl (12 kB)
Collecting gitdb<5,>=4.0.1
  Downloading git