# Testando Cliente do GROBID:

---------------------------------

### Importando dependências

In [1]:
import os
import sys
import re

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

pip install grobid-tei-xml

### Definindo variáveis e caminhos

In [2]:
# Project root: the parent of the notebook's current working directory.
path = os.path.dirname(os.getcwd())

# Input folder, output folder and the single test article, all under the root.
path_input = os.path.join(path,'artifacts','test_article')
path_output = os.path.join(path,'output','xml')
path_article = os.path.join(path_input,'b617684b.pdf')

# Address of the GROBID service.
host, port = 'http://localhost', 8070

---------------------------------

### Instanciando o cliente e processando o artigo

In [6]:
# Instantiate the GROBID client from the local JSON config file.
# NOTE(review): grobid_port is hard-coded as the string '8070' instead of reusing the
# `port` variable defined in the configuration cell — confirm which one is authoritative.
# NOTE(review): timeout=360 is presumably in seconds — confirm against the client code.
client = grobid_client.GrobidClient(config_path="./grobid/config.json",timeout=360,grobid_port='8070')

GROBID server is up and running


In [33]:
# Send the test article to the "processFulltextDocument" service; the call returns the
# PDF path, a status code and the resulting XML, as the following cells display.
pdf_file, status, xml = client.process_pdf(service="processFulltextDocument",
                                           pdf_file=path_article,
                                           generateIDs=True,
                                           consolidate_header=False, # Uses external information to consolidate the header metadata
                                           consolidate_citations=False, # Uses external information to consolidate the bibliography metadata
                                           include_raw_citations=True, # Citations
                                           include_raw_affiliations=True, # Affiliations
                                           tei_coordinates=False, # Generates coordinates used to visualize annotations on the PDF
                                           segment_sentences=True) # Uses an external engine to segment the sentences

In [34]:
pdf_file

'c:\\Users\\vierb\\OneDrive\\Área de Trabalho\\Projetos\\PGC\\artifacts\\test_article\\b617684b.pdf'

In [35]:
status # Status 408 means timeout

200

In [36]:
xml[0:2000]

'<?xml version="1.0" encoding="UTF-8"?>\n<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" \nxmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" \nxsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"\n xmlns:xlink="http://www.w3.org/1999/xlink">\n\t<teiHeader xml:lang="en">\n\t\t<fileDesc>\n\t\t\t<titleStmt>\n\t\t\t\t<title level="a" type="main" xml:id="_yK77Q7w">Nanometer scale carbon structures for charge-transfer systems and photovoltaic applications</title>\n\t\t\t</titleStmt>\n\t\t\t<publicationStmt>\n\t\t\t\t<publisher/>\n\t\t\t\t<availability status="unknown"><licence/></availability>\n\t\t\t\t<date type="published" when="2007-02-08">08 February 2007</date>\n\t\t\t</publicationStmt>\n\t\t\t<sourceDesc>\n\t\t\t\t<biblStruct>\n\t\t\t\t\t<analytic>\n\t\t\t\t\t\t<author>\n\t\t\t\t\t\t\t<persName><forename type="first">Dirk</forename><forename type="middle">M</forename><surname>Guldi

---------------------------------

### Convertendo documento XML TEI em JSON

In [37]:
dir(grobid_tei_xml)

['GrobidBiblio',
 'GrobidDocument',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'parse',
 'parse_citation_list_xml',
 'parse_citation_xml',
 'parse_citations_xml',
 'parse_document_xml',
 'types']

In [38]:
# Parse the TEI XML string and keep only its plain-dict representation,
# so `doc` is bound once and always refers to a dict.
doc = grobid_tei_xml.parse_document_xml(xml).to_dict()

doc.keys()

dict_keys(['grobid_version', 'grobid_timestamp', 'header', 'pdf_md5', 'language_code', 'citations', 'abstract', 'body', 'acknowledgement'])

In [39]:
doc['grobid_version']

'0.7.0'

In [40]:
doc['grobid_timestamp']

'2022-04-23T19:46+0000'

In [15]:
doc['header']

{'authors': [{'full_name': 'Dirk M Guldi',
   'given_name': 'Dirk',
   'middle_name': 'M',
   'surname': 'Guldi',
   'affiliation': {'institution': 'Universita¨t Erlangen',
    'department': 'Institute of Physical and Theoretical Chemistry',
    'address': {'post_code': '91058',
     'settlement': 'Erlangen',
     'country': 'Germany'}}}],
 'date': '2007-02-08',
 'title': 'Nanometer scale carbon structures for charge-transfer systems and photovoltaic applications',
 'doi': '10.1039/b617684b'}

In [16]:
doc['language_code']

'en'

In [18]:
doc['header']

{'authors': [{'full_name': 'Dirk M Guldi',
   'given_name': 'Dirk',
   'middle_name': 'M',
   'surname': 'Guldi',
   'affiliation': {'institution': 'Universita¨t Erlangen',
    'department': 'Institute of Physical and Theoretical Chemistry',
    'address': {'post_code': '91058',
     'settlement': 'Erlangen',
     'country': 'Germany'}}}],
 'date': '2007-02-08',
 'title': 'Nanometer scale carbon structures for charge-transfer systems and photovoltaic applications',
 'doi': '10.1039/b617684b'}

In [19]:
doc['citations']

[{'authors': [],
  'index': 0,
  'id': 'b0',
  'unstructured': 'Basic Research Needs for Solar Energy Utilization, Report of the Basic Energy Sciences Workshop on Solar Energy Utilization April 18-21, 2005-http://www.sc.doe.gov/bes/reports/files/ SEU_rpt.pdf.',
  'date': '2005',
  'title': 'Basic Research Needs for Solar Energy Utilization',
  'journal': 'Report of the Basic Energy Sciences Workshop on Solar Energy',
  'url': 'http://www.sc.doe.gov/bes/reports/files/SEU_rpt.pdf'},
 {'authors': [{'full_name': 'V Balzani',
    'given_name': 'V',
    'surname': 'Balzani'}],
  'index': 1,
  'id': 'b1',
  'unstructured': 'V. Balzani, Electron Transfer in Chemistry, Wiley-VCH, Wein- heim, 2001, vol. I-V.',
  'date': '2001',
  'title': 'Electron Transfer in Chemistry',
  'journal': 'Weinheim',
  'publisher': 'Wiley-VCH'},
 {'authors': [{'full_name': 'A Hirsch',
    'given_name': 'A',
    'surname': 'Hirsch'},
   {'full_name': 'M Brettreich', 'given_name': 'M', 'surname': 'Brettreich'},
   {'f

In [20]:
doc['abstract']

'This article surveys and highlights the integration of nanometer scale carbon structures-in combination with chromophores that exhibit (i) significant absorption cross section throughout the visible part of the solar spectrum and (ii) good electron donating power-into novel electron donor-acceptor conjugates (i.e., covalent) and hybrids (i.e., non-covalent). The focus of this article is predominantly on performance features-charge-transfer and photovoltaic-of the most promising solar energy conversion systems. Besides documenting fundamental advantages as they emerge around nanometer scale carbon structures, critical evaluations of the most recent developments in the fields are provided. Scheme 1 Molecular building blocks-C 60 and single wall carbon nanotubes (SWNT)-as electron acceptors.'

In [23]:
doc['acknowledgement']

'Acknowledgements This work was carried out with partial support from the SFB 583, DFG (GU 517/4-1), FCI, and the Office of Basic Energy Sciences of the U.S. Department of Energy. I am deeply indebted to Profs. Ford, Fukuzumi, Hirsch, Imahori, Jux, Kotov, Maggini, Martin, Prato, Schuster, Sessler, Torres, Valli, and Zilbermann for their productive collaborations and numerous stimulating discussions. This Invited Article is dedicated to Professor Ortwin Brede on the occasion of his 65th birthday and his retirement.'

In [26]:
doc['body'][0:2000]

'Introduction It is foreseeable that artificial photosynthetic systems that will ultimately power practical solar fuels production must be based on molecular and supramolecular assemblies.  1  Specific requirements that such assemblies must meet include the collection of light energy, separation of charges, and transport of charges to catalytic sites, where water oxidation and CO 2 reduction will occur. Notable progress has been made on each aspect of these complex problems-yet researchers have not developed components that are both efficient and robust, and have not integrated the existing functional components into a working system.  2  The design and development of light harvesting, photoconversion, and-as a long term aim-catalytic modules capable of self-ordering and self-assembling into an integrated functional unit will make it possible to realize efficient artificial photosynthetic systems based on nanometer scale carbon structures. \n Electron acceptors A first prominent exampl

https://komax.github.io/blog/text/mining/grobid/

https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/

https://github.com/allenai/s2orc-doc2json

https://gitlab.com/internetarchive/grobid_tei_xml

https://github.com/delb-xml/delb-py

---------------------------------

### Transformando JSON em um DataFrame

- Há as seguintes entidades: artigo, autores, instituições e referências;
- Compondo uma publicação científica;
    - Dentro do artigo temos várias informações da entidade de dados;
    - E as referências possuem vários artigos, uma lista de artigos;

In [54]:
import datetime
import json

In [84]:
def get_process_datetime(doc,varnull=np.nan,deltah=3):

    """Return the document's GROBID timestamp as a shifted, formatted string.

    The value stored under ``'grobid_timestamp'`` is parsed with the format
    ``%Y-%m-%dT%H:%M+0000``, ``deltah`` hours are subtracted from it and the
    result is formatted as ``'%Y-%m-%d %H:%M:00'``.

    Parameters
    ----------
    doc : dict
        Document dictionary; only the ``'grobid_timestamp'`` key is read.
    varnull : object, default np.nan
        Value returned when the key is missing or the stored value cannot be
        parsed with the expected format.
    deltah : int or float, default 3
        Number of hours subtracted from the parsed timestamp.

    Returns
    -------
    str or object
        The formatted timestamp; the stored null value (or ``varnull``) when
        the timestamp is missing/null; ``varnull`` when it cannot be parsed.
    """

    raw = doc.get('grobid_timestamp',varnull)
    if pd.isna(raw):
        return raw

    # Parse the value already fetched instead of indexing `doc` again, which
    # raised KeyError when the key was absent and `varnull` was not NaN.
    parsed = pd.to_datetime(raw,errors='coerce',format='%Y-%m-%dT%H:%M+0000')

    # errors='coerce' turns unparsable input into NaT, and NaT cannot be
    # formatted with strftime, so fall back to the null marker instead.
    if pd.isna(parsed):
        return varnull

    shifted = parsed-pd.Timedelta(deltah,unit='hour')
    return shifted.strftime('%Y-%m-%d %H:%M:00')

In [143]:
def get_doc(doc):

    """Collect the article-level fields of the parsed document.

    Parameters
    ----------
    doc : dict
        Parsed document dictionary.

    Returns
    -------
    list of dict
        A one-element list holding the article fields; any key absent from
        ``doc`` is filled with ``np.nan``. The ``'grobid_timestamp'`` entry is
        produced by ``get_process_datetime(doc, np.nan, 3)``.
    """

    # The annex key used to be spelled 'annex ' (trailing space) both in the
    # lookup and in the output column. Presumably the parser emits 'annex',
    # so the old lookup could never match — confirm against grobid_tei_xml.
    dict_article = {'grobid_version':doc.get('grobid_version',np.nan),
                    'grobid_timestamp':get_process_datetime(doc,np.nan,3),
                    'pdf_md5':doc.get('pdf_md5',np.nan),
                    'language_code':doc.get('language_code',np.nan),
                    'acknowledgement':doc.get('acknowledgement',np.nan),
                    'abstract':doc.get('abstract',np.nan),
                    'body':doc.get('body',np.nan),
                    'annex':doc.get('annex',np.nan)}

    return [dict_article]


def get_head(doc, suffix='head'):

    """Flatten the ``'header'`` entry of the document into a single row.

    Parameters
    ----------
    doc : dict
        Parsed document dictionary; ``'pdf_md5'`` and ``'header'`` are read.
    suffix : str, default 'head'
        Appended (after an underscore) to every bibliographic column name.

    Returns
    -------
    list of dict
        A one-element list. The row holds ``'pdf_md5'`` followed by one
        ``<field>_<suffix>`` entry per bibliographic field; fields missing
        from the header (or every field, when the header itself is
        missing/null) are ``np.nan``.
    """

    field_names = ('index','id','unstructured','date','title','book_title',
                   'series_title','journal','journal_abbrev','publisher',
                   'institution','issn','eissn','volume','issue','pages',
                   'first_page','last_page','note','doi','pmid','pmcid',
                   'arxiv_id','ark','istex_id','url')

    header = doc.get('header',np.nan)
    # A missing/null header is read as an empty mapping, so each lookup below
    # falls back to NaN.
    source = {} if pd.isna(header) else header

    row = {'pdf_md5':doc.get('pdf_md5',np.nan)}
    for field in field_names:
        row['_'.join([field,suffix])] = source.get(field,np.nan)

    return [row]


def get_authors(doc, key_doc='header',key_authors='authors',suffix='author'):

    """Flatten the author list found under ``doc[key_doc][key_authors]``.

    Parameters
    ----------
    doc : dict
        Parsed document dictionary; ``'pdf_md5'`` and ``key_doc`` are read.
    key_doc : str, default 'header'
        Key of the sub-dictionary that holds the author list.
    key_authors : str, default 'authors'
        Key of the author list inside that sub-dictionary.
    suffix : str, default 'author'
        Appended (after an underscore) to every author column name.

    Returns
    -------
    list of dict
        One row per author, each holding ``'pdf_md5'`` plus the personal,
        affiliation and address fields as ``<field>_<suffix>``. Absent values
        are ``np.nan``. When ``key_doc`` is missing/null or the author list is
        empty, a single all-NaN row (apart from ``'pdf_md5'``) is returned.
    """

    person_fields = ('full_name','given_name','middle_name','surname','email','orcid')
    affiliation_fields = ('institution','department','laboratory')
    address_fields = ('addr_line','post_code','settlement','country')

    def build_row(person,affiliation,address):
        # Column order: pdf_md5, person fields, affiliation fields, address fields.
        row = {'pdf_md5':doc.get('pdf_md5',np.nan)}
        for source,names in ((person,person_fields),
                             (affiliation,affiliation_fields),
                             (address,address_fields)):
            for name in names:
                row['_'.join([name,suffix])] = source.get(name,np.nan)
        return row

    container = doc.get(key_doc,np.nan)
    if pd.isna(container):
        return [build_row({},{},{})]

    people = container.get(key_authors,[])
    if not len(people):
        return [build_row({},{},{})]

    rows = []
    for person in people:
        affiliation = person.get('affiliation',np.nan)
        if pd.isna(affiliation):
            # No affiliation means no address either.
            rows.append(build_row(person,{},{}))
            continue
        address = affiliation.get('address',np.nan)
        rows.append(build_row(person,affiliation,{} if pd.isna(address) else address))
    return rows


def get_citations(doc, suffix='citation'):

    """Flatten the ``'citations'`` list of the document into one row each.

    Parameters
    ----------
    doc : dict
        Parsed document dictionary; ``'pdf_md5'`` and ``'citations'`` are read.
    suffix : str, default 'citation'
        Appended (after an underscore) to every bibliographic column name.

    Returns
    -------
    list of dict
        One row per citation, holding ``'pdf_md5'`` followed by one
        ``<field>_<suffix>`` entry per bibliographic field (``np.nan`` when
        absent). When there are no citations, a one-element list with an
        all-NaN row (apart from ``'pdf_md5'``) is returned.
    """

    field_names = ('index','id','unstructured','date','title','book_title',
                   'series_title','journal','journal_abbrev','publisher',
                   'institution','issn','eissn','volume','issue','pages',
                   'first_page','last_page','note','doi','pmid','pmcid',
                   'arxiv_id','ark','istex_id','url')
    pdf_md5 = doc.get('pdf_md5',np.nan)

    def build_row(source):
        row = {'pdf_md5':pdf_md5}
        for field in field_names:
            row['_'.join([field,suffix])] = source.get(field,np.nan)
        return row

    # `or []` also covers a 'citations' key that is present but None.
    citations = doc.get('citations') or []
    if not citations:
        # Always return a list of rows: a bare dict of scalars cannot be
        # passed to pd.DataFrame without an index.
        return [build_row({})]

    return [build_row(citation) for citation in citations]


def get_citation_authors(citation, suffix='citation', pdf_md5=np.nan):

    """Flatten the authors of a single citation into one row per author.

    Parameters
    ----------
    citation : dict
        One citation dictionary; ``'id'``, ``'index'`` and ``'authors'`` are read.
    suffix : str, default 'citation'
        Appended (after an underscore) to every author column name.
    pdf_md5 : object, default np.nan
        Value stored in the ``'pdf_md5'`` column of every row. It is passed in
        explicitly so the function does not read a global ``doc`` variable.

    Returns
    -------
    list of dict
        One row per author with ``'id'``, ``'index'``, ``'pdf_md5'`` and the
        personal, affiliation and address fields as ``<field>_<suffix>``
        (``np.nan`` when absent). A citation without authors yields a single
        row whose author fields are all NaN.
    """

    person_fields = ('full_name','given_name','middle_name','surname','email','orcid')
    affiliation_fields = ('institution','department','laboratory')
    address_fields = ('addr_line','post_code','settlement','country')

    def build_row(person,affiliation,address):
        row = {'id':citation.get('id',np.nan),
               'index':citation.get('index',np.nan),
               'pdf_md5':pdf_md5}
        for source,names in ((person,person_fields),
                             (affiliation,affiliation_fields),
                             (address,address_fields)):
            for name in names:
                row['_'.join([name,suffix])] = source.get(name,np.nan)
        return row

    # `or []` also covers an 'authors' key that is present but None.
    authors = citation.get('authors') or []
    if not authors:
        return [build_row({},{},{})]

    rows = []
    for author in authors:
        affiliation = author.get('affiliation',np.nan)
        if pd.isna(affiliation):
            # No affiliation means no address either.
            rows.append(build_row(author,{},{}))
            continue
        address = affiliation.get('address',np.nan)
        rows.append(build_row(author,affiliation,{} if pd.isna(address) else address))
    return rows


def get_citations_authors(doc, suffix='citation'):

    """Flatten the authors of every citation in the document.

    Parameters
    ----------
    doc : dict
        Parsed document dictionary; ``'pdf_md5'`` and ``'citations'`` are read.
    suffix : str, default 'citation'
        Appended (after an underscore) to every author column name; forwarded
        to ``get_citation_authors``.

    Returns
    -------
    list of dict
        The concatenated rows of ``get_citation_authors`` for each citation.
        When the document has no citations, a one-element list with an
        all-NaN row (apart from ``'pdf_md5'``) is returned.
    """

    pdf_md5 = doc.get('pdf_md5',np.nan)

    # `or []` also covers a 'citations' key that is present but None.
    citations = doc.get('citations') or []
    if not citations:
        # An empty citation produces exactly the default row, wrapped in a
        # list so the result can always be passed to pd.DataFrame.
        return get_citation_authors({},suffix=suffix,pdf_md5=pdf_md5)

    rows = []
    for citation in citations:
        rows.extend(get_citation_authors(citation,suffix=suffix,pdf_md5=pdf_md5))
    return rows


def 

In [145]:
df_doc_info = pd.DataFrame(get_doc(doc))
df_doc_info.head()

Unnamed: 0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,annex
0,0.7.0,2022-04-23 16:46:00,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,en,Acknowledgements This work was carried out wit...,This article surveys and highlights the integr...,Introduction It is foreseeable that artificial...,


In [147]:
df_doc_head = pd.DataFrame(get_head(doc))
df_doc_head.head()

Unnamed: 0,pdf_md5,index_head,id_head,unstructured_head,date_head,title_head,book_title_head,series_title_head,journal_head,journal_abbrev_head,...,first_page_head,last_page_head,note_head,doi_head,pmid_head,pmcid_head,arxiv_id_head,ark_head,istex_id_head,url_head
0,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,,,,2007-02-08,Nanometer scale carbon structures for charge-t...,,,,,...,,,,10.1039/b617684b,,,,,,


In [148]:
df_doc_authors = pd.DataFrame(get_authors(doc))
df_doc_authors.head()

Unnamed: 0,pdf_md5,full_name_author,given_name_author,middle_name_author,surname_author,email_author,orcid_author,institution_author,department_author,laboratory_author,addr_line_author,post_code_author,settlement_author,country_author
0,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,Dirk M Guldi,Dirk,M,Guldi,,,Universita¨t Erlangen,Institute of Physical and Theoretical Chemistry,,,91058,Erlangen,Germany


In [150]:
df_doc_citations = pd.DataFrame(get_citations(doc))
df_doc_citations.head()

Unnamed: 0,pdf_md5,index_citation,id_citation,unstructured_citation,date_citation,title_citation,book_title_citation,series_title_citation,journal_citation,journal_abbrev_citation,...,first_page_citation,last_page_citation,note_citation,doi_citation,pmid_citation,pmcid_citation,arxiv_id_citation,ark_citation,istex_id_citation,url_citation
0,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,0,b0,Basic Research Needs for Solar Energy Utilizat...,2005,Basic Research Needs for Solar Energy Utilization,,,Report of the Basic Energy Sciences Workshop o...,,...,,,,,,,,,,http://www.sc.doe.gov/bes/reports/files/SEU_rp...
1,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,1,b1,"V. Balzani, Electron Transfer in Chemistry, Wi...",2001,Electron Transfer in Chemistry,,,Weinheim,,...,,,,,,,,,,
2,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,2,b2,"A. Hirsch and M. Brettreich, Fullerenes: Chemi...",2000,Fullerenes: From Synthesis to Optoelectronic P...,Nuclear and Radiation Chemical Approaches to F...,,,,...,,,"Fullerenes: Chemistry, Physics, and Technology",,,,,,,
3,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,3,b3,"F. Diederich, L. Isaacs and D. Philp, Chem. So...",1994,,,,Compt. Rend. Chim,,...,,,Chem. Soc. Rev.,,,,,,,
4,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,4,b4,"S. Reich, C. Thomsen and J. Maultzsch, Carbon ...",2004,Carbon Nanotubes: Basic Concepts and Physical ...,,,,,...,,,,,,,,,,


In [149]:
df_doc_authors_citations = pd.DataFrame(get_citations_authors(doc))
df_doc_authors_citations.head()

Unnamed: 0,id,index,pdf_md5,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
0,b0,0,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,,,,,,,,,,,,,
1,b1,1,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,V Balzani,V,,Balzani,,,,,,,,,
2,b2,2,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,A Hirsch,A,,Hirsch,,,,,,,,,
3,b2,2,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,M Brettreich,M,,Brettreich,,,,,,,,,
4,b2,2,5B70A65B9D2AEC3DD3E6A64B4BD94CB2,: Fullerenes,:,,Fullerenes,,,,,,,,,
