# Features de TextMining:

---------------------------------

In [1]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pyvis.network import Network
import nltk

import random

!pip install markupsafe==2.0.1

In [2]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

### Definindo variáveis e caminhos

In [3]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Funções para execução em batch

In [4]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [5]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors

In [6]:
input_folder_path = r"""C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [7]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 23 s


---------------------------------

In [8]:
def getColumnsWithData(df, return_percent=False, n_round=2):
    
    """"""
    
    list_col_with_data = []
    for col in df.columns.tolist():
        rows = df[col].shape
        n_null = df[col].isnull().sum()
        not_null_data_perc = (1-n_null/rows)
        if not_null_data_perc:
            if return_percent:
                list_col_with_data.append((col,np.round(not_null_data_perc, n_round)))
            list_col_with_data.append(col)
            
    return list_col_with_data     

In [9]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

### df_doc_info

In [10]:
100*(1-dict_dfs['df_doc_info'].isnull().sum()/dict_dfs['df_doc_info'].shape[0])

grobid_version      100.000000
grobid_timestamp    100.000000
pdf_md5             100.000000
language_code       100.000000
acknowledgement      78.313253
abstract             97.074010
body                100.000000
annex                 0.000000
file                100.000000
status              100.000000
raw_data            100.000000
dtype: float64

In [11]:
df_doc_info = dict_dfs['df_doc_info'].loc[:,getColumnsWithData(dict_dfs['df_doc_info'])]

In [12]:
df_doc_info.head()

Unnamed: 0_level_0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,file,status,raw_data
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.7.0,2022-05-15 02:11:00,915208F947D9A8C5F93F958AF4435A39,en,Acknowledgments This work was supported by the...,The limitation of traditional Von Neumann arch...,Introduction Traditional computing system has ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
2,0.7.0,2022-05-15 02:09:00,BD5CAA459269F9DDAEED89D25B28B4C8,en,,For successful applications of machine learnin...,"Introduction To save time and human efforts, m...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
3,0.7.0,2022-05-15 02:10:00,A8D795899D1DE7A3D9CDA47F45F8885C,en,Acknowledgements XW would like to thank Yuanch...,The generalized stacking fault energies (GSFE)...,Introduction Refractory metals are metals with...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
4,0.7.0,2022-05-15 02:09:00,A89985654AE252596EE14410E0DA1F9B,en,Acknowledgements We want to acknowledge CSC-Sc...,"Despite their simplicity, the concepts of oxid...","Introduction The concept of valency, formulate...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
5,0.7.0,2022-05-15 02:09:00,82D64FDF1966AEB1230F90AF465E41BF,en,Acknowledgements,This paper concerns on the aluminum foam mater...,"Introduction As energy crisis intensified, the...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


### df_doc_head

In [13]:
100*(1-dict_dfs['df_doc_head'].isnull().sum()/dict_dfs['df_doc_head'].shape[0])

index_head              0.000000
id_head                 0.000000
unstructured_head       0.000000
date_head              46.299484
title_head             98.278830
book_title_head         0.000000
series_title_head       0.000000
journal_head            0.000000
journal_abbrev_head     0.000000
publisher_head          0.000000
institution_head        0.000000
issn_head               0.688468
eissn_head              0.000000
volume_head             0.000000
issue_head              0.000000
pages_head              0.000000
first_page_head         0.000000
last_page_head          0.000000
note_head               0.000000
doi_head               97.418244
pmid_head               0.000000
pmcid_head              0.000000
arxiv_id_head           0.172117
ark_head                0.000000
istex_id_head           0.000000
url_head                0.000000
dtype: float64

In [14]:
dict_dfs['df_doc_head'].dtypes

index_head             float64
id_head                float64
unstructured_head      float64
date_head               object
title_head              object
book_title_head        float64
series_title_head      float64
journal_head           float64
journal_abbrev_head    float64
publisher_head         float64
institution_head       float64
issn_head               object
eissn_head             float64
volume_head            float64
issue_head             float64
pages_head             float64
first_page_head        float64
last_page_head         float64
note_head              float64
doi_head                object
pmid_head              float64
pmcid_head             float64
arxiv_id_head           object
ark_head               float64
istex_id_head          float64
url_head               float64
dtype: object

In [15]:
df_doc_head = dict_dfs['df_doc_head'].loc[:,getColumnsWithData(dict_dfs['df_doc_head'])]

In [16]:
df_doc_head

Unnamed: 0_level_0,date_head,title_head,issn_head,doi_head,arxiv_id_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2021-08-06,Linearity improvement of HfO x -based memristo...,,10.1016/j.mssp.2021.106131,
2,2021-01-29,Transfer learning for materials informatics us...,,10.1016/j.commatsci.2021.110314,
3,2021-03-02,Generalized stacking fault energies and Peierl...,,10.1016/j.commatsci.2021.110364,
4,2019-03-04,Computational Materials Science,,10.1016/j.commatsci.2019.01.046,
5,2013-04-09,Identification of material parameters for alum...,,10.1016/j.commatsci.2013.02.024,
...,...,...,...,...,...
583,2010-03-15,Prediction of the hot deformation behavior for...,,10.1016/j.commatsci.2010.02.031,
584,,Journal Pre-proof Interactive-quantum-chemical...,,10.1016/j.polymer.2020.122738,
585,2015-04-05,A c c e p t e d M a n u s c r i p t An impleme...,,10.1016/j.cplett.2015.04.019,
586,2014-03-19,Study on the hot deformation behavior of TC4-D...,,10.1016/j.msea.2014.03.047,


In [17]:
df_doc_info_head = df_doc_info.join(df_doc_head, how='left')

In [18]:
df_doc_info_head.head(5)

Unnamed: 0_level_0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,file,status,raw_data,date_head,title_head,issn_head,doi_head,arxiv_id_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.7.0,2022-05-15 02:11:00,915208F947D9A8C5F93F958AF4435A39,en,Acknowledgments This work was supported by the...,The limitation of traditional Von Neumann arch...,Introduction Traditional computing system has ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2021-08-06,Linearity improvement of HfO x -based memristo...,,10.1016/j.mssp.2021.106131,
2,0.7.0,2022-05-15 02:09:00,BD5CAA459269F9DDAEED89D25B28B4C8,en,,For successful applications of machine learnin...,"Introduction To save time and human efforts, m...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2021-01-29,Transfer learning for materials informatics us...,,10.1016/j.commatsci.2021.110314,
3,0.7.0,2022-05-15 02:10:00,A8D795899D1DE7A3D9CDA47F45F8885C,en,Acknowledgements XW would like to thank Yuanch...,The generalized stacking fault energies (GSFE)...,Introduction Refractory metals are metals with...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2021-03-02,Generalized stacking fault energies and Peierl...,,10.1016/j.commatsci.2021.110364,
4,0.7.0,2022-05-15 02:09:00,A89985654AE252596EE14410E0DA1F9B,en,Acknowledgements We want to acknowledge CSC-Sc...,"Despite their simplicity, the concepts of oxid...","Introduction The concept of valency, formulate...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2019-03-04,Computational Materials Science,,10.1016/j.commatsci.2019.01.046,
5,0.7.0,2022-05-15 02:09:00,82D64FDF1966AEB1230F90AF465E41BF,en,Acknowledgements,This paper concerns on the aluminum foam mater...,"Introduction As energy crisis intensified, the...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2013-04-09,Identification of material parameters for alum...,,10.1016/j.commatsci.2013.02.024,


In [19]:
df_doc_info_head.date_head = df_doc_info_head.date_head.apply(lambda e: pd.to_datetime(e))

In [20]:
df_doc_info_head['year'] = df_doc_info_head.date_head.apply(lambda e: e if pd.isna(e) else int(e.year))

In [21]:
df_doc_info_head.date_head.value_counts(dropna=False)

NaT           312
2021-08-02      4
2021-03-02      3
2021-03-10      3
2021-04-23      3
             ... 
2020-04-27      1
2017-11-06      1
2011-02-17      1
2020-05-05      1
2015-04-05      1
Name: date_head, Length: 240, dtype: int64

In [22]:
df_doc_info_head.year.value_counts(dropna=False).to_dict()

{NaT: 312,
 2021: 109,
 2020: 45,
 2019: 25,
 2011: 13,
 2012: 12,
 2008: 11,
 2010: 10,
 2009: 9,
 2015: 8,
 2018: 7,
 2013: 5,
 2017: 5,
 2016: 4,
 2014: 2,
 2007: 1,
 2000: 1,
 2005: 1,
 2006: 1}

In [23]:
pd.DataFrame(df_doc_info_head.year.value_counts(dropna=False).to_dict(), columns=['year','count'])

Unnamed: 0,year,count


In [24]:
import plotly.express as px

df_doc_info_head.year = df_doc_info_head.year.fillna('Null Value')

fig = px.pie(df_doc_info_head, 
             values='year', 
             names='year',
             title='Number of Articles by Year',
             hover_data=['year'], 
             labels={'values':'Percentage','year':'Year of Article'}, hole=.5)

fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

### df_doc_authors

In [25]:
100*(1-dict_dfs['df_doc_authors'].isnull().sum()/dict_dfs['df_doc_authors'].shape[0])

full_name_author      99.701715
given_name_author     98.322148
middle_name_author    24.086503
surname_author        99.701715
email_author          21.439224
orcid_author           3.094705
institution_author    82.662192
department_author     72.073080
laboratory_author     13.534676
addr_line_author      20.208799
post_code_author      66.853095
settlement_author     78.225205
country_author        85.160328
dtype: float64

In [26]:
df_doc_authors = dict_dfs['df_doc_authors'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors'])]

In [27]:
df_doc_authors.head()

Unnamed: 0_level_0,full_name_author,given_name_author,middle_name_author,surname_author,email_author,orcid_author,institution_author,department_author,laboratory_author,addr_line_author,post_code_author,settlement_author,country_author
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Heesoo Park,Heesoo,,Park,hpark@hbku.edu.qa,,Hamad Bin Khalifa University,,,,,Doha,Qatar
1,Raghvendra Mall,Raghvendra,,Mall,,,Hamad Bin Khalifa University,,,,,Doha,Qatar
1,Adnan Ali,Adnan,,Ali,,,Hamad Bin Khalifa University,,,,,Doha,Qatar
1,Stefano Sanvito,Stefano,,Sanvito,,,Trinity College,"School of Physics, AMBER and CRANN Institute",,Dublin 2,,,Ireland
1,Halima Bensmail,Halima,,Bensmail,,,Hamad Bin Khalifa University,,,,,Doha,Qatar


In [28]:
columns_select = ['country_author','settlement_author']
df_sun_agg = df_doc_authors.groupby(by=columns_select, as_index=False, dropna=True)['full_name_author'].count()

In [29]:
df_sun_agg = df_sun_agg.fillna("")
df_sun_agg.rename(columns={'country_author':'Author Country',
                           'settlement_author':'Author Settlement',
                           'full_name_author':'Number of Authors'},
                  inplace=True)

In [30]:
df_sun_agg.head()

Unnamed: 0,Author Country,Author Settlement,Number of Authors
0,Algeria,Tlemcen,2
1,Argentina,Bahía Blanca,4
2,Argentina,Buenos Aires,5
3,Argentina,La Plata,1
4,Argentina,Salta,1


In [31]:
import plotly.express as px
fig = px.sunburst(df_sun_agg, 
                  path=['Author Country',
                        'Author Settlement'],
                  values='Number of Authors')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [32]:
df_doc_authors.full_name_author.value_counts(dropna=False)

A R T I C L E I N F O    10
Acta Materialia           8
NaN                       8
Yu Sun                    6
Dane Morgan               6
                         ..
Y X Li                    1
X X Wang                  1
G Qin                     1
P F Gao                   1
J H Chen                  1
Name: full_name_author, Length: 2324, dtype: int64

In [33]:
list_delete_authors = ['A R T I C L E I N F O', np.nan, 'Null', 'NaN','nan', 'null', '', ' ']
filter_delete_authors = ~(df_doc_authors.full_name_author.isin(list_delete_authors))
df_doc_authors = df_doc_authors.loc[filter_delete_authors].copy()

In [34]:
top_authors = df_doc_authors.full_name_author.value_counts()
df_top_authors = pd.DataFrame({'Full Name':top_authors.index,
                               'Number of Articles':top_authors.values.tolist()})

df_top_authors

Unnamed: 0,Full Name,Number of Articles
0,Acta Materialia,8
1,N S Reddy,6
2,Weidong Zeng,6
3,Yu Sun,6
4,Dane Morgan,6
...,...,...
2317,Y X Li,1
2318,X X Wang,1
2319,G Qin,1
2320,P F Gao,1


In [35]:
import plotly.express as px

top_authors = df_top_authors.nlargest(20,'Number of Articles')
top_authors = top_authors.sort_values('Number of Articles',ascending=True)
fig = px.bar(top_authors,
             y='Full Name',
             x='Number of Articles',
             color='Number of Articles',
             width=600,
             height=700,text='Number of Articles')
fig.update(layout_coloraxis_showscale=False)
# fig.update_traces(showlegend=False)
# fig.update_traces(marker_showscale=False)
fig.update_xaxes(visible=False)
fig.update_layout(yaxis_title=None, xaxis_title=None)
fig.show()

### df_doc_authors_citations

In [36]:
100*(1-dict_dfs['df_doc_authors_citations'].isnull().sum()/dict_dfs['df_doc_authors_citations'].shape[0])

id                      100.000000
index                   100.000000
full_name_citation       99.371668
given_name_citation      98.339759
middle_name_citation     30.893282
surname_citation         98.659667
email_citation            0.000000
orcid_citation            0.000000
institution_citation      0.000000
department_citation       0.000000
laboratory_citation       0.000000
addr_line_citation        0.000000
post_code_citation        0.000000
settlement_citation       0.000000
country_citation          0.000000
dtype: float64

In [37]:
df_doc_authors_citations = dict_dfs['df_doc_authors_citations'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors_citations'])]

In [38]:
df_doc_authors_citations

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,b0,0,D B Mitzi,D,B,Mitzi
1,b0,0,C D Dimitrakopoulos,C,D,Dimitrakopoulos
1,b0,0,L L Kosbar,L,L,Kosbar
1,b1,1,H.-S Kim,H.-S,,Kim
1,b1,1,C.-R Lee,C.-R,,Lee
...,...,...,...,...,...,...
587,b42,42,P D Nellist,P,D,Nellist
587,b42,42,S J Pennycook,S,J,Pennycook
587,b43,43,J Frank,J,,Frank
587,b44,44,J H Chen,J,H,Chen


In [39]:
df_doc_authors_citations.full_name_citation.value_counts(dropna=False)

NaN            766
J Behler       374
Y Zhang        339
R Ramprasad    318
G Ceder        277
              ... 
T Gabel          1
L Bisbee         1
C Jewett         1
O Harling        1
J Radon          1
Name: full_name_citation, Length: 46790, dtype: int64

In [40]:
path_geo = os.path.join(path,'data','external')

In [41]:
shapes_geometry = pd.read_csv(os.path.join(path_geo,'shapes_geometry.csv'), sep=';', decimal='.')
shapes_correct = pd.read_csv(os.path.join(path_geo,'shapes_correct.csv'), encoding='latin-1',sep=';', decimal='.')

In [42]:
df_country_agg = df_doc_authors.groupby(by=['country_author'], as_index=False, dropna=True)['full_name_author'].count()

dictCorrectShapes = {e[0]:e[1] for e in zip(shapes_correct.convert,shapes_correct.name)}

df_country_agg.country_author = df_country_agg.country_author.apply(lambda e: dictCorrectShapes.get(e,e))

In [43]:
df_country_agg.head()

Unnamed: 0,country_author,full_name_author
0,Algeria,6
1,Argentina,14
2,Australia,47
3,Austria,14
4,Bangladesh,3


In [44]:
shapes_geometry.head()

Unnamed: 0,continent,name,iso_a3,geometry
0,Oceania,Fiji,FJI,"MULTIPOLYGON (((180 -16.067132663642447, 180 -..."
1,Africa,Tanzania,TZA,POLYGON ((33.90371119710453 -0.950000000000000...
2,Africa,W. Sahara,ESH,POLYGON ((-8.665589565454809 27.65642588959235...
3,North America,Canada,CAN,MULTIPOLYGON (((-122.84000000000003 49.0000000...
4,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...


In [45]:
df_geo_plot = shapes_geometry.merge(df_country_agg, left_on='name', right_on='country_author')

In [46]:
df_geo_plot.head()

Unnamed: 0,continent,name,iso_a3,geometry,country_author,full_name_author
0,North America,Canada,CAN,MULTIPOLYGON (((-122.84000000000003 49.0000000...,Canada,30
1,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,489
2,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,159
3,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,18
4,South America,Argentina,ARG,MULTIPOLYGON (((-68.63401022758323 -52.6363704...,Argentina,14


In [47]:
df_geo_plot.full_name_author = df_geo_plot.full_name_author.fillna(0)

df_geo_plot = df_geo_plot.reset_index().set_index('index')
df_geo_plot.geometry.to_json()


In [224]:
df_geo_plot.rename(columns={'full_name_author':'Number of Authors','name':'Country'}, inplace=True)

df_geo_plot.geometry.iat[0]

In [255]:
import folium

my_map = folium.Map()
# Add the data
folium.Choropleth(
    geo_data=pd.read_json(os.path.join(path_geo,'geometry.json')).to_json(),
    name='choropleth',
    data=df_geo_plot,
    columns=['name'],
    key_on='feature.properties.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Authors by Location'
).add_to(my_map)
my_map.save('meat.html')

IndexError: list index out of range

In [49]:
!conda install geopandas

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Solving environment: ...working... 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
failed



Building graph of deps:   0%|          | 0/310 [00:00<?, ?it/s]
Examining threadpoolctl:   0%|          | 0/310 [00:00<?, ?it/s]
Examining babel:   0%|          | 1/310 [00:00<00:09, 32.73it/s]
Examining anaconda-navigator:   1%|          | 2/310 [00:00<00:15, 19.89it/s]
Examining anaconda-navigator:   1%|          | 3/310 [00:00<00:10, 29.84it/s]
Examining cloudpickle:   1%|          | 3/310 [00:04<00:10, 29.84it/s]       
Examining cloudpickle:   1%|▏         | 4/310 [00:04<06:35,  1.29s/it]
Examining requests:   1%|▏         | 4/310 [00:04<06:35,  1.29s/it]   
Examining requests:   2%|▏         | 5/310 [00:04<04:48,  1.06it/s]
Examining zope.interface:   2%|▏         | 5/310 [00:04<04:48,  1.06it/s]
Examining zope.interface:   2%|▏         | 6/310 [00:04<03:54,  1.30it/s]
Examining python-jsonrpc-server:   2%|▏         | 6/310 [00:05<03:54,  1.30it/s]
Examining python-jsonrpc-server:   2%|▏         | 7/310 [00:05<03:14,  1.56it/s]
Examining vs2015_runtime:   2%|▏         | 7/310 [0

https://stackoverflow.com/questions/43587960/gdal-installation-error-using-pip

In [50]:
import geopandas

ModuleNotFoundError: No module named 'geopandas'

In [262]:
!pip install fiona

Collecting fiona
  Using cached Fiona-1.8.21.tar.gz (1.0 MB)


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\vierb\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-c8m3jwnd\\fiona\\setup.py'"'"'; __file__='"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-c8m3jwnd\\fiona\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\vierb\AppData\Local\Temp\pip-pip-egg-info-weqxohpt'
         cwd: C:\Users\vierb\AppData\Local\Temp\pip-install-c8m3jwnd\fiona\
    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command out

In [None]:
pip install wheel
pip install pipwin

pipwin install numpy
pipwin install pandas
pipwin install shapely
pipwin install gdal
pipwin install fiona
pipwin install pyproj
pipwin install six
pipwin install rtree
pipwin install geopandas

In [2]:
pip install r"C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\GDAL-3.4.3-pp38-pypy38_pp73-win_amd64.whl"

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an EnvironmentError: Bad path: c:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\rC:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\GDAL-3.4.3-pp38-pypy38_pp73-win_amd64.whl



In [3]:
!conda install -c conda-forge gdal

In [33]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [34]:
    df_doc_info = dict_dfs['df_doc_info'].loc[:,getColumnsWithData(dict_dfs['df_doc_info'])]
    df_doc_head = dict_dfs['df_doc_head'].loc[:,getColumnsWithData(dict_dfs['df_doc_head'])]
    df_doc_authors = dict_dfs['df_doc_authors'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors'])]
    df_doc_authors_citations = dict_dfs['df_doc_authors_citations'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors_citations'])]

In [27]:
df_doc_info.head()

Unnamed: 0_level_0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,file,status,raw_data
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.7.0,2022-05-15 02:11:00,915208F947D9A8C5F93F958AF4435A39,en,Acknowledgments This work was supported by the...,The limitation of traditional Von Neumann arch...,Introduction Traditional computing system has ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
2,0.7.0,2022-05-15 02:09:00,BD5CAA459269F9DDAEED89D25B28B4C8,en,,For successful applications of machine learnin...,"Introduction To save time and human efforts, m...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
3,0.7.0,2022-05-15 02:10:00,A8D795899D1DE7A3D9CDA47F45F8885C,en,Acknowledgements XW would like to thank Yuanch...,The generalized stacking fault energies (GSFE)...,Introduction Refractory metals are metals with...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
4,0.7.0,2022-05-15 02:09:00,A89985654AE252596EE14410E0DA1F9B,en,Acknowledgements We want to acknowledge CSC-Sc...,"Despite their simplicity, the concepts of oxid...","Introduction The concept of valency, formulate...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
5,0.7.0,2022-05-15 02:09:00,82D64FDF1966AEB1230F90AF465E41BF,en,Acknowledgements,This paper concerns on the aluminum foam mater...,"Introduction As energy crisis intensified, the...",C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


In [28]:
df_doc_info.isnull().sum()

grobid_version        0
grobid_timestamp      0
pdf_md5               0
language_code         0
acknowledgement     126
abstract             17
body                  0
file                  0
status                0
raw_data              0
dtype: int64

In [29]:
df_doc_head.head()

Unnamed: 0_level_0,date_head,title_head,issn_head,doi_head,arxiv_id_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2021-08-06,Linearity improvement of HfO x -based memristo...,,10.1016/j.mssp.2021.106131,
2,2021-01-29,Transfer learning for materials informatics us...,,10.1016/j.commatsci.2021.110314,
3,2021-03-02,Generalized stacking fault energies and Peierl...,,10.1016/j.commatsci.2021.110364,
4,2019-03-04,Computational Materials Science,,10.1016/j.commatsci.2019.01.046,
5,2013-04-09,Identification of material parameters for alum...,,10.1016/j.commatsci.2013.02.024,


In [30]:
df_doc_head.isnull().sum()

date_head        312
title_head        10
issn_head        577
doi_head          15
arxiv_id_head    580
dtype: int64

In [31]:
df_doc_authors.head()

Unnamed: 0_level_0,full_name_author,given_name_author,middle_name_author,surname_author,email_author,orcid_author,institution_author,department_author,laboratory_author,addr_line_author,post_code_author,settlement_author,country_author
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Yutong Jiang,Yutong,,Jiang,,,Tianjin University of Technology,School of Electrical and Electronic Engineering,Tianjin Key Laboratory of Film Electronic and ...,,300384.0,Tianjin,PR China
1,Kailiang Zhang,Kailiang,,Zhang,kailiang_zhang@tjut.edu.cn,,Tianjin University of Technology,School of Electrical and Electronic Engineering,Tianjin Key Laboratory of Film Electronic and ...,,300384.0,Tianjin,PR China
1,Kai Hu,Kai,,Hu,,,Tianjin University of Technology,School of Electrical and Electronic Engineering,Tianjin Key Laboratory of Film Electronic and ...,,300384.0,Tianjin,PR China
1,Yujian Zhang,Yujian,,Zhang,,,,,,,,,
1,Ange Liang,Ange,,Liang,,,Tianjin University of Technology,School of Electrical and Electronic Engineering,Tianjin Key Laboratory of Film Electronic and ...,,300384.0,Tianjin,PR China


In [37]:
df_doc_authors.isnull().sum()

full_name_author         8
given_name_author       45
middle_name_author    2036
surname_author           8
email_author          2107
orcid_author          2599
institution_author     465
department_author      749
laboratory_author     2319
addr_line_author      2140
post_code_author       889
settlement_author      584
country_author         398
dtype: int64

In [35]:
df_doc_authors_citations.head()

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,b0,0,H L Park,H,L,Park
1,b0,0,Y Lee,Y,,Lee
1,b0,0,N Kim,N,,Kim
1,b0,0,D G Seo,D,G,Seo
1,b0,0,G T Go,G,T,Go


In [38]:
df_doc_authors_citations.isnull().sum()

id                          0
index                       0
full_name_citation        766
given_name_citation      2024
middle_name_citation    84248
surname_citation         1634
dtype: int64

In [39]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

In [41]:
dict_dfs['df_doc_authors_citations'].head()

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation,email_citation,orcid_citation,institution_citation,department_citation,laboratory_citation,addr_line_citation,post_code_citation,settlement_citation,country_citation
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,b0,0,H L Park,H,L,Park,,,,,,,,,
1,b0,0,Y Lee,Y,,Lee,,,,,,,,,
1,b0,0,N Kim,N,,Kim,,,,,,,,,
1,b0,0,D G Seo,D,G,Seo,,,,,,,,,
1,b0,0,G T Go,G,T,Go,,,,,,,,,


In [42]:
dict_dfs['df_doc_authors_citations'].head().T

article_id,1,1.1,1.2,1.3,1.4
id,b0,b0,b0,b0,b0
index,0,0,0,0,0
full_name_citation,H L Park,Y Lee,N Kim,D G Seo,G T Go
given_name_citation,H,Y,N,D,G
middle_name_citation,L,,,G,T
surname_citation,Park,Lee,Kim,Seo,Go
email_citation,,,,,
orcid_citation,,,,,
institution_citation,,,,,
department_citation,,,,,
