# Features de TextMining:

---------------------------------

In [2]:
import os
import sys
import re
import datetime
import dateutil

sys.path.insert(0,os.path.dirname(os.getcwd()))
sys.path.insert(0,os.path.join(os.getcwd(),'grobid'))
sys.path.insert(0,os.getcwd())

import numpy as np
import pandas as pd

from grobid import grobid_client
import grobid_tei_xml
from grobid_to_dataframe import grobid_cli, xmltei_to_dataframe

import plotly

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from pyvis.network import Network
import nltk

import random

!pip install markupsafe==2.0.1

In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

---------------------------------

### Comandos Docker

docker run -t --rm --init -p 8080:8070 -p 8081:8071 --memory="9g" lfoppiano/grobid:0.7.0

docker run -t --rm --init -p 8080:8070 -p 8081:8071 lfoppiano/grobid:0.6.2

### Definindo variáveis e caminhos

In [4]:
path = os.path.dirname(os.getcwd())
path_input = os.path.join(path,'artifacts','articles','ml_material','teste')

---------------------------------

### Funções para execução em batch

In [5]:
def get_path(path_input_path):
    """"""
    if os.path.exists(path_input_path):
        return path_input_path
    
    return os.getcwd()


def batch_process_path(path_input_path, n_workers=2,
                       check_cache=True,
                       cache_folder_name='summarticles_cache',
                       config_path="./grobid/config.json"):
    
    """"""
    
    gcli = grobid_cli(config_path=config_path)
    result_batch = gcli.process_pdfs(input_path=path_input_path,
                                     check_cache=check_cache,
                                     cache_folder_name=cache_folder_name,
                                     n_workers=n_workers,
                                     service="processFulltextDocument",
                                     generateIDs=True,
                                     include_raw_citations=True,
                                     include_raw_affiliations=True,
                                     consolidate_header=False,
                                     consolidate_citations=False,
                                     tei_coordinates=False,
                                     segment_sentences=True,
                                     verbose=True)
    return result_batch


def get_dataframes(result_batch):
    
    """"""
    
    xml_to_df = xmltei_to_dataframe()
    dict_dfs, dic_errors = xml_to_df.get_dataframe_articles(result_batch)
    
    return dict_dfs, dic_errors


def files_path(path):
    list_dir = os.listdir(path)
    files = []
    for file in list_dir:
        if os.path.isfile(os.path.join(path,file)):
            files.append(os.path.join(path,file))
    return files

In [6]:
def run_batch_process(path_input, n_workers=6, check_cache=True, 
                      cache_folder_name='summarticles_cache', 
                      config_path="./grobid/config.json"):

    dict_exec = {'path':path_input}
    dict_exec['start_datetime'] = datetime.datetime.now()
    
    # path_input = os.path.join(path,'artifacts','test_article')
    config_path = os.path.join(os.getcwd(),'grobid','config.json')
    dict_exec['grobid_config'] = config_path
    
    gcli = grobid_client.GrobidClient(config_path=config_path, check_server=False)
    
    dict_exec['files'] = gcli.get_input_files(path_input)
    dict_exec['num_files'] = len(dict_exec['files'])
    dict_exec['n_workers'] = n_workers
    
    path_input_path = get_path(path_input)
    result_batch = batch_process_path(path_input_path, n_workers=dict_exec['n_workers'], check_cache=check_cache)
    dict_dfs, dic_errors = get_dataframes(result_batch)
    
    gcli.save_xmltei_files(result_batch, input_folder_path, cache_folder_name=cache_folder_name)
    
    dict_exec['end_datetime'] = datetime.datetime.now()
    dict_exec['time_exec_sec'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    dict_exec['time_exec_min'] = (dict_exec['end_datetime']-dict_exec['start_datetime']).seconds
    
    return dict_dfs, dict_exec, dic_errors

In [7]:
input_folder_path = r"""C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\artifacts\articles\ml_material"""

In [8]:
%%time
dict_dfs, dict_exec, dic_errors = run_batch_process(path_input=input_folder_path, 
                                                    n_workers=10, 
                                                    check_cache=True, 
                                                    cache_folder_name='summarticles_cache', 
                                                    config_path="./grobid/config.json")

GROBID server is up and running
587 files to process in current batch
[Input Files] 587
[Cache Files] 587
In the end, we have: 0  new files to process!
And we have : 587  files to back from cache!
Processed articles: 581
Number articles with errors: 6
Wall time: 22.6 s


---------------------------------

In [27]:
def getColumnsWithData(df, return_percent=False, n_round=2):
    
    """"""
    
    list_col_with_data = []
    for col in df.columns.tolist():
        rows = df[col].shape
        n_null = df[col].isnull().sum()
        not_null_data_perc = (1-n_null/rows)
        if not_null_data_perc:
            if return_percent:
                list_col_with_data.append((col,np.round(not_null_data_perc, n_round)))
            list_col_with_data.append(col)
            
    return list_col_with_data     

In [9]:
dict_dfs.keys()

dict_keys(['df_doc_info', 'df_doc_head', 'df_doc_authors', 'df_doc_citations', 'df_doc_authors_citations'])

### df_doc_info

In [10]:
100*(1-dict_dfs['df_doc_info'].isnull().sum()/dict_dfs['df_doc_info'].shape[0])

grobid_version      100.000000
grobid_timestamp    100.000000
pdf_md5             100.000000
language_code       100.000000
acknowledgement      78.313253
abstract             97.074010
body                100.000000
annex                 0.000000
file                100.000000
status              100.000000
raw_data            100.000000
dtype: float64

In [29]:
df_doc_info = dict_dfs['df_doc_info'].loc[:,getColumnsWithData(dict_dfs['df_doc_info'])]

In [46]:
df_doc_info.head()

Unnamed: 0_level_0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,file,status,raw_data
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.7.0,2022-05-15 02:11:00,BDF9A1234112D0698F9E8851205A26D5,en,ACKNOWLEDGEMENTS This project has received fun...,,Even minute amounts of one solute atom per one...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
2,0.7.0,2022-05-15 02:11:00,2A43443002C7CEB96ABFA9F12E08F3E2,en,,Atomic column localization and segmentation in...,TEMImageNet training library and AtomSegNet de...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
3,0.7.0,2022-05-15 02:08:00,8AB5E185D2AF66AAAEE64646C3E11BCB,en,ACKNOWLEDGMENTS The authors would like to than...,The applications of machine learning technique...,I. INTRODUCTION Machine learning (ML) is used ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
4,0.7.0,2022-05-15 02:10:00,D6AC410DA39D39D9ADFC4E538B4A4D79,en,Acknowledgments The authors acknowledge Yuxing...,The global demand for data storage and process...,In response to the increasing demand for data ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."
5,0.7.0,2022-05-15 02:09:00,D49CF526DC981A67FBE621C9C01DF0A5,en,Acknowledgment This study was funded by the De...,Deep neural networks are machine learning tool...,Introduction Machine learning is a branch of a...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x..."


### df_doc_head

In [31]:
100*(1-dict_dfs['df_doc_head'].isnull().sum()/dict_dfs['df_doc_head'].shape[0])

index_head              0.000000
id_head                 0.000000
unstructured_head       0.000000
date_head              46.299484
title_head             98.278830
book_title_head         0.000000
series_title_head       0.000000
journal_head            0.000000
journal_abbrev_head     0.000000
publisher_head          0.000000
institution_head        0.000000
issn_head               0.688468
eissn_head              0.000000
volume_head             0.000000
issue_head              0.000000
pages_head              0.000000
first_page_head         0.000000
last_page_head          0.000000
note_head               0.000000
doi_head               97.418244
pmid_head               0.000000
pmcid_head              0.000000
arxiv_id_head           0.172117
ark_head                0.000000
istex_id_head           0.000000
url_head                0.000000
dtype: float64

In [32]:
dict_dfs['df_doc_head'].dtypes

index_head             float64
id_head                float64
unstructured_head      float64
date_head               object
title_head              object
book_title_head        float64
series_title_head      float64
journal_head           float64
journal_abbrev_head    float64
publisher_head         float64
institution_head       float64
issn_head               object
eissn_head             float64
volume_head            float64
issue_head             float64
pages_head             float64
first_page_head        float64
last_page_head         float64
note_head              float64
doi_head                object
pmid_head              float64
pmcid_head             float64
arxiv_id_head           object
ark_head               float64
istex_id_head          float64
url_head               float64
dtype: object

In [33]:
df_doc_head = dict_dfs['df_doc_head'].loc[:,getColumnsWithData(dict_dfs['df_doc_head'])]

In [34]:
df_doc_head

Unnamed: 0_level_0,date_head,title_head,issn_head,doi_head,arxiv_id_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,A machine learning approach to model solute gr...,,10.1038/s41524-018-0122-7,
2,,,,10.1038/s41598-021-84499-w,
3,2019-04-17,Atom-density representations for machine learning,,10.1063/1.5090481,
4,,,,10.1016/j.mattod.2020.07.016,
5,2020-04-22,Computational Materials Science,,10.1016/j.commatsci.2020.109687,
...,...,...,...,...,...
583,,Computational Materials Science,,10.1016/j.commatsci.2019.109099,
584,2021-02-28,SEM-Net: Deep features selections with Binary ...,,10.1016/j.mtcomm.2021.102198,
585,,Processing Optimization and Property Predictio...,,10.1002/adts.201900197,
586,,Computational Materials Science,,10.1016/j.commatsci.2019.01.044,


In [49]:
df_doc_info_head = df_doc_info.join(df_doc_head, how='left')

In [50]:
df_doc_info_head.head(5)

Unnamed: 0_level_0,grobid_version,grobid_timestamp,pdf_md5,language_code,acknowledgement,abstract,body,file,status,raw_data,date_head,title_head,issn_head,doi_head,arxiv_id_head
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.7.0,2022-05-15 02:11:00,BDF9A1234112D0698F9E8851205A26D5,en,ACKNOWLEDGEMENTS This project has received fun...,,Even minute amounts of one solute atom per one...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",,A machine learning approach to model solute gr...,,10.1038/s41524-018-0122-7,
2,0.7.0,2022-05-15 02:11:00,2A43443002C7CEB96ABFA9F12E08F3E2,en,,Atomic column localization and segmentation in...,TEMImageNet training library and AtomSegNet de...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",,,,10.1038/s41598-021-84499-w,
3,0.7.0,2022-05-15 02:08:00,8AB5E185D2AF66AAAEE64646C3E11BCB,en,ACKNOWLEDGMENTS The authors would like to than...,The applications of machine learning technique...,I. INTRODUCTION Machine learning (ML) is used ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2019-04-17,Atom-density representations for machine learning,,10.1063/1.5090481,
4,0.7.0,2022-05-15 02:10:00,D6AC410DA39D39D9ADFC4E538B4A4D79,en,Acknowledgments The authors acknowledge Yuxing...,The global demand for data storage and process...,In response to the increasing demand for data ...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",,,,10.1016/j.mattod.2020.07.016,
5,0.7.0,2022-05-15 02:09:00,D49CF526DC981A67FBE621C9C01DF0A5,en,Acknowledgment This study was funded by the De...,Deep neural networks are machine learning tool...,Introduction Machine learning is a branch of a...,C:\Users\vierb\OneDrive\Área de Trabalho\Proje...,status 200,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<TEI x...",2020-04-22,Computational Materials Science,,10.1016/j.commatsci.2020.109687,


In [52]:
df_doc_info_head.date_head = df_doc_info_head.date_head.apply(lambda e: pd.to_datetime(e))

In [59]:
df_doc_info_head['year'] = df_doc_info_head.date_head.apply(lambda e: e if pd.isna(e) else int(e.year))

In [53]:
df_doc_info_head.date_head.value_counts(dropna=False)

NaT           312
2021-08-02      4
2021-03-10      3
2021-04-23      3
2021-03-02      3
             ... 
2012-01-18      1
2012-04-21      1
2013-08-24      1
2020-07-17      1
2021-02-28      1
Name: date_head, Length: 240, dtype: int64

In [65]:
df_doc_info_head.year.value_counts(dropna=False).to_dict()

{NaT: 312,
 2021: 109,
 2020: 45,
 2019: 25,
 2011: 13,
 2012: 12,
 2008: 11,
 2010: 10,
 2009: 9,
 2015: 8,
 2018: 7,
 2013: 5,
 2017: 5,
 2016: 4,
 2014: 2,
 2007: 1,
 2000: 1,
 2005: 1,
 2006: 1}

In [64]:
pd.DataFrame(df_doc_info_head.year.value_counts(dropna=False).to_dict(), columns=['year','count'])

Unnamed: 0,year,count


In [87]:
import plotly.express as px

df_doc_info_head.year = df_doc_info_head.year.fillna('Null Value')

fig = px.pie(df_doc_info_head, 
             values='year', 
             names='year',
             title='Number of Articles by Year',
             hover_data=['year'], 
             labels={'values':'Percentage','year':'Year of Article'}, hole=.5)

fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

### df_doc_authors

In [17]:
100*(1-dict_dfs['df_doc_authors'].isnull().sum()/dict_dfs['df_doc_authors'].shape[0])

full_name_author      99.701715
given_name_author     98.322148
middle_name_author    24.086503
surname_author        99.701715
email_author          21.439224
orcid_author           3.094705
institution_author    82.662192
department_author     72.073080
laboratory_author     13.534676
addr_line_author      20.208799
post_code_author      66.853095
settlement_author     78.225205
country_author        85.160328
dtype: float64

In [92]:
df_doc_authors = dict_dfs['df_doc_authors'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors'])]

In [93]:
df_doc_authors.head()

Unnamed: 0_level_0,full_name_author,given_name_author,middle_name_author,surname_author,email_author,orcid_author,institution_author,department_author,laboratory_author,addr_line_author,post_code_author,settlement_author,country_author
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Liam Huber,Liam,,Huber,huber@mpie.de,,Max-Planck-Institut für Eisenforschung GmbH,,,,D-40237,Düsseldorf,Germany
1,Raheleh Hadian,Raheleh,,Hadian,,,Max-Planck-Institut für Eisenforschung GmbH,,,,D-40237,Düsseldorf,Germany
1,Blazej Grabowski,Blazej,,Grabowski,,,Max-Planck-Institut für Eisenforschung GmbH,,,,D-40237,Düsseldorf,Germany
1,Jörg Neugebauer,Jörg,,Neugebauer,,,Max-Planck-Institut für Eisenforschung GmbH,,,,D-40237,Düsseldorf,Germany
2,,,,,,,,,,,,,


In [163]:
columns_select = ['country_author','settlement_author']
df_sun_agg = df_doc_authors.groupby(by=columns_select, as_index=False, dropna=True)['full_name_author'].count()

In [164]:
df_sun_agg = df_sun_agg.fillna("")
df_sun_agg.rename(columns={'country_author':'Author Country',
                           'settlement_author':'Author Settlement',
                           'full_name_author':'Number of Authors'},
                  inplace=True)

In [165]:
df_sun_agg.head()

Unnamed: 0,Author Country,Author Settlement,Number of Authors
0,Algeria,Tlemcen,2
1,Argentina,Bahía Blanca,4
2,Argentina,Buenos Aires,5
3,Argentina,La Plata,1
4,Argentina,Salta,1


In [166]:
import plotly.express as px
fig = px.sunburst(df_sun_agg, 
                  path=['Author Country',
                        'Author Settlement'],
                  values='Number of Authors')
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [90]:
df_doc_authors.full_name_author.value_counts(dropna=False)

A R T I C L E I N F O    10
Acta Materialia           8
NaN                       8
Yu Sun                    6
Weidong Zeng              6
                         ..
Zhong Xin Yu              1
Hieu-Chi Dam              1
Van-Doan Nguyen           1
Tim Mueller               1
Devinder Kumar            1
Name: full_name_author, Length: 2324, dtype: int64

In [98]:
list_delete_authors = ['A R T I C L E I N F O', np.nan, 'Null', 'NaN','nan', 'null', '', ' ']
filter_delete_authors = ~(df_doc_authors.full_name_author.isin(list_delete_authors))
df_doc_authors = df_doc_authors.loc[filter_delete_authors].copy()

In [103]:
top_authors = df_doc_authors.full_name_author.value_counts()
df_top_authors = pd.DataFrame({'Full Name':top_authors.index,
                               'Number of Articles':top_authors.values.tolist()})

df_top_authors

Unnamed: 0,Full Name,Number of Articles
0,Acta Materialia,8
1,Dane Morgan,6
2,Yu Sun,6
3,N S Reddy,6
4,Weidong Zeng,6
...,...,...
2317,Zhong Xin Yu,1
2318,Hieu-Chi Dam,1
2319,Van-Doan Nguyen,1
2320,Tim Mueller,1


In [128]:
import plotly.express as px

top_authors = df_top_authors.nlargest(20,'Number of Articles')
top_authors = top_authors.sort_values('Number of Articles',ascending=True)
fig = px.bar(top_authors,
             y='Full Name',
             x='Number of Articles',
             color='Number of Articles',
             width=600,
             height=700,text='Number of Articles')
fig.update(layout_coloraxis_showscale=False)
# fig.update_traces(showlegend=False)
# fig.update_traces(marker_showscale=False)
fig.update_xaxes(visible=False)
fig.update_layout(yaxis_title=None, xaxis_title=None)
fig.show()

### df_doc_authors_citations

In [41]:
100*(1-dict_dfs['df_doc_authors_citations'].isnull().sum()/dict_dfs['df_doc_authors_citations'].shape[0])

id                      100.000000
index                   100.000000
full_name_citation       99.371668
given_name_citation      98.339759
middle_name_citation     30.893282
surname_citation         98.659667
email_citation            0.000000
orcid_citation            0.000000
institution_citation      0.000000
department_citation       0.000000
laboratory_citation       0.000000
addr_line_citation        0.000000
post_code_citation        0.000000
settlement_citation       0.000000
country_citation          0.000000
dtype: float64

In [42]:
df_doc_authors_citations = dict_dfs['df_doc_authors_citations'].loc[:,getColumnsWithData(dict_dfs['df_doc_authors_citations'])]

In [43]:
df_doc_authors_citations

Unnamed: 0_level_0,id,index,full_name_citation,given_name_citation,middle_name_citation,surname_citation
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,b0,0,E Hall,E,,Hall
1,b1,1,R Kirchheim,R,,Kirchheim
1,b2,2,C Koch,C,,Koch
1,b2,2,R Scattergood,R,,Scattergood
1,b2,2,K Darling,K,,Darling
...,...,...,...,...,...,...
587,b65,65,I Sutskever,I,,Sutskever
587,b65,65,R Salakhutdinov,R,,Salakhutdinov
587,b66,66,D E Rumelhart,D,E,Rumelhart
587,b66,66,G E Hinton,G,E,Hinton


In [44]:
df_doc_authors_citations.full_name_citation.value_counts(dropna=False)

NaN            766
J Behler       374
Y Zhang        339
R Ramprasad    318
G Ceder        277
              ... 
A Cadeddu        1
M Randić         1
D Meissner       1
D Wöhrle         1
F R N Maia       1
Name: full_name_citation, Length: 46790, dtype: int64

In [169]:
path_geo = os.path.join(path,'data','external')

In [206]:
shapes_geometry = pd.read_csv(os.path.join(path_geo,'shapes_geometry.csv'), sep=';', decimal='.')
shapes_correct = pd.read_csv(os.path.join(path_geo,'shapes_correct.csv'), encoding='latin-1',sep=';', decimal='.')

In [207]:
df_country_agg = df_doc_authors.groupby(by=['country_author'], as_index=False, dropna=True)['full_name_author'].count()

dictCorrectShapes = {e[0]:e[1] for e in zip(shapes_correct.convert,shapes_correct.name)}

df_country_agg.country_author = df_country_agg.country_author.apply(lambda e: dictCorrectShapes.get(e,e))

In [208]:
df_country_agg.head()

Unnamed: 0,country_author,full_name_author
0,Algeria,6
1,Argentina,14
2,Australia,47
3,Austria,14
4,Bangladesh,3


In [176]:
shapes_geometry.head()

Unnamed: 0,continent,name,iso_a3,geometry
0,Oceania,Fiji,FJI,"MULTIPOLYGON (((180 -16.067132663642447, 180 -..."
1,Africa,Tanzania,TZA,POLYGON ((33.90371119710453 -0.950000000000000...
2,Africa,W. Sahara,ESH,POLYGON ((-8.665589565454809 27.65642588959235...
3,North America,Canada,CAN,MULTIPOLYGON (((-122.84000000000003 49.0000000...
4,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...


In [246]:
df_geo_plot = shapes_geometry.merge(df_country_agg, left_on='name', right_on='country_author')

In [247]:
df_geo_plot.head()

Unnamed: 0,continent,name,iso_a3,geometry,country_author,full_name_author
0,North America,Canada,CAN,MULTIPOLYGON (((-122.84000000000003 49.0000000...,Canada,30
1,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,489
2,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,159
3,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,18
4,South America,Argentina,ARG,MULTIPOLYGON (((-68.63401022758323 -52.6363704...,Argentina,14


In [248]:
df_geo_plot.full_name_author = df_geo_plot.full_name_author.fillna(0)

In [249]:
df_geo_plot = df_geo_plot.reset_index().set_index('index')
df_geo_plot.geometry.to_json()


Unnamed: 0_level_0,continent,name,iso_a3,geometry,country_author,full_name_author
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,North America,Canada,CAN,MULTIPOLYGON (((-122.84000000000003 49.0000000...,Canada,30
1,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,489
2,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,159
3,North America,United States of America,USA,MULTIPOLYGON (((-122.84000000000003 49.0000000...,United States of America,18
4,South America,Argentina,ARG,MULTIPOLYGON (((-68.63401022758323 -52.6363704...,Argentina,14
...,...,...,...,...,...,...
62,Europe,Slovenia,SVN,POLYGON ((13.806475457421527 46.50930613869121...,Slovenia,3
63,Europe,Finland,FIN,POLYGON ((28.591929559043194 69.06477692328666...,Finland,15
64,Asia,Japan,JPN,MULTIPOLYGON (((141.88460086483497 39.18086456...,Japan,68
65,Asia,Saudi Arabia,SAU,POLYGON ((34.95603722508426 29.356554673778845...,Saudi Arabia,3


In [224]:
df_geo_plot.rename(columns={'full_name_author':'Number of Authors','name':'Country'}, inplace=True)

In [239]:
df_geo_plot.geometry.iat[0]

'MULTIPOLYGON (((-122.84000000000003 49.000000000000114, -122.97421000000001 49.00253777777778, -124.91024 49.98456, -125.62461 50.416560000000004, -127.43561000000001 50.83061, -127.99276 51.71583, -127.85032 52.32961, -129.12979 52.75538, -129.30523 53.561589999999995, -130.51497 54.28757, -130.53610895273684 54.80275447679924, -130.53611 54.802780000000006, -129.98 55.285000000000004, -130.00778000000003 55.915830000000085, -131.70781 56.55212, -132.73042 57.692890000000006, -133.35556000000003 58.41028000000001, -134.27111000000002 58.86111000000005, -134.94500000000005 59.2705600000001, -135.47583 59.787780000000005, -136.47972000000004 59.46389000000005, -137.4525 58.905, -138.34089 59.562110000000004, -139.03900000000002 60, -140.013 60.27682000000001, -140.99778 60.30639000000001, -140.9925 66.00003000000001, -140.986 69.712, -140.98598761037601 69.71199839952635, -139.12052 69.47102, -137.54636000000002 68.99002, -136.50358 68.89804, -135.62576 69.31512000000001, -134.41464000

In [255]:
import folium

my_map = folium.Map()
# Add the data
folium.Choropleth(
    geo_data=pd.read_json(os.path.join(path_geo,'geometry.json')).to_json(),
    name='choropleth',
    data=df_geo_plot,
    columns=['name'],
    key_on='feature.properties.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Authors by Location'
).add_to(my_map)
my_map.save('meat.html')

IndexError: list index out of range

In [256]:
!pip install geopandas

Collecting geopandas
  Using cached geopandas-0.11.1-py3-none-any.whl (1.0 MB)
Collecting shapely<2,>=1.7
  Using cached Shapely-1.8.2-cp38-cp38-win_amd64.whl (1.3 MB)
Collecting fiona>=1.8
  Using cached Fiona-1.8.21.tar.gz (1.0 MB)


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\vierb\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-653l1w9b\\fiona\\setup.py'"'"'; __file__='"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-653l1w9b\\fiona\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\vierb\AppData\Local\Temp\pip-pip-egg-info-rbzdyvus'
         cwd: C:\Users\vierb\AppData\Local\Temp\pip-install-653l1w9b\fiona\
    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command out

https://stackoverflow.com/questions/43587960/gdal-installation-error-using-pip

In [262]:
!pip install fiona

Collecting fiona
  Using cached Fiona-1.8.21.tar.gz (1.0 MB)


    ERROR: Command errored out with exit status 1:
     command: 'C:\Users\vierb\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-c8m3jwnd\\fiona\\setup.py'"'"'; __file__='"'"'C:\\Users\\vierb\\AppData\\Local\\Temp\\pip-install-c8m3jwnd\\fiona\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\vierb\AppData\Local\Temp\pip-pip-egg-info-weqxohpt'
         cwd: C:\Users\vierb\AppData\Local\Temp\pip-install-c8m3jwnd\fiona\
    Complete output (1 lines):
    A GDAL API version must be specified. Provide a path to gdal-config using a GDAL_CONFIG environment variable or use a GDAL_VERSION environment variable.
    ----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command out

In [None]:
pip install wheel
pip install pipwin

pipwin install numpy
pipwin install pandas
pipwin install shapely
pipwin install gdal
pipwin install fiona
pipwin install pyproj
pipwin install six
pipwin install rtree
pipwin install geopandas

In [2]:
pip install r"C:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\GDAL-3.4.3-pp38-pypy38_pp73-win_amd64.whl"

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an EnvironmentError: Bad path: c:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\rC:\Users\vierb\OneDrive\Área de Trabalho\Projetos\PGC\notebooks\GDAL-3.4.3-pp38-pypy38_pp73-win_amd64.whl



In [3]:
!conda install -c conda-forge gdal