In [1]:
import pandas as pd
import plotly.express as px
import IPython.display as ipd
import spacy # to extract country


In [2]:
# Load the three CSV inputs used by this notebook.
# Paths are relative, so the files are looked up in the current working directory.

article_data = pd.read_csv(filepath_or_buffer="articles.leptospirosis.csv")
author_data = pd.read_csv(filepath_or_buffer="authors.leptospirosis.csv")
paper_counts = pd.read_csv(filepath_or_buffer="paper_counts.csv")

In [3]:
# Build a combined "<initials> <forename> <lastname>" column named FullName
# on both article_data (first author only) and author_data (every author).
# A missing value in any of the three parts makes the whole FullName missing.

article_data['FullName'] = (
    article_data['FirstAuthorInitials']
    .add(' ')
    .add(article_data['FirstAuthorForename'])
    .add(' ')
    .add(article_data['FirstAuthorLastname'])
)

author_data['FullName'] = (
    author_data['AuthorInitials']
    .add(' ')
    .add(author_data['AuthorForename'])
    .add(' ')
    .add(author_data['AuthorLastname'])
)



### Top contributors list

In [4]:
# Per-author tally built from author_data: the PMID column of `count` holds
# the number of non-null PMID entries for each distinct author.

count = (
    author_data
    .groupby(['FullName', 'AuthorInitials', 'AuthorForename', 'AuthorLastname'])['PMID']
    .count()
    .reset_index()
)

# Most prolific authors first.
top_contributors = count.sort_values(by='PMID', ascending=False)
top_contributors


Unnamed: 0,FullName,AuthorInitials,AuthorForename,AuthorLastname,PMID
1017,AI Albert I Ko,AI,Albert I,Ko,72
8051,M Mathieu Picardeau,M,Mathieu,Picardeau,70
13475,W Walter Lilenbaum,W,Walter,Lilenbaum,56
9420,N Nobuo Koizumi,N,Nobuo,Koizumi,53
8671,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51
...,...,...,...,...,...
5233,IA Ian A Gardner,IA,Ian A,Gardner,1
5234,IA Igor A H F Schabib Péres,IA,Igor A H F,Schabib Péres,1
5235,IA In-Ae Chang,IA,In-Ae,Chang,1
5236,IA Iwan A Burgener,IA,Iwan A,Burgener,1


In [5]:
# Keep only the author rows whose AuthorAffiliation is present.
# Work from a copy so author_data itself is left as loaded.

author_data_copy = author_data.copy()
author_data_all_affl = author_data_copy.dropna(subset=['AuthorAffiliation'])
author_data_all_affl


Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,FullName
0,10548299,1,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de...",E E Daher
4,10569777,1,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa...",DA D A Haake
11,10585813,1,P C,Marotto,PC,"Intensive Care Unit, Instituto de Infectologia...",PC P C Marotto
18,10586903,1,P,Cumberland,P,"Infectious Disease Epidemiology Unit, London S...",P P Cumberland
21,10596270,1,A,Steger-Lieb,A,"Klinik für kleine Haustiere, Universität Bern.",A A Steger-Lieb
...,...,...,...,...,...,...,...
21326,38081475,7,Catherine,Werts,C,"Institut Pasteur, Université Paris Cité, CNRS ...",C Catherine Werts
21327,38087323,1,Noraini,Philip,N,"School of Biological Sciences, Universiti Sain...",N Noraini Philip
21328,38087323,2,Kamruddin,Ahmed,K,"Department of Pathology and Microbiology, Facu...",K Kamruddin Ahmed
21329,38094659,1,Guan-Sheng,Li,GS,"Department of Critical Medicine, Daomenkou, Ch...",GS Guan-Sheng Li


In [6]:
# The 50 authors with the highest counts, each joined to all of their rows
# that carry an affiliation. It is a left join on FullName, so columns present
# in both frames come back with _x (from top_50) and _y (from author rows) suffixes.

top_50 = top_contributors.iloc[:50]
top_50_merged = pd.merge(top_50, author_data_all_affl, how='left', on='FullName')
top_50_merged

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,PMID_y,AuthorN,AuthorForename_y,AuthorLastname_y,AuthorInitials_y,AuthorAffiliation
0,AI Albert I Ko,AI,Albert I,Ko,72,19679685,7,Albert I,Ko,AI,"Division of Infectious Diseases, Weill Medical..."
1,AI Albert I Ko,AI,Albert I,Ko,72,19756012,1,Albert I,Ko,AI,"Division of Infectious Disease, Weill Medical ..."
2,AI Albert I Ko,AI,Albert I,Ko,72,24743322,4,Albert I,Ko,AI,"Yale School of Public Health, New Haven, Conne..."
3,AI Albert I Ko,AI,Albert I,Ko,72,24875389,17,Albert I,Ko,AI,"Centro de Pesquisas Gonçalo Moniz, Fundação Os..."
4,AI Albert I Ko,AI,Albert I,Ko,72,25058149,7,Albert I,Ko,AI,Yale University Schools of Public Health and M...
...,...,...,...,...,...,...,...,...,...,...,...
802,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,20558583,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
803,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,24928214,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
804,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25627443,4,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
805,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25834144,2,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."


In [7]:
# Helper that extracts a place name from a free-text affiliation sentence.

nlp = spacy.load("en_core_web_sm")

def identify_countries(sentence):
    """Return the last geopolitical entity (GPE) spaCy detects in ``sentence``.

    Parameters
    ----------
    sentence : object
        Usually an affiliation string. Non-string values are converted with
        ``str()`` before being analysed. Missing values (``None``, NaN,
        ``pd.NA``) and blank strings are not analysed at all.

    Returns
    -------
    str
        Text of the last entity labelled ``"GPE"``, or ``''`` when the input
        is missing/blank or no such entity is found.

    Notes
    -----
    spaCy's GPE label is not restricted to countries, so the returned text
    may be some other kind of place name.
    """
    # Missing affiliations would otherwise be stringified to 'nan' / 'None'
    # and pushed through the NER model for nothing.
    if sentence is None or sentence is pd.NA:
        return ''
    if isinstance(sentence, float) and sentence != sentence:  # NaN check
        return ''

    text = str(sentence)
    if not text.strip():
        return ''

    doc = nlp(text)

    # Keep only entities recognized as GPE (Geopolitical Entity).
    gpe_mentions = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

    # The last mention wins; an empty string signals "nothing found".
    return gpe_mentions[-1] if gpe_mentions else ''

In [8]:
# Add a 'Country' column holding identify_countries() applied to each
# row's AuthorAffiliation.
#
# A single column-wise apply replaces the former row-by-row `.loc[i, ...]`
# loop: it does not depend on the frame having a default 0..n-1 index, and
# it creates the 'Country' column even when the frame has no rows.

top_50_merged_copy = top_50_merged.copy()
top_50_merged_copy['Country'] = (
    top_50_merged_copy['AuthorAffiliation'].apply(identify_countries)
)

# Display the updated DataFrame
top_50_merged_copy

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,PMID_y,AuthorN,AuthorForename_y,AuthorLastname_y,AuthorInitials_y,AuthorAffiliation,Country
0,AI Albert I Ko,AI,Albert I,Ko,72,19679685,7,Albert I,Ko,AI,"Division of Infectious Diseases, Weill Medical...",USA
1,AI Albert I Ko,AI,Albert I,Ko,72,19756012,1,Albert I,Ko,AI,"Division of Infectious Disease, Weill Medical ...",USA
2,AI Albert I Ko,AI,Albert I,Ko,72,24743322,4,Albert I,Ko,AI,"Yale School of Public Health, New Haven, Conne...",Brazil
3,AI Albert I Ko,AI,Albert I,Ko,72,24875389,17,Albert I,Ko,AI,"Centro de Pesquisas Gonçalo Moniz, Fundação Os...",United States of America
4,AI Albert I Ko,AI,Albert I,Ko,72,25058149,7,Albert I,Ko,AI,Yale University Schools of Public Health and M...,United States of America
...,...,...,...,...,...,...,...,...,...,...,...,...
802,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,20558583,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa...",Brazil
803,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,24928214,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa...",Brazil
804,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25627443,4,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa...",Brazil
805,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25834144,2,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa...",Brazil


In [9]:
# One summary row per author: keep the author-level columns plus Country and,
# for each FullName, retain only the first row encountered (its original
# index label is preserved).
contributions_country_copy = top_50_merged_copy.copy()

contributions_country = (
    contributions_country_copy
    .loc[:, ['FullName', 'AuthorInitials_x', 'AuthorForename_x', 'AuthorLastname_x', 'PMID_x', 'Country']]
    .drop_duplicates(subset=['FullName'], keep='first')
)

contributions_country

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,Country
0,AI Albert I Ko,AI,Albert I,Ko,72,USA
38,M Mathieu Picardeau,M,Mathieu,Picardeau,70,France
85,W Walter Lilenbaum,W,Walter,Lilenbaum,56,Brazil
132,N Nobuo Koizumi,N,Nobuo,Koizumi,53,Japan
169,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51,Brazil
195,F Federico Costa,F,Federico,Costa,38,Brazil
230,C Cyrille Goarant,C,Cyrille,Goarant,37,New Caledonia
257,RA Rudy A Hartskeerl,RA,Rudy A,Hartskeerl,37,The Netherlands
271,JM Joseph M Vinetz,JM,Joseph M,Vinetz,37,Peru
284,J Jie Yan,J,Jie,Yan,35,People's Republic of China


In [10]:
# Manual country corrections for authors whose country was filled in by hand.
#
# Rows are matched on FullName instead of hard-coded index labels: assigning
# through `.loc[<label>, 'Country']` with a label that is not in the index
# silently appends a new, mostly-empty row, so numeric labels break as soon
# as the upstream data (and therefore the merge order) changes.
# NOTE(review): the names below are taken from the comments of the original
# cell; confirm that they match the FullName values exactly.
manual_country_fixes = {
    'EC Eliete C Romero': 'Brazil',
    'MB Marcos Bryan Heinemann': 'Brazil',
}

for full_name, country in manual_country_fixes.items():
    contributions_country.loc[
        contributions_country['FullName'] == full_name, 'Country'
    ] = country

contributions_country

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,Country
0,AI Albert I Ko,AI,Albert I,Ko,72,USA
38,M Mathieu Picardeau,M,Mathieu,Picardeau,70,France
85,W Walter Lilenbaum,W,Walter,Lilenbaum,56,Brazil
132,N Nobuo Koizumi,N,Nobuo,Koizumi,53,Japan
169,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51,Brazil
195,F Federico Costa,F,Federico,Costa,38,Brazil
230,C Cyrille Goarant,C,Cyrille,Goarant,37,New Caledonia
257,RA Rudy A Hartskeerl,RA,Rudy A,Hartskeerl,37,The Netherlands
271,JM Joseph M Vinetz,JM,Joseph M,Vinetz,37,Peru
284,J Jie Yan,J,Jie,Yan,35,People's Republic of China


In [16]:
# Bar chart of the first 30 rows of contributions_country:
# one bar per author forename, bar height = PMID_x, coloured by Country.

plot = (
    px.bar(
        contributions_country.iloc[:30],
        x='AuthorForename_x',
        y='PMID_x',
        color='Country',
        title='Top Contributers from different Countries',
    )
    .update_xaxes(title_text="Author's Forename")
    .update_yaxes(title_text='Article Count')
)
plot.show()

In [18]:
# Renamed copy of the per-author summary with the merge suffix `_x` removed
# from the author name columns. The original cell could not run: `.copy`
# was missing its call parentheses, the rename mapping ended in a dangling
# '' entry (a syntax error), and the rename result was thrown away.
# NOTE(review): PMID_x is left unrenamed because the intended target name
# is not evident from the notebook.
top_contributors_country_copy = contributions_country.copy().rename(
    columns={
        'AuthorInitials_x': 'AuthorInitials',
        'AuthorForename_x': 'AuthorForename',
        'AuthorLastname_x': 'AuthorLastname',
    }
)

# The displayed frame is still the un-renamed summary, as in the original cell.
ipd.display(contributions_country)

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,Country
0,AI Albert I Ko,AI,Albert I,Ko,72,USA
38,M Mathieu Picardeau,M,Mathieu,Picardeau,70,France
85,W Walter Lilenbaum,W,Walter,Lilenbaum,56,Brazil
132,N Nobuo Koizumi,N,Nobuo,Koizumi,53,Japan
169,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51,Brazil
195,F Federico Costa,F,Federico,Costa,38,Brazil
230,C Cyrille Goarant,C,Cyrille,Goarant,37,New Caledonia
257,RA Rudy A Hartskeerl,RA,Rudy A,Hartskeerl,37,The Netherlands
271,JM Joseph M Vinetz,JM,Joseph M,Vinetz,37,Peru
284,J Jie Yan,J,Jie,Yan,35,People's Republic of China
