In [1]:
import pandas as pd
import plotly.express as px
import IPython.display as ipd
import spacy # to extract country


In [2]:
# Load the three input tables. Paths are relative to the notebook's working
# directory, so the CSV files must sit next to the notebook.

article_data, author_data, paper_counts = (
    pd.read_csv(csv_path)
    for csv_path in (
        "articles.leptospirosis.csv",
        "authors.leptospirosis.csv",
        "paper_counts.csv",
    )
)

In [3]:
# Build a combined 'FullName' column ("<initials> <forename> <lastname>") on
# both article_data (first author) and author_data (every author).

def _join_name_parts(frame, initials_col, forename_col, lastname_col):
    """Return the three name columns of `frame` joined with single spaces.

    The parts are concatenated with `+`, so a missing value in any part makes
    the combined name missing as well.
    """
    return frame[initials_col] + ' ' + frame[forename_col] + ' ' + frame[lastname_col]


article_data['FullName'] = _join_name_parts(
    article_data, 'FirstAuthorInitials', 'FirstAuthorForename', 'FirstAuthorLastname'
)

author_data['FullName'] = _join_name_parts(
    author_data, 'AuthorInitials', 'AuthorForename', 'AuthorLastname'
)



### Top contributors list

In [4]:
# Count author_data rows (non-null PMIDs) per distinct author. An author is
# identified by the full name plus its three components; groupby drops rows
# where any of these keys is missing.

count = (
    author_data
    .groupby(['FullName', 'AuthorInitials', 'AuthorForename', 'AuthorLastname'])['PMID']
    .count()
    .reset_index()
)

# Most prolific authors first; after the count, 'PMID' holds the number of papers.
top_contributors = count.sort_values(by='PMID', ascending=False)
top_contributors


Unnamed: 0,FullName,AuthorInitials,AuthorForename,AuthorLastname,PMID
1017,AI Albert I Ko,AI,Albert I,Ko,72
8051,M Mathieu Picardeau,M,Mathieu,Picardeau,70
13475,W Walter Lilenbaum,W,Walter,Lilenbaum,56
9420,N Nobuo Koizumi,N,Nobuo,Koizumi,53
8671,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51
...,...,...,...,...,...
5233,IA Ian A Gardner,IA,Ian A,Gardner,1
5234,IA Igor A H F Schabib Péres,IA,Igor A H F,Schabib Péres,1
5235,IA In-Ae Chang,IA,In-Ae,Chang,1
5236,IA Iwan A Burgener,IA,Iwan A,Burgener,1


In [5]:
# Keep only the author rows that carry an affiliation string.

author_data_copy = author_data.copy()
author_data_all_affl = author_data_copy.loc[author_data_copy['AuthorAffiliation'].notna()]
author_data_all_affl


Unnamed: 0,PMID,AuthorN,AuthorForename,AuthorLastname,AuthorInitials,AuthorAffiliation,FullName
0,10548299,1,E,Daher,E,"Departamento de Medicina Clínica, Faculdade de...",E E Daher
4,10569777,1,D A,Haake,DA,"Division of Infectious Diseases, Veterans Affa...",DA D A Haake
11,10585813,1,P C,Marotto,PC,"Intensive Care Unit, Instituto de Infectologia...",PC P C Marotto
18,10586903,1,P,Cumberland,P,"Infectious Disease Epidemiology Unit, London S...",P P Cumberland
21,10596270,1,A,Steger-Lieb,A,"Klinik für kleine Haustiere, Universität Bern.",A A Steger-Lieb
...,...,...,...,...,...,...,...
21326,38081475,7,Catherine,Werts,C,"Institut Pasteur, Université Paris Cité, CNRS ...",C Catherine Werts
21327,38087323,1,Noraini,Philip,N,"School of Biological Sciences, Universiti Sain...",N Noraini Philip
21328,38087323,2,Kamruddin,Ahmed,K,"Department of Pathology and Microbiology, Facu...",K Kamruddin Ahmed
21329,38094659,1,Guan-Sheng,Li,GS,"Department of Critical Medicine, Daomenkou, Ch...",GS Guan-Sheng Li


In [6]:
# Take the 50 most prolific authors and attach every affiliation row recorded
# for them. The left join keeps an author even when no affiliation row matches.

top_50 = top_contributors.iloc[:50]
top_50_merged = pd.merge(top_50, author_data_all_affl, how='left', on='FullName')
top_50_merged

Unnamed: 0,FullName,AuthorInitials_x,AuthorForename_x,AuthorLastname_x,PMID_x,PMID_y,AuthorN,AuthorForename_y,AuthorLastname_y,AuthorInitials_y,AuthorAffiliation
0,AI Albert I Ko,AI,Albert I,Ko,72,19679685,7,Albert I,Ko,AI,"Division of Infectious Diseases, Weill Medical..."
1,AI Albert I Ko,AI,Albert I,Ko,72,19756012,1,Albert I,Ko,AI,"Division of Infectious Disease, Weill Medical ..."
2,AI Albert I Ko,AI,Albert I,Ko,72,24743322,4,Albert I,Ko,AI,"Yale School of Public Health, New Haven, Conne..."
3,AI Albert I Ko,AI,Albert I,Ko,72,24875389,17,Albert I,Ko,AI,"Centro de Pesquisas Gonçalo Moniz, Fundação Os..."
4,AI Albert I Ko,AI,Albert I,Ko,72,25058149,7,Albert I,Ko,AI,Yale University Schools of Public Health and M...
...,...,...,...,...,...,...,...,...,...,...,...
802,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,20558583,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
803,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,24928214,3,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
804,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25627443,4,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."
805,ZM Zenaide M de Morais,ZM,Zenaide M,de Morais,16,25834144,2,Zenaide M,de Morais,ZM,"Laboratório de Zoonoses Bacterianas do VPS, Fa..."


In [8]:
# Keep the author identity columns (the '_x' side of the merge), the paper
# count and the affiliation text; copy so later edits don't touch top_50_merged.
affl_data = top_50_merged.loc[
    :,
    ['FullName', 'AuthorInitials_x', 'AuthorForename_x', 'AuthorLastname_x', 'PMID_x', 'AuthorAffiliation'],
].copy()



In [13]:
# Strip the merge suffixes from the identity columns and give the per-author
# paper count a descriptive name.
affl_data_renamed = affl_data.rename(
    {
        'AuthorInitials_x': 'AuthorInitials',
        'AuthorForename_x': 'AuthorForename',
        'AuthorLastname_x': 'AuthorLastname',
        'PMID_x': 'Article_Count',
    },
    axis='columns',
)

In [20]:
# Write the first 50 rows of the affiliation table to disk (row index included).
# NOTE(review): affl_data_renamed has one row per author-paper pair, so this is
# the first 50 rows, not 50 authors - confirm that is the intended export.
affl_data_renamed.iloc[:50].to_csv(path_or_buf='t50_affl.csv')

In [30]:

# Spread each author's affiliation strings across columns: one row per author,
# one 'Affiliation_<n>' column per affiliation occurrence.

# Number each author's rows 0, 1, 2, ... so every affiliation lands in its own
# pivot column. This adds 'row_id' to affl_data_renamed in place; re-running
# the cell recomputes the same values.
affl_data_renamed['row_id'] = affl_data_renamed.groupby(['FullName', 'AuthorForename']).cumcount()

# Long -> wide. Each (author, row_id) pair holds at most one affiliation, so
# 'first' just carries that value through.
pivot_df = affl_data_renamed.pivot_table(
    index=['FullName', 'AuthorInitials', 'AuthorForename', 'AuthorLastname', 'Article_Count'],
    columns='row_id',
    values='AuthorAffiliation',
    aggfunc='first',
).reset_index()

# Prefix every column with 'Affiliation_'. The previous guard
# `col != ('', '')` compared string/int labels with a tuple and was therefore
# always true, so the identity columns are prefixed as well
# ('Affiliation_FullName', 'Affiliation_Article_Count', ...). Those names are
# kept on purpose: the sort below and the later summary cells rely on them.
pivot_df.columns = [f'Affiliation_{col}' for col in pivot_df.columns]

# Most prolific authors first.
affl_data_split = pivot_df.sort_values('Affiliation_Article_Count', ascending=False)

In [45]:
# Load the per-author affiliation summary and keep the three columns used for
# the overview table.
# NOTE(review): no cell visible here writes 'top_50_affl.csv' or creates
# 'Affiliation_Count'; presumably both come from a step outside this section -
# confirm, otherwise a fresh Restart & Run All will fail at this cell.
top_50_affl_data = pd.read_csv('top_50_affl.csv')
top_50_affl_data_table = top_50_affl_data.loc[
    :, ['Affiliation_FullName', 'Affiliation_Article_Count', 'Affiliation_Count']
]

In [44]:
# Display the full affiliation table loaded from 'top_50_affl.csv'.
top_50_affl_data

Unnamed: 0.1,Unnamed: 0,Affiliation_FullName,Affiliation_AuthorInitials,Affiliation_AuthorForename,Affiliation_AuthorLastname,Affiliation_Article_Count,Affiliation_0,Affiliation_1,Affiliation_2,Affiliation_3,...,Affiliation_38,Affiliation_39,Affiliation_40,Affiliation_41,Affiliation_42,Affiliation_43,Affiliation_44,Affiliation_45,Affiliation_46,Affiliation_Count
0,1,AI Albert I Ko,AI,Albert I,Ko,72,"Division of Infectious Diseases, Weill Medical...","Division of Infectious Disease, Weill Medical ...","Yale School of Public Health, New Haven, Conne...","Centro de Pesquisas Gonçalo Moniz, Fundação Os...",...,,,,,,,,,,38
1,22,M Mathieu Picardeau,M,Mathieu,Picardeau,70,"Unité de Biologie des Spirochètes, Institut Pa...","Institut Pasteur, Unité de Biologie des Spiroc...","Institut Pasteur, Biology of Spirochetes Unit,...","Institut Pasteur, Unité de Biologie des Spiroc...",...,"Institut Pasteur, Unité Biologie des Spirochèt...","Institut Pasteur, Université de Paris, Biologi...","Unité de Biologie des Spirochètes, Department ...","Biology of Spirochetes Unit, Institut Pasteur,...","National Reference Center for Leptospirosis, B...","Pasteur International Unit, Integrative Microb...","Biology of Spirochetes Unit, French National R...","Biology of Spirochetes Unit, National Referenc...","Institut Pasteur, Université Paris Cité, Biolo...",47
2,44,W Walter Lilenbaum,W,Walter,Lilenbaum,56,"Veterinary Bacteriology Laboratory, Universida...","Veterinary Bacteriology Laboratory, Department...","Laboratory of Veterinary Bacteriology, Departm...","Laboratory of Veterinary Bacteriology, Departm...",...,"Fluminense Federal University, Laboratory of V...","Laboratory of Veterinary Bacteriology, Biomedi...","Laboratory of Veterinary Bacteriology, Departm...","Laboratory of Veterinary Bacteriology, Flumine...","Laboratory of Veterinary Bacteriology, Biomedi...","Laboratory of Veterinary Bacteriology, Biomedi...","Laboratory of Veterinary Bacteriology, Biomedi...","Laboratory of Veterinary Bacteriology, Biomedi...","Laboratory of Veterinary Bacteriology, Biomedi...",47
3,27,N Nobuo Koizumi,N,Nobuo,Koizumi,53,"Department of Bacteriology, National Institute...","Department of Bacteriology, National Institute...","Department of Bacteriology, National Institute...","Department of Bacteriology, National Institute...",...,,,,,,,,,,37
4,25,MG Mitermayer G Reis,MG,Mitermayer G,Reis,51,"Gonçalo Moniz Research Centre, Oswaldo Cruz Fo...","Federal University of Bahia, Salvador, Bahia, ...","Centro de Pesquisas Gonçalo Moniz, Fundação Os...","Centro de Pesquisas Gonçalo Moniz, Fundação Os...",...,,,,,,,,,,26
5,12,F Federico Costa,F,Federico,Costa,38,"Laboratório de Patologia e Biologia Molecular,...","Centro de Pesquisas Gonçalo Moniz, Fundação Os...","Centro de Pesquisas Gonçalo Moniz, Fundação Os...","Oswaldo Cruz Foundation, Brazilian Ministry of...",...,,,,,,,,,,35
6,34,RA Rudy A Hartskeerl,RA,Rudy A,Hartskeerl,37,"Royal Tropical Institute, KIT Biomedical Resea...","KIT Biomedical Research, WHO/FAO/OIE and Natio...",WHO/FAO/OIE and National Leptospirosis Referen...,WHO/FAO/OIE and National Leptospirosis Referen...,...,,,,,,,,,,14
7,6,C Cyrille Goarant,C,Cyrille,Goarant,37,"Institut Pasteur, Institut Pasteur Internation...","Institut Pasteur in New Caledonia, 9-11 avenue...","Institut Pasteur International Network, Leptos...","Institut Pasteur in New Caledonia, Institut Pa...",...,,,,,,,,,,27
8,19,JM Joseph M Vinetz,JM,Joseph M,Vinetz,37,"Division of Infectious Diseases, Department of...","Department of Community Medicine, Tropical Dis...","Division of Infectious Diseases, Department of...","Division of Infectious Diseases, Department of...",...,,,,,,,,,,13
9,17,J Jie Yan,J,Jie,Yan,35,Department of Medical Microbiology and Parasit...,Department of Medical Microbiology and Parasit...,Department of Medical Microbiology and Parasit...,Collaborative Innovation Center for Diagnosis ...,...,,,,,,,,,,12


In [49]:
# Preview the 20 leading rows, showing the paper count under its plain name.
# The rename applies only to the displayed result; the stored table is unchanged.
top_50_affl_data_table.head(20).rename(columns={'Affiliation_Article_Count': 'Article_Count'})

Unnamed: 0,Affiliation_FullName,Article_Count,Affiliation_Count
0,AI Albert I Ko,72,38
1,M Mathieu Picardeau,70,47
2,W Walter Lilenbaum,56,47
3,N Nobuo Koizumi,53,37
4,MG Mitermayer G Reis,51,26
5,F Federico Costa,38,35
6,RA Rudy A Hartskeerl,37,14
7,C Cyrille Goarant,37,27
8,JM Joseph M Vinetz,37,13
9,J Jie Yan,35,12
