In [1]:
!pip install pandas bokeh seaborn pybliometrics

Collecting pybliometrics
  Downloading pybliometrics-3.6-py2.py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m748.6 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: pybliometrics
Successfully installed pybliometrics-3.6


In [3]:
%matplotlib notebook
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pybliometrics.scopus import ScopusSearch
from tqdm import tqdm

Creating config file at /root/.config/pybliometrics.cfg with default paths...
Please enter your API Key(s), obtained from http://dev.elsevier.com/myapikey.html.  Separate multiple keys by comma:
44d9fdfb846449e4a2bab214f317753e
API Keys are sufficient for most users.  If you have an InstToken, please enter the token now; otherwise just press Enter:

Configuration file successfully created at /root/.config/pybliometrics.cfg
For details see https://pybliometrics.rtfd.io/en/stable/configuration.html.


In [4]:
def get_search_result_df(conference, year, volume=None):
    if volume is not None:
      full_volume_results = None
      for vol in volume:
        search_string = f'SRCTITLE (\"{conference}\") AND VOLUME ({vol}) AND PUBYEAR = {year}'
        search_results = ScopusSearch(search_string, subscriber=False)
        results_df = pd.DataFrame(pd.DataFrame(search_results.results))

        if full_volume_results is None:
          full_volume_results = results_df
        else:
          full_volume_results = pd.concat([full_volume_results, results_df], ignore_index=True)

      return(full_volume_results.shape, full_volume_results)

    else:
      search_string = f'SRCTITLE (\"{conference}\") AND PUBYEAR = {year}'

      search_results = ScopusSearch(search_string, subscriber=False)
      results_df = pd.DataFrame(pd.DataFrame(search_results.results))
      return(results_df.shape, results_df)

In [5]:
conferences_list = ["International Conference on Learning Representations",
                    "Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition",
                    "Proceedings of the IEEE International Conference on Computer Vision",
                    "European Conference on Computer Vision",
                    "International Journal of Computer Vision",
                    "IEEE Transactions on Pattern Analysis and Machine Intelligence",
                    "International Conference on Medical Image Computing and Computer Assisted Intervention",
                    "Advances in Neural Information Processing Systems",
                    "International Conference on Machine Learning"]

# Confusing Venues: Proceedings - International Conference on Machine Learning and Cybernetics, Proceedings - 2015 IEEE 14th International Conference on Machine Learning and Applications, Proceedings - 2020 2nd International Conference on Machine Learning, Big Data and Business Intelligence
# 6th International Conference on Learning Representations, ICLR 2018 - Workshop Track Proceedings

conf_journal_dict = {"European Conference on Computer Vision": "Lecture Notes in Computer Science",
                     "International Conference on Medical Image Computing and Computer Assisted Intervention": "Lecture Notes in Computer Science"}

conf_year_replacement = {('Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition',2017):
                            'IEEE Conference on Computer Vision and Pattern Recognition',
                          ('Proceedings of the IEEE International Conference on Computer Vision',2012): None,
                          ('Proceedings of the IEEE International Conference on Computer Vision',2014): None,
                          ('Proceedings of the IEEE International Conference on Computer Vision',2016): None,
                          ('Proceedings of the IEEE International Conference on Computer Vision',2018): None,
                          ('Proceedings of the IEEE International Conference on Computer Vision',2020): None,
                          ('Proceedings of the IEEE International Conference on Computer Vision',2022): None,
                          ('European Conference on Computer Vision',2013): None,
                          ('European Conference on Computer Vision',2015): None,
                          ('European Conference on Computer Vision',2017): None,
                          ('European Conference on Computer Vision',2019): None,
                          ('European Conference on Computer Vision',2021): None,
                        }
volume_dict = {
    "European Conference on Computer Vision": {2012: range(7572, 7579), 2014: range(8689, 8696), 2016: range(9905, 9913),
                                               2018: range(11205, 11221), 2020: range(12346, 12376), 2022: range(13661, 13700)},

    "International Conference on Medical Image Computing and Computer Assisted Intervention": {2012: range(7510, 7512), 2013: range(8149, 8151),
                                                                                               2014: (8673, 8675), 2015: range(9349, 9351),
                                                                                               2016: range(9900, 9902), 2017: range(10433, 10435),
                                                                                               2018: range(11070, 11073), 2019: range(11764, 11769),
                                                                                               2020: range(12261, 12267), 2021: range(12901, 12908 ),
                                                                                               2022: range(13431, 13438)}
}

full_results = None

for conf in tqdm(conferences_list):
  for year in range(2012, 2023):

    # If Conference and Year combination either doesnt exist or is a special case (CVPR 2017)
    if (conf,year) in conf_year_replacement:
        conf_temp = conf_year_replacement[(conf,year)]
        if conf_temp is None:
          continue
        shape, results = get_search_result_df(conf_temp, year)
    else:
        # If conference is instead published as a journal with certain volumes
        if conf in conf_journal_dict:
          volume = volume_dict[conf][year]
          conf_temp = conf_journal_dict[conf]
          shape, results = get_search_result_df(conf_temp, year, volume)
        else:
          # Normal conference year combination
          shape, results = get_search_result_df(conf, year)

    print(conf, ' ', year, ' : ', shape)
    if full_results is None:
      full_results = results
    else:
      full_results = pd.concat([full_results, results], ignore_index=True)

print(full_results.shape)

full_results.to_csv(f"top_tier_data.csv", index=False)

  0%|          | 0/9 [00:00<?, ?it/s]

International Conference on Learning Representations   2012  :  (0, 0)
International Conference on Learning Representations   2013  :  (57, 36)
International Conference on Learning Representations   2014  :  (74, 36)
International Conference on Learning Representations   2015  :  (106, 36)
International Conference on Learning Representations   2016  :  (80, 36)
International Conference on Learning Representations   2017  :  (312, 36)
International Conference on Learning Representations   2018  :  (535, 36)
International Conference on Learning Representations   2019  :  (503, 36)
International Conference on Learning Representations   2020  :  (687, 36)




International Conference on Learning Representations   2021  :  (861, 36)


 11%|█         | 1/9 [02:32<20:16, 152.10s/it]

International Conference on Learning Representations   2022  :  (1192, 36)
Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition   2012  :  (467, 36)
Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition   2013  :  (473, 36)
Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition   2014  :  (541, 36)
Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition   2015  :  (603, 36)
Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition   2016  :  (645, 36)


 11%|█         | 1/9 [03:41<29:34, 221.77s/it]


KeyboardInterrupt: 

In [None]:
import pandas as pd

full_results = pd.read_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/top_tier_data_v2.csv")

venues_names_as_is = ["Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition",
                      "Proceedings of the IEEE International Conference on Computer Vision",
                      "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
                      "International Journal of Computer Vision",
                      "IEEE Transactions on Pattern Analysis and Machine Intelligence",
                      "Advances in Neural Information Processing Systems",
                      "International Conference on Machine Learning",
                      "International Conference on Learning Representations"]

venues_names_as_is = [venue.lower() for venue in venues_names_as_is]

wrong_venues = []
for df in full_results["publicationName"]:
  if df.lower() not in venues_names_as_is:
    wrong_venues .append(df)

wrong_venues = set(wrong_venues)
print('Wrong venues #', len(wrong_venues))


identified_wrong_venues_keywords = ["ICMLA", "MLBDBI", "MLCCIM", "ICMLANT", "MLKE", "Workshop Track Proceedings", "iCMLDE", "MLDS", "Cybernetics", "MLISE",
                                    "Application", "COM-IT-CON", "MLCR", "Cloud"]

del_indices = []
for index, row in full_results.iterrows():
  venue = row["publicationName"]
  for keyword in identified_wrong_venues_keywords:
      if keyword in venue:
        del_indices.append(index)

full_results = full_results.drop(del_indices)
full_results.to_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/correct_top_tier_data_v2.csv")

  full_results = pd.read_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/top_tier_data.csv")


Wrong venues # 51


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas bokeh seaborn pybliometrics==3.5.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install pbr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pbr
  Using cached pbr-5.11.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: pbr
Successfully installed pbr-5.11.1


In [None]:
########### Authors of Topitier conferences List ########################

import pandas as pd
from tqdm import tqdm
import numpy as np
from pybliometrics.scopus import AbstractRetrieval
import json as simplejson

full_results = pd.read_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/correct_top_tier_data_v2.csv")
err_count = 0

print('Amount of publications #', len(full_results))

start_index = 0
authors_df =[]
for index, row in tqdm(full_results.iterrows()):
  if index <= start_index:
    continue
  try:
    import pdb; pdb.set_trace()
    ab = AbstractRetrieval(row["eid"], view='FULL')
    if type(row["affiliation_country"]) != str:
      if np.isnan(row["affiliation_country"]):
        continue
    eid, dup_indx = [row["eid"]]*len(ab.authorgroup), [index]*len(ab.authorgroup)
    author_df = pd.DataFrame({'dup_indx':dup_indx, 'eid':eid})
    author_df = pd.concat([author_df, pd.DataFrame(data = ab.authorgroup)], axis=1 )
    authors_df.append(author_df)
  except:
    err_count += 1
    start_index = index
    print('Stopped at ', start_index)
    break

print('Errors #', err_count)
print(len(authors_df))
authors_df = pd.concat(authors_df, axis=0)
authors_df.to_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/correct_toptier_authors_v2.csv")

  full_results = pd.read_csv("/content/drive/MyDrive/Ro'ya CV4Africa Community Files/Bibliometric Study/Work_Phases/Data_Collection/correct_top_tier_data.csv")


Amount of publications # 43855


1it [00:00,  8.67it/s]


Stopped at  1
Errors # 1
0


ValueError: ignored

In [None]:
from pybliometrics.scopus import AbstractRetrieval
ab = AbstractRetrieval("2-s2.0-84867132238", view='FULL')
print(ab.authorgroup)

[Author(affiliation_id=60028186, dptid=None, organization='Ecole Polytechnique Fédérale de Lausanne (EPFL)', city=None, postalcode=None, addresspart=None, country='Switzerland', collaboration=None, auid=34869135400, orcid=None, indexed_name='Alahi A.', surname='Alahi', given_name='Alexandre'), Author(affiliation_id=60028186, dptid=None, organization='Ecole Polytechnique Fédérale de Lausanne (EPFL)', city=None, postalcode=None, addresspart=None, country='Switzerland', collaboration=None, auid=55376821300, orcid=None, indexed_name='Ortiz R.', surname='Ortiz', given_name='Raphael'), Author(affiliation_id=60028186, dptid=None, organization='Ecole Polytechnique Fédérale de Lausanne (EPFL)', city=None, postalcode=None, addresspart=None, country='Switzerland', collaboration=None, auid=7004114381, orcid=None, indexed_name='Vandergheynst P.', surname='Vandergheynst', given_name='Pierre')]
