# BiblioParsing demonstration tools

In [None]:
# Displaying the working folder path
# defined in the json file "~\BiblioParsing\DemoConfig\BiblioParsing_config.json"

# Local library imports
import BiblioParsing as bp

# The first item of the configuration returned by `set_user_config`
# (called here without arguments) is used as the working folder path
config_tup = bp.set_user_config()
working_folder_path = config_tup[0]

# Reporting the path, then the end of the cell run (a blank line separates both messages)
print(f"Working folder path: {working_folder_path}",
      "\nCell run completed",
      sep="\n")

In [None]:
# Example of building the architecture of the working folder in the user's path home,
# for a corpus year "year" using `build_files_paths` function of the `DemoUtils` module,
# and getting useful paths for demo as described in the json file:
#   "~\BiblioParsing\DemoConfig\BiblioParsing_config.json".
# This function takes into account the list of databases which raw data should be parsed.
# The user is invited to put the raw data in the dedicated folder.

# Local library imports
import BiblioParsing as bp

# Setting the corpus year
year = <"####">
print("Parsing year:",year,"\n")

# Setting the list of databases to parse
db_list = [bp.SCOPUS, bp.WOS]

# Building the working folder architecture for a corpus single year "year" and getting useful paths
config_tup = bp.set_user_config(year, db_list)
rawdata_path_dict, parsing_path_dict = config_tup[1], config_tup[2]

scopus_raw_path     = rawdata_path_dict[bp.SCOPUS]
scopus_parsing_path = parsing_path_dict[bp.SCOPUS]
print(f"\nPut the 'csv' file extracted from the Scopus database in:\n   {scopus_raw_path}")
print(f"\nThe parsing results for the Scopus rawdata will be saved in:\n   {scopus_parsing_path}")

wos_raw_path     = rawdata_path_dict[bp.WOS]
wos_parsing_path = parsing_path_dict[bp.WOS]
print(f"\nPut the 'txt' file extracted from the WoS database in:\n   {wos_raw_path}")
print(f"\nThe parsing results for the WoS rawdata will be saved in:\n   {wos_parsing_path}")

concat_parsing_path = parsing_path_dict['concat']
print(f"\nThe concatenated parsing results will be saved in:\n   {concat_parsing_path}")

dedup_parsing_path = parsing_path_dict['dedup']
print(f"\nThe deduplicated parsing results will be saved in:\n   {dedup_parsing_path}")   
    
print("\nCell run completed")

In [None]:
# Example of parsing Scopus raw data
# using `biblio_parser_scopus` function of `BiblioParsingScopus` module.
# Then saving results as xlsx files using `save_parsing_dict` 
# and `save_fails_dict` functions of `DemoUtils`module.

# Local library imports
import BiblioParsing as bp

# Setting the corpus year
year = <"####">
print("Parsing year:",year,"\n")

# Setting the user's xlsx files for mormalizing institutions 
# if set to None, use of default files of BiblioParsing_RefFiles folder
user_institute_affiliations_file_path = Path(<your_fullpath_to_institute_affiliations_file>)
user_inst_types_file_path = Path(<your_fullpath_to_inst_types_file>)
user_country_towns_folder_path = Path(<your_fullpath_to_country_towns_folder>)
user_country_towns_file = Path(<your_country_towns_file_name>)

# Setting the list of databases to parse
db_list = [bp.SCOPUS, bp.WOS]

# Building the working folder architecture for a corpus single year "year" and getting useful paths
config_tup = bp.set_user_config(year, db_list)
rawdata_path_dict, parsing_path_dict, item_filename_dict = config_tup[1], config_tup[2], config_tup[3]

# Setting the files type for saving results
save_extent = "xlsx"

# Parsing Scopus rawdata
print("Scopus parsing launched...")
scopus_raw_path = rawdata_path_dict[bp.SCOPUS]
scopus_parsing_dict, scopus_fails_dict = bp.biblio_parser_scopus(scopus_raw_path,
                                                                 inst_filter_list = None,
                                                                 country_affiliations_file_path = user_institute_affiliations_file_path,
                                                                 inst_types_file_path = user_inst_types_file_path,
                                                                 country_towns_file = user_country_towns_file,
                                                                 country_towns_folder_path = user_country_towns_folder_path)
if scopus_parsing_dict:

    # Saving parsing results as xlsx files
    print("Results saving launched...")
    scopus_parsing_path = parsing_path_dict[bp.SCOPUS]
    message = bp.save_parsing_dict(scopus_parsing_dict, scopus_parsing_path, 
                                   item_filename_dict, save_extent)
    print("\n",message)

    # Saving parsing fails as json file
    message = bp.save_fails_dict(scopus_fails_dict, scopus_parsing_path)
    print("\n",message)
    print("\nCell run completed")

else:
    print(bp.set_rawdata_error(bp.SCOPUS, scopus_raw_path, bp.SCOPUS_RAWDATA_EXTENT))     


In [None]:
# Example of parsing WoS rawdata 
# using `biblio_parser_wos` function of `BiblioParsingWos` module.
# Then saving results as xlsx files using `save_parsing_dict` 
# and `save_fails_dict` functions of `DemoUtils`module.

# Local library imports
import BiblioParsing as bp

# Setting the corpus year
year = <"####">
print("Parsing year:",year,"\n")

# Setting the user's xlsx files for mormalizing institutions 
# if set to None, use of default files of BiblioParsing_RefFiles folder
user_institute_affiliations_file_path = Path(<your_fullpath_to_institute_affiliations_file>)
user_inst_types_file_path = Path(<your_fullpath_to_inst_types_file>)
user_country_towns_folder_path = Path(<your_fullpath_to_country_towns_folder>)
user_country_towns_file = Path(<your_country_towns_file_name>)

# Setting the list of databases to parse
db_list = [bp.SCOPUS, bp.WOS]

# Building the working folder architecture for a corpus single year "year" and getting useful paths
config_tup = bp.set_user_config(year, db_list)
rawdata_path_dict, parsing_path_dict, item_filename_dict = config_tup[1], config_tup[2], config_tup[3]

# Setting the files type for saving results
save_extent = "xlsx"

# Parsing WoS rawdata
print("WoS parsing launched...")
wos_raw_path = rawdata_path_dict[bp.WOS]
wos_parsing_dict, wos_fails_dict = bp.biblio_parser_wos(wos_raw_path,
                                                        inst_filter_list = None,
                                                        country_affiliations_file_path = user_institute_affiliations_file_path,
                                                        inst_types_file_path = user_inst_types_file_path,
                                                        country_towns_file = user_country_towns_file,
                                                        country_towns_folder_path = user_country_towns_folder_path)
if wos_parsing_dict:
    
    # Saving parsing results as xlsx files
    print("Results saving launched...")
    wos_parsing_path = parsing_path_dict[bp.WOS]
    message = bp.save_parsing_dict(wos_parsing_dict, wos_parsing_path, 
                                   item_filename_dict, save_extent)
    print("\n",message)

    # Saving parsing fails as json file 
    message = bp.save_fails_dict(wos_fails_dict, wos_parsing_path)
    print("\n",message)
    print("\nCell run completed")
    
else:
    print(bp.set_rawdata_error(bp.WOS, wos_raw_path, bp.WOS_RAWDATA_EXTENT))     

In [None]:
# Example of concatenating and deduplicating Scopus and WoS parsings
# using `concatenate_parsing` and `deduplicate_parsing` functions of `BiblioParsingConcat` module.
# Then saving results as xlsx files using `save_parsing_dict` function of `DemoUtils` module.

# Local library imports
import BiblioParsing as bp

# Setting the corpus year
year = <"####">
print("Parsing year:",year,"\n")

# Setting the user's authors affiliations filter as a list of tuples (institution normalized name, institution column name)
user_inst_filter_list = [(<normalized name 1>, <column name 1>),
                         (<normalized name 2>, <column name 2>),
                         ...]
print("User's institutions filter list:", user_inst_filter_list,"\n")

# Setting the user's xlsx files for mormalizing institutions 
# if set to None, use of default files of BiblioParsing_RefFiles folder
user_institute_affiliations_file_path = Path(<your_fullpath_to_institute_affiliations_file>)
user_country_affiliations_file_path = Path(<your_fullpath_to_country_affiliations_file>)
user_inst_types_file_path = Path(<your_fullpath_to_inst_types_file>)
user_country_towns_folder_path = Path(<your_fullpath_to_country_towns_folder>)
user_country_towns_file = Path(<your_country_towns_file_name>)

# Setting the user's status of building normalized institutions file and raw institutions file after deduplicating parsing
user_norm_inst_status = True

# Setting the list of databases to parse
db_list = [bp.SCOPUS, bp.WOS]

# Building the working folder architecture for a corpus single year "year" and getting useful paths
config_tup = bp.set_user_config(year, db_list)
rawdata_path_dict, parsing_path_dict, item_filename_dict = config_tup[1], config_tup[2], config_tup[3]

# Setting the files type for saving results
save_extent = "xlsx"

# Parsing Scopus rawdata
print("Scopus parsing launched...")
scopus_raw_path = rawdata_path_dict[bp.SCOPUS]
scopus_parsing_dict, _ = bp.biblio_parser_scopus(scopus_raw_path,
                                                 inst_filter_list = None,
                                                 country_affiliations_file_path = user_institute_affiliations_file_path,
                                                 inst_types_file_path = user_inst_types_file_path,
                                                 country_towns_file = user_country_towns_file,
                                                 country_towns_folder_path = user_country_towns_folder_path)

# Parsing WoS rawdata
print("WoS parsing launched...")
wos_raw_path = rawdata_path_dict[bp.WOS]
wos_parsing_dict, _ = bp.biblio_parser_wos(wos_raw_path,
                                           inst_filter_list = None,
                                           country_affiliations_file_path = user_institute_affiliations_file_path,
                                           inst_types_file_path = user_inst_types_file_path,
                                           country_towns_file = user_country_towns_file,
                                           country_towns_folder_path = user_country_towns_folder_path)  

if scopus_parsing_dict and wos_parsing_dict:

    # Parsings concatenation
    print("Concatenation of Scopus and WoS parsings launched...")
    concat_parsing_path = parsing_path_dict['concat']
    concat_parsing_dict = bp.concatenate_parsing(scopus_parsing_dict, wos_parsing_dict,  
                                                 inst_filter_list = user_inst_filter_list)
    _ = bp.save_parsing_dict(concat_parsing_dict, concat_parsing_path, item_filename_dict, save_extent)

    # Parsings deduplication
    print("Deduplication of Scopus and WoS parsings launched...")
    dedup_parsing_path = parsing_path_dict['dedup']
    dedup_parsing_dict = bp.deduplicate_parsing(concat_parsing_dict, 
                                                norm_inst_status = user_norm_inst_status,
                                                inst_types_file_path = user_inst_types_file_path,
                                                country_affiliations_file_path = user_country_affiliations_file_path,
                                                country_towns_file = user_country_towns_file,
                                                country_towns_folder_path = user_country_towns_folder_path)
    message = bp.save_parsing_dict(dedup_parsing_dict, dedup_parsing_path, item_filename_dict, save_extent)
    print("\n",message)
    print("\nCell run completed")

else:
    if not scopus_parsing_dict:
        print(bp.set_rawdata_error(bp.SCOPUS, scopus_raw_path, bp.SCOPUS_RAWDATA_EXTENT))
    if not wos_parsing_dict:
        print(bp.set_rawdata_error(bp.WOS, wos_raw_path, bp.WOS_RAWDATA_EXTENT))


In [None]:
# Example of parsing Scopus and WoS rawdata, then concatenating un deduplicating parsings results
# using `parse_to_dedup` function of `DemoUtils` module.
# Then saving results as tsv files with ".dat" extension and xlsx files
# using `save_parsing_dicts` function of `DemoUtils` module.

# Local library imports
import BiblioParsing as bp

# Setting the corpus year
year = <"####">
print("Parsing year:",year,"\n")

# Setting the user's authors affiliations filter as a list of tuples (institution normalized name, institution column name)
user_inst_filter_list = [(<normalized name 1>, <column name 1>),
                         (<normalized name 2>, <column name 2>),
                         ...]
print("User's institutions filter list:", user_inst_filter_list,"\n")

# Setting the user's xlsx files for mormalizing institutions 
# if set to None, use of default files of BiblioParsing_RefFiles folder
user_institute_affiliations_file_path = Path(<your_fullpath_to_institute_affiliations_file>)
user_country_affiliations_file_path = Path(<your_fullpath_to_country_affiliations_file>)
user_inst_types_file_path = Path(<your_fullpath_to_inst_types_file>)
user_country_towns_folder_path = Path(<your_fullpath_to_country_towns_folder>)
user_country_towns_file = Path(<your_country_towns_file_name>)

# Setting the user's status for building dicts of normalized institutions 
# and of not-yet normalized institutions for further normalization
user_norm_inst_status = True

# Setting the list of databases to parse
db_list = [bp.SCOPUS, bp.WOS]

# Building the working folder architecture for a corpus single year "year" and getting useful paths
config_tup = bp.set_user_config(year, db_list)
rawdata_path_dict, parsing_path_dict, item_filename_dict = config_tup[1], config_tup[2], config_tup[3]

# Setting the rawdata path for Scopus and WoS 
db_raw_dict = {}
db_raw_dict[bp.SCOPUS] = rawdata_path_dict[bp.SCOPUS]
db_raw_dict[bp.WOS]    = rawdata_path_dict[bp.WOS]

# Parsing rawdata of Scopus and WoS database, then concatenate and deduplicate the results
print("Parsing to deduplication of Scopus and WoS data launched...")
parsing_dicts_dict, fails_dicts = bp.parse_to_dedup(year, db_raw_dict, 
                                                    user_inst_filter_list,
                                                    user_norm_inst_status,
                                                    user_istitute_affiliations_file_path,
                                                    user_inst_types_file_path,
                                                    user_country_affiliations_file_path,
                                                    user_country_towns_file,
                                                    user_country_towns_folder_path,
                                                    verbose = False)   
if parsing_dicts_dict:
    
    # Saving results as tsv files with ".dat" extension
    tsv_save_extent = "dat"
    message = bp.save_parsing_dicts(parsing_dicts_dict, parsing_path_dict, 
                                    item_filename_dict, tsv_save_extent, fails_dicts)
    print("\n", message)

    # Saving results as xslx files
    xlsx_save_extent = "xlsx"
    message = bp.save_parsing_dicts(parsing_dicts_dict, parsing_path_dict, 
                                    item_filename_dict, xlsx_save_extent, fails_dicts)
    print("\n", message)
    print("\nCell run completed")
