# Paquets à installer avec pip lors de la construction du venv pour ce notebook Jupyter
- pip install ipykernel==6.28.0
- pip install XlsxWriter==3.1.9

Reconstruire le kernel 
- python -m ipykernel install --user --name=BiblioParsing_ker


In [None]:
# Example: parse a Scopus corpus, then save the parsing results and the parsing fails

# Standard library imports
import json
from pathlib import Path

# Local library imports
import BiblioParsing as bp

# File names chosen by the user (values) for each parsing item (keys)
item_filename_dict = {
    'AD': 'addresses',
    'ADI': 'addresses_institutions',  # not yet parsed
    'A': 'articles',
    'AU': 'authors',
    'I2': 'authorsinst',
    'AK': 'authorskeywords',
    'CU': 'countries',
    'I': 'institutions',
    'IK': 'journalkeywords',
    'TK': 'titlekeywords',
    'S': 'subjects',
    'S2': 'subjects2',
    'I3': 'rawinstitutions',  # not yet parsed
    'R': 'references',
}

# Run the parser on the Scopus rawdata; it returns the parsing results
# together with a second object that is stored below as the parsing fails.
# Replace the path with your own:
#scopus_raw_path = Path(<your_fullpath_file_to_scopus_rawdata>)
print("Scopus parsing launched...")
scopus_raw_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/scopus/rawdata")
scopus_parsing_dict, scopus_fails_dict = bp.biblio_parser(scopus_raw_path, bp.SCOPUS)

# Write the parsing results as xlsx files.
# Replace the path with your own:
#scopus_parsing_path = Path(<your_fullpath_for_scopus_parsing_results>)
print("Results saving launched")
scopus_parsing_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/scopus/parsing")
bp.save_parsing_xlsx(bp.SCOPUS, scopus_parsing_dict, scopus_parsing_path, item_filename_dict)

# Dump the parsing fails as 'failed.json' next to the parsing results
with (scopus_parsing_path / 'failed.json').open('w') as fails_file:
    json.dump(scopus_fails_dict, fails_file, indent=4)


In [None]:
# Example: parse a WoS corpus, then save the parsing results and the parsing fails

# Standard library imports
import json
from pathlib import Path

# Local library imports
import BiblioParsing as bp

# File names chosen by the user (values) for each parsing item (keys)
item_filename_dict = {
    'AD': 'addresses',
    'ADI': 'addresses_institutions',  # not yet parsed
    'A': 'articles',
    'AU': 'authors',
    'I2': 'authorsinst',
    'AK': 'authorskeywords',
    'CU': 'countries',
    'I': 'institutions',
    'IK': 'journalkeywords',
    'TK': 'titlekeywords',
    'S': 'subjects',
    'S2': 'subjects2',
    'I3': 'rawinstitutions',
    'R': 'references',
}

# Run the parser on the WoS rawdata; it returns the parsing results
# together with a second object that is stored below as the parsing fails.
# Replace the path with your own:
#wos_raw_path = Path(<your_fullpath_file_to_wos_rawdata>)
print("WoS parsing launched...")
wos_raw_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/wos/rawdata")
wos_parsing_dict, wos_fails_dict = bp.biblio_parser(wos_raw_path, bp.WOS)

# Write the parsing results as xlsx files.
# Replace the path with your own:
#wos_parsing_path = Path(<your_fullpath_for_wos_parsing_results>)
print("Results saving launched")
wos_parsing_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/wos/parsing")
bp.save_parsing_xlsx(bp.WOS, wos_parsing_dict, wos_parsing_path, item_filename_dict)

# Dump the parsing fails as 'failed.json' next to the parsing results
with (wos_parsing_path / 'failed.json').open('w') as fails_file:
    json.dump(wos_fails_dict, fails_file, indent=4)


In [2]:
# Example: parse Scopus and WoS rawdata, concatenate the two parsings,
# then deduplicate the concatenation; both results are saved as xlsx files

# Standard library imports
from pathlib import Path

# Local library imports
import BiblioParsing as bp

# File names chosen by the user (values) for each parsing item (keys)
item_filename_dict = {
    'AD': 'addresses',
    'ADI': 'addresses_institutions',  # not yet parsed
    'A': 'articles',
    'AU': 'authors',
    'I2': 'authorsinst',
    'AK': 'authorskeywords',
    'CU': 'countries',
    'I': 'institutions',
    'IK': 'journalkeywords',
    'TK': 'titlekeywords',
    'S': 'subjects',
    'S2': 'subjects2',
    'I3': 'rawinstitutions',
    'R': 'references',
}

# Authors-affiliations filter given as a list of (institution, country) tuples
#user_inst_filter_list = [(<institution1>, <country1>), (<institution2>, <country2>), ...]
user_inst_filter_list = [('LITEN', 'France'), ('INES', 'France')]

# Scopus rawdata parsing (the second returned object is not used in this cell).
# Replace the path with your own:
#scopus_raw_path = Path(<your_fullpath_file_to_scopus_rawdata>)
print("Scopus parsing launched...")
scopus_raw_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/scopus/rawdata")
scopus_parsing_dict, _ = bp.biblio_parser(scopus_raw_path, bp.SCOPUS)

# WoS rawdata parsing (the second returned object is not used in this cell).
# Replace the path with your own:
#wos_raw_path = Path(<your_fullpath_file_to_wos_rawdata>)
print("WoS parsing launched...")
wos_raw_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/wos/rawdata")
wos_parsing_dict, _ = bp.biblio_parser(wos_raw_path, bp.WOS)

# Concatenation of the two parsings, passing the user's affiliations filter.
# Replace the path with your own:
#concat_parsing_path = Path(<your_fullpath_for_parsings_concat_results>)
print("Concatenation of Scopus and WoS parsings launched...")
concat_parsing_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/concatenation/parsing")
concat_parsing_dict = bp.concatenate_parsing(
    scopus_parsing_dict,
    wos_parsing_dict,
    inst_filter_list=user_inst_filter_list,
)
bp.save_parsing_xlsx('concatenation', concat_parsing_dict, concat_parsing_path, item_filename_dict)

# Deduplication of the concatenated parsing.
# Replace the path with your own:
#dedup_parsing_path = Path(<your_fullpath_for_parsings_dedup_results>)
print("Deduplication of Scopus and WoS parsings launched...")
dedup_parsing_path = Path("/Users/amal/BiblioParsing_Files/2021/Corpus/deduplication/parsing")
dedup_parsing_dict = bp.deduplicate_parsing(concat_parsing_dict)
# Kept as the last expression of the cell so that the notebook displays its return value
bp.save_parsing_xlsx('deduplication', dedup_parsing_dict, dedup_parsing_path, item_filename_dict)

Scopus parsing launched...
WoS parsing launched...
Concatenation of Scopus and WoS parsings launched...
Deduplication of Scopus and WoS parsings launched...


'All deduplication parsing results saved as xlsx files'

In [None]:
# Reading the name of the working folder
# as set in the json file "~\Appdata\Roaming\BiblioParsing\BiblioParsing_config.json"

# Local library imports
import BiblioParsing as bp

# Working folder architecture exposed by the package demo globals
parsing_folder_dict = bp.DEMO_GLOBAL['PARSING_FOLDER_ARCHI']

# The working folder name is stored under the 'folder_root' key
working_folder_name = parsing_folder_dict['folder_root']
message = f"\nWorking folder name: {working_folder_name}"
print(message)


In [None]:
# Example of building the architecture of the working folder in the user's home path,
# and getting the useful paths for the demo as described in the json file:
#   "~\Appdata\Roaming\BiblioParsing\BiblioParsing_config.json"
# This also checks that the configuration file is correctly filled:
# both the Scopus and the WoS labels must be found among the databases
# returned by the paths builder.

# Standard library imports
from pathlib import Path

# Local library imports
import BiblioParsing as bp

# Getting the working folder architecture
parsing_folder_dict = bp.DEMO_GLOBAL['PARSING_FOLDER_ARCHI']

# Setting the corpus year
year = "2021"

# Setting the root path of the working folder to the user's home folder
root_path = Path.home()

# Building the working folder architecture and getting useful paths
rawdata_path_dict, parsing_path_dict, db_dict = bp.build_files_paths(year, parsing_folder_dict, root_path)

if (bp.SCOPUS in db_dict.values()) and (bp.WOS in db_dict.values()):
    # Both databases are labelled as expected: show where to put the rawdata
    # and where each kind of parsing result will be saved
    scopus_raw_path     = rawdata_path_dict[bp.SCOPUS]
    scopus_parsing_path = parsing_path_dict[bp.SCOPUS]
    print(f"\nPut the 'csv' file extracted from the Scopus database in:\n   {scopus_raw_path}")
    print(f"\nThe parsing results for the Scopus rawdata will be saved in:\n   {scopus_parsing_path}")

    wos_raw_path     = rawdata_path_dict[bp.WOS]
    wos_parsing_path = parsing_path_dict[bp.WOS]
    print(f"\nPut the 'txt' file extracted from the WoS database in:\n   {wos_raw_path}")
    print(f"\nThe parsing results for the WoS rawdata will be saved in:\n   {wos_parsing_path}")

    concat_parsing_path = parsing_path_dict['concat']
    print(f"\nThe concatenated parsing results will be saved in:\n   {concat_parsing_path}")

    dedup_parsing_path = parsing_path_dict['dedup']
    print(f"\nThe deduplicated parsing results will be saved in:\n   {dedup_parsing_path}")

else:
    # Raw string: the Windows path contains backslashes that would otherwise be
    # read as (invalid) escape sequences and trigger a SyntaxWarning on recent Pythons
    config_file = r"~\Appdata\Roaming\BiblioParsing\BiblioParsing_config.json"
    # The check above fails when either label is missing, so name both databases
    message  = f"\nPlease check that the {bp.SCOPUS} and {bp.WOS} databases are correctly labelled "
    message += f"in your config json file:\n    {config_file}"
    print(message)

In [None]:
# Example of the full demo workflow:
# - build the architecture of the working folder in the user's home path,
# - set the paths to the rawdata extracted from the Scopus and WoS databases,
# - parse the rawdata, then concatenate and deduplicate the parsings,
# - save the parsing results in the working folder using the filenames defined in the json file:
#   "~\Appdata\Roaming\BiblioParsing\BiblioParsing_config.json"
# NOTE: the parsing and saving calls are left commented out in this cell.

# Standard library imports
from pathlib import Path

# Local library imports
import BiblioParsing as bp

# Corpus year
year = "2021"

# Working folder architecture and root path (the user's home folder)
parsing_folder_dict = bp.DEMO_GLOBAL['PARSING_FOLDER_ARCHI']
root_path = Path.home()

# Build the working folder architecture; only the rawdata and parsing paths are kept
rawdata_path_dict, parsing_path_dict, _ = bp.build_files_paths(year, parsing_folder_dict, root_path)

# Rawdata path of each database to parse (Scopus and WoS)
db_raw_dict = {
    bp.SCOPUS: rawdata_path_dict[bp.SCOPUS],
    bp.WOS: rawdata_path_dict[bp.WOS],
}

# Parsing rawdata of Scopus and WoS database, then concatenate and deduplicate the results
#parsing_dicts_dict, fails_dicts = bp.parse_to_dedup(year, db_raw_dict, verbose = True)

# File name of each parsing item, built from the names given by the package demo globals
parsing_filenames_dict = bp.DEMO_GLOBAL['PARSING_FILE_NAMES']
item_filename_dict = bp.build_item_filename_dict(parsing_filenames_dict)

# File extension passed to the (commented-out) saving call below
tsv_extent = "dat"
#message = save_parsing_dicts(parsing_dicts_dict, parsing_path_dict, item_filename_dict, tsv_extent)
#print("\n", message)