# Dev of HalConf

## General setting

In [None]:
# Imports of merge_pub_employees module

# Standard Library imports
import os
import warnings
from pathlib import Path

# 3rd party imports
import BiblioParsing as bp
import pandas as pd

# Local imports
import bmfuncts.employees_globals as eg
import bmfuncts.pub_globals as pg
from bmfuncts.config_utils import set_user_config
from bmfuncts.useful_functs import read_parsing_dict
from bmfuncts.rename_cols import build_col_conversion_dic

print("Cell run")

In [None]:
# Working parameters

# Institute whose conferences are processed
institute = "Liten"

# Corpus year to process
year_select = "2024"

# Working folders: institute application folder, ConfMeter files, then corpus year
app_path = Path(r"C:\Users\AC265100\Documents\BiblioMeter_App") / institute.upper()
hal_path = app_path / "ConfMeter_Files"
year_hal_path = hal_path / year_select

print("Cell run")

In [None]:
# Employees common parameters (independent of year_select)

# Aliases of the employees-database architecture and of the search depth
effectifs_root_alias = eg.EMPLOYEES_ARCHI["root"]
effectifs_folder_name_alias = eg.EMPLOYEES_ARCHI["all_years_employees"]
effectifs_file_name_alias = eg.EMPLOYEES_ARCHI["employees_file_name"]
search_depth_init_alias = eg.SEARCH_DEPTH

# Aliases of the name columns of the employees database
first_name_col_alias = eg.EMPLOYEES_USEFUL_COLS['first_name']
last_name_col_alias = eg.EMPLOYEES_USEFUL_COLS['name']
full_name_col_alias = eg.EMPLOYEES_ADD_COLS['employee_full_name']

# Paths to the employees folder, to the employees file and to its "Hal_" counterpart
effectifs_root_path = hal_path / Path(effectifs_root_alias)
effectifs_folder_path = effectifs_root_path / Path(effectifs_folder_name_alias)
all_effectifs_path = effectifs_folder_path / Path(effectifs_file_name_alias)
hal_all_effectifs_path = effectifs_folder_path / Path("Hal_" + effectifs_file_name_alias)

# Columns to read: the useful columns followed by the added columns;
# the HAL list appends the full-name column once more at its end
useful_col_list = [*eg.EMPLOYEES_USEFUL_COLS.values(), *eg.EMPLOYEES_ADD_COLS.values()]
hal_useful_col_list = [*useful_col_list, full_name_col_alias]

print("Cell run")

## 1- Configuring employees database for HalConf
Independent of year_select.

In [None]:
def _add_sheets_to_workbook(file_full_path, df_to_add, sheet_name):
    """Writes 'df_to_add' as the sheet 'sheet_name' of an existing Excel file.

    The workbook at 'file_full_path' is opened in append mode with the
    openpyxl engine. A sheet already named 'sheet_name' is replaced.

    Args:
        file_full_path (path): Full path to the existing Excel file.
        df_to_add (dataframe): Data to write, saved without its index.
        sheet_name (str): Name of the sheet to create or replace.
    """
    append_options = {'engine': 'openpyxl',
                      'mode': 'a',
                      'if_sheet_exists': 'replace'}

    # https://github.com/PyCQA/pylint/issues/3060
    with pd.ExcelWriter(file_full_path, **append_options) as workbook_writer:  # pylint: disable=abstract-class-instantiated
        df_to_add.to_excel(workbook_writer, sheet_name=sheet_name, index=False)

print("Cell run")

In [None]:
def _set_employees_data_hal(all_effectifs_path, search_depth_init, useful_col_list, corpus_year=None):
    """Reads the Institute employees database and adapts the search depth.

    All the sheets of the Excel file are read; each sheet name is converted
    to an integer year. When 'corpus_year' is given, the years to check are
    the 'search_depth_init' years ending at 'corpus_year'. The search depth
    is then limited to the number of those years found in the file. If none
    is found, the depth is set to 0 and a warning box is shown.

    Args:
        all_effectifs_path (path): Full path to file of Institute employees database.
        search_depth_init (int): Initial search depth.
        useful_col_list (list): Columns to read in the file of Institute employees \
        database; when empty or None, the useful and added columns of \
        `employees_globals` are read.
        corpus_year (str): Optional corpus year defined by 4 digits (default: None).
    Returns:
        (tup): (employees data (dict of dataframes keyed by sheet name), \
        adapted search depth (int), list of the retained years (int)).
    """

    # Honoring the columns asked by the caller (duplicates removed, order kept)
    if useful_col_list:
        cols_to_read = list(dict.fromkeys(useful_col_list))
    else:
        cols_to_read = (list(eg.EMPLOYEES_USEFUL_COLS.values())
                        + list(eg.EMPLOYEES_ADD_COLS.values()))

    # Getting employees df (one dataframe per sheet since sheet_name is None)
    all_effectifs_df = pd.read_excel(all_effectifs_path,
                                     sheet_name = None,
                                     dtype = eg.EMPLOYEES_COL_TYPES,
                                     usecols = cols_to_read,
                                     converters=eg.EMPLOYEES_CONVERTERS_DIC)

    # Identifying available years in employees df
    annees_dispo = [int(x) for x in all_effectifs_df]
    if not corpus_year:
        # No corpus year: every available year is kept and the depth is unchanged
        return (all_effectifs_df, search_depth_init, annees_dispo)

    # Keeping the available years among the 'search_depth_init' years ending at corpus_year
    depth_init = int(search_depth_init)
    last_year = int(corpus_year)
    annees_verifiees = [annee for annee in range(last_year - depth_init + 1, last_year + 1)
                        if annee in annees_dispo]

    if annees_verifiees:
        search_depth = min(depth_init, len(annees_verifiees))
    else:
        search_depth = 0
        # Imported here because the notebook does not import tkinter at top level
        from tkinter import messagebox
        warning_title = "!!! Attention !!!"
        warning_text  = ("Le nombre d'années disponibles est insuffisant "
                         "dans le fichier des effectifs de l'Institut."
                         "\nLe croisement auteurs-effectifs ne peut être effectué !"
                         "\n1- Complétez le fichier des effectifs de l'Institut ;"
                         "\n2- Puis relancer le croisement auteurs-effectifs.")
        messagebox.showwarning(warning_title, warning_text)
    return (all_effectifs_df, search_depth, annees_verifiees)

print("Cell run")

In [None]:
def _capitalize_name(name):
    name_space_split = name.split(" ")
    name_cap_list = []
    for sub_name in name_space_split:
        sub_name_minus_split = [x.capitalize() for x in sub_name.split("-")]
        sub_name_cap = "-".join(sub_name_minus_split)
        name_cap_list.append(sub_name_cap)
    name_cap = " ".join(name_cap_list)
    return name_cap

print("Cell run")

In [None]:
# Building HAL employees database
tup = _set_employees_data_hal(all_effectifs_path, search_depth_init_alias, useful_col_list)
all_effectifs_df, search_depth, annees_disponibles = tup

# Setting effectif full name with full first name, one sheet per year.
# The full-name column is computed in one pass per sheet instead of
# concatenating the dataframe row by row.
for sheet_num, (annee, annee_all_effectifs_df) in enumerate(all_effectifs_df.items()):
    hal_annee_all_effectifs_df = annee_all_effectifs_df.copy()
    hal_annee_all_effectifs_df[full_name_col_alias] = [
        _capitalize_name(first_name) + " " + _capitalize_name(last_name)
        for first_name, last_name in zip(hal_annee_all_effectifs_df[first_name_col_alias],
                                         hal_annee_all_effectifs_df[last_name_col_alias])
    ]
    sheet_name = str(annee)
    if sheet_num == 0:
        # The first sheet creates (or overwrites) the workbook
        hal_annee_all_effectifs_df.to_excel(hal_all_effectifs_path, sheet_name=sheet_name, index=False)
    else:
        # The following sheets are appended to the workbook just created
        _add_sheets_to_workbook(hal_all_effectifs_path, hal_annee_all_effectifs_df, sheet_name)

print("Cell run")

## 2- Reading the employees database

In [None]:
# Reading the HAL employees database for the selected corpus year
# (relies on the function _set_employees_data_hal)
tup = _set_employees_data_hal(
    hal_all_effectifs_path,
    search_depth_init_alias,
    hal_useful_col_list,
    year_select,
)
hal_all_effectifs_df, search_depth, annees_disponibles = tup

print("Cell run")

## 3- Reading a clean HAL conferences list
L'extraction HAL se fait via HalApyJson en exécutant la cellule ad_hoc de "Demo_HalApyJson"

In [None]:
def read_database_hal(hal_file_path, iso_country_file_path):
    """Builds the HAL conferences list with one row per author.

    The HAL extraction file is reduced to the rows whose 'Type de document'
    is 'COMM' or 'POSTER' (in that order). Missing values are replaced by
    `bp.UNKNOWN`. Each publication gets a 'Pub_id' (its rank in the list)
    and is repeated once per author of its comma-separated 'Auteurs' value,
    with the rank of the author as 'Auteur_id'.

    To Do: Define global parameters

    Args:
        hal_file_path (path): Full path to the Excel file of the HAL extraction.
        iso_country_file_path (path): Full path to the Excel file giving, in its \
        sheet "Base", the columns "Code" and "English name" of the countries.
    Returns:
        (dataframe): Conferences list with one row per author.
    """
    # Setting useful aliases
    unknown_alias = bp.UNKNOWN

    # Getting ISO code-country
    code_country_df = pd.read_excel(iso_country_file_path, sheet_name="Base")
    code_country_dict = dict(zip(code_country_df["Code"], code_country_df["English name"]))

    hal_col_list = ['Auteurs', 'Titres', 'Date de publication', 'Journal', 'e-ISSN', 'ISSN',
                    'Conference', 'Date de conference', 'Comite de lecture','Acte de conference',
                    'Affiliations', 'Institutions', 'Depts', 'Organismes',
                    'Type de document', 'Mots clefs', 'DOI', 'Lien url', 'Pays']
    order_col_list = ["Pub_id", "Auteur_id", "Co_auteur Liten", "Premier auteur",
                      "Année de publication", "Année de conférence", 'Conference', 'Pays',
                      'Type de document', 'Comite de lecture','Titres', 'DOI',
                      'Mots clefs', 'Acte de conference', 'Lien url',
                      'Date de conference', 'Date de publication',
                      'Auteurs', 'Affiliations', 'Institutions', 'Depts', 'Organismes'
                     ]

    # Selecting the communications then the posters
    hal_col_select_df = pd.read_excel(hal_file_path)[hal_col_list]
    doc_types = hal_col_select_df['Type de document']
    hal_conf_df = pd.concat([hal_col_select_df[doc_types == 'COMM'],
                             hal_col_select_df[doc_types == 'POSTER']])

    # Replacing only the missing values by the unknown marker:
    # genuine zeros in the data are kept as they are
    clean_hal_conf_df = hal_conf_df.astype(object).where(hal_conf_df.notna(), unknown_alias)

    # Building one record per (publication, author), assembled in a single dataframe at the end
    records = []
    for pub_id, (_, row) in enumerate(clean_hal_conf_df.iterrows()):
        pub_record = row.to_dict()
        pub_record["Pub_id"] = pub_id

        # A code absent from the ISO table gives the unknown marker instead of a KeyError
        country_iso = str(row["Pays"]).upper()
        pub_record["Pays"] = code_country_dict.get(country_iso, unknown_alias)

        authors_list = str(row['Auteurs']).split(",")
        pub_record["Premier auteur"] = authors_list[0]

        # str() keeps the slicing valid when a date cell is not read as text
        pub_record["Année de conférence"] = str(row['Date de conference'])[0:4]
        pub_record["Année de publication"] = str(row['Date de publication'])[0:4]

        for auth_id, author in enumerate(authors_list):
            records.append({**pub_record, "Auteur_id": auth_id, "Co_auteur Liten": author})

    # The columns absent from 'order_col_list' are dropped; an empty selection
    # gives an empty dataframe with the expected columns
    final_hal_conf_df = pd.DataFrame(records, columns=order_col_list)

    return final_hal_conf_df

print("Cell run")

In [None]:
# Reading and saving a clean HAL conferences list

# File names: ISO countries table, HAL extraction of the corpus year and its per-author version
iso_country_file = "Code ISO.xlsx"
hal_file = f"{year_select} HAL.xlsx"
hal_split_file = "Conf_split_" + hal_file

# Path to the ISO countries table (independent of year_select)
hal_country_path = hal_path / "Pays"
iso_country_file_path = hal_country_path / iso_country_file

# Paths to the HAL corpus of the selected year
year_hal_path = hal_path / year_select
hal_corpus_path = year_hal_path / "HAL corpus"
hal_file_path = hal_corpus_path / hal_file
hal_conf_file_path = hal_corpus_path / hal_split_file

# Building the HAL conferences list then saving it without its index
hal_conf_df = read_database_hal(hal_file_path, iso_country_file_path)
hal_conf_df.to_excel(hal_conf_file_path, index=False)

print("Cell run")

## 4- Merging HAL conferences list with employees database

In [None]:
# Local imports
from bmfuncts.config_utils import set_org_params

# Aliases of the orphan-treatment folder and of the employees-adds file
orphan_treat_root = pg.ARCHI_ORPHAN["root"]
adds_file_name_alias = pg.ARCHI_ORPHAN["employees adds file"]

# Institute organization parameters read for the working folder
org_tup = set_org_params(institute, hal_path)

print("Cell run")

In [None]:
def _standardize(text):
    """Returns 'text' processed by `bp.remove_special_symbol`
    with the options 'only_ascii' and 'strip' set to True.

    This is used to remove the accentuated characters of the names.
    """
    return bp.remove_special_symbol(text, only_ascii=True, strip=True)

print("Cell run")

### Checking author names spelling and modifying HAL extraction file

In [None]:
# Uses the functions: _standardize, _capitalize_name

def _check_names_spelling_hal(hal_path, init_df, pub_fullname_col, verbose=None):
    """Replaces author names in 'init_df' dataframe by the employee name.

    This is done when a name-spelling discrepancy is given in the dedicated
    Excel file which name is given by the key "orthograph file" of
    `pg.ARCHI_ORPHAN` and which is located in the folder given by its key
    "root" in the working folder.
    The names of the column 'pub_fullname_col' are first standardized through
    the `_standardize` function. A name is replaced when its lower-case form
    equals the lower-case form of a value of the column "Nom pub complet" of
    the file. The new name is the matching value of the column
    "Nom eff complet", capitalized through the `_capitalize_name` function.
    When a name is given several times in the file, the last row is used.

    Args:
        hal_path (path): Full path to working folder.
        init_df (dataframe): Publications list with one row per author \
        where author names should be corrected.
        pub_fullname_col (str): Name of the column of 'init_df' that contains \
        the full names of the authors.
        verbose (bool): If true, each correction is printed (default: None).
    Returns:
        (dataframe): Publications list with one row per author where \
        spelling of author names have been corrected.
    """

    # Setting useful aliases
    orphan_treat_root    = pg.ARCHI_ORPHAN["root"]
    orthograph_file_name = pg.ARCHI_ORPHAN["orthograph file"]

    # Setting useful column names
    ortho_fullname_init_col = "Nom pub complet"
    ortho_fullname_new_col = "Nom eff complet"
    ortho_col_list = [ortho_fullname_init_col,
                      ortho_fullname_new_col]

    # Setting useful path
    ortho_path = hal_path / Path(orphan_treat_root) / Path(orthograph_file_name)

    # Reading data file targeted by 'ortho_path';
    # the UserWarning filter is limited to this read and restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter(action = 'ignore', category = UserWarning)
        ortho_df = pd.read_excel(ortho_path, usecols = ortho_col_list)

    # Building the corrections table once: lower-case published name -> employee name.
    # Rows missing one of the two names cannot define a correction.
    ortho_df = ortho_df.dropna(subset = ortho_col_list)
    corrections_dict = {
        str(pub_name).lower(): _capitalize_name(str(empl_name))
        for pub_name, empl_name in zip(ortho_df[ortho_fullname_init_col],
                                       ortho_df[ortho_fullname_new_col])
    }

    def _correct_name(fullname_init):
        fullname_new = corrections_dict.get(fullname_init.lower())
        if fullname_new is None:
            # No discrepancy declared for this name
            return fullname_init
        if verbose:
            print("\nfullname_init:", fullname_init)
            print("fullname_new:", fullname_new)
        return fullname_new

    new_df = init_df.copy()
    new_df.reset_index(drop=True, inplace=True)
    new_df[pub_fullname_col] = new_df[pub_fullname_col].apply(_standardize)
    new_df[pub_fullname_col] = new_df[pub_fullname_col].apply(_correct_name)
    return new_df

print("Cell run")

In [None]:
# Correcting the spelling of the author names and saving the modified HAL list

# Column of 'hal_conf_df' holding the co-author names to correct
co_author_alias = "Co_auteur Liten"

# File name and path of the modified HAL list
mod_hal_file = "mod_" + hal_file
mod_hal_path = hal_corpus_path / mod_hal_file

# Correcting the names then saving the result without its index
mod_hal_conf_df = _check_names_spelling_hal(hal_path, hal_conf_df, co_author_alias, verbose=False)
mod_hal_conf_df.to_excel(mod_hal_path, index=False)

print("Cell run")

### Merging with employees database and external PhD students database

In [None]:
def _add_ext_docs_hal(init_submit_df, init_orphan_df, ext_docs_path,
                      co_author_join_col, ext_docs_full_name_col,
                      verbose=False):
    """Moves to the submit list the orphan authors found in the sheet
    of the external PhD students of the complementary employees file.

    The sheet is read with the columns given by `eg.EXT_DOCS_USEFUL_COL_LIST`
    after removal of the two last characters of its second-to-last item.
    A join key is built from the column 'ext_docs_full_name_col' in the same
    way as for the authors: standardized, lower-case, hyphens replaced by
    spaces. The orphan rows matching a sheet row on this key are appended to
    the submit list, which is sorted by "Pub_id" and "Auteur_id", and are
    removed from the orphan list.

    Args:
        init_submit_df (dataframe): Publications list with one row per author \
        already identified.
        init_orphan_df (dataframe): Publications list with one row per author \
        not yet identified; it should contain the column 'co_author_join_col'.
        ext_docs_path (path): Full path to the complementary employees file.
        co_author_join_col (str): Name of the column used as join key.
        ext_docs_full_name_col (str): Name of the column of the sheet that \
        contains the full names.
        verbose (bool): Not used (default: False).
    Returns:
        (tup): (updated submit list (dataframe), updated orphan list (dataframe)).
    """
    # Setting sheet name to read in complementary employees file
    ext_docs_sheet_name_alias = pg.SHEET_NAMES_ORPHAN["docs to add"]

    # Setting useful aliases
    converters_alias = eg.EMPLOYEES_CONVERTERS_DIC
    firstname_initials_col_alias = eg.EMPLOYEES_ADD_COLS['first_name_initials']

    # Setting useful column names
    ext_docs_useful_col_list = eg.EXT_DOCS_USEFUL_COL_LIST.copy()
    col_name_to_change = ext_docs_useful_col_list[-2]
    ext_docs_useful_col_list[-2] = col_name_to_change[:-2]

    # Reading of the external phd students excel file
    # with dates conversion through converters_alias;
    # the UserWarning filter is limited to this read and restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter(action = 'ignore', category = UserWarning)
        ext_docs_df = pd.read_excel(ext_docs_path,
                                    sheet_name=ext_docs_sheet_name_alias,
                                    usecols=ext_docs_useful_col_list,
                                    converters=converters_alias)

    # NOTE(review): this rename maps a name onto itself and is applied to the row
    # labels (no 'columns=' given), so it changes nothing - confirm the intended mapping
    ext_docs_df.rename({firstname_initials_col_alias: firstname_initials_col_alias}, inplace=True)

    # Dropping the empty rows then building the join key
    ext_docs_df.dropna(how='all', inplace=True)
    ext_docs_df.reset_index(drop=True, inplace=True)
    ext_docs_df[co_author_join_col] = ext_docs_df[ext_docs_full_name_col].apply(
        lambda x: _standardize(x).lower().replace("-"," "))

    # Appending to the submit list the orphan rows found in the sheet
    submit_adds_df = init_orphan_df.merge(ext_docs_df, how='inner', on=co_author_join_col)
    new_submit_df = pd.concat([init_submit_df, submit_adds_df])
    new_submit_df.sort_values(by=["Pub_id", "Auteur_id"], inplace=True)

    # Removing the submitted rows from the orphan list: these rows are added twice
    # so that every row also present in the orphan list is dropped as duplicate
    to_drop_df = new_submit_df[init_orphan_df.columns]
    new_orphan_df = pd.concat([init_orphan_df, to_drop_df, to_drop_df]).drop_duplicates(keep=False)

    return new_submit_df, new_orphan_df

print("Cell run")

In [None]:
# Names of the columns used for the merge
co_author_join_alias = "co_author_join"
pub_id_alias = "Pub_id"
author_id_alias = "Auteur_id"

# Names of the result files and of their folder
hal_submit_file = "submit_final.xlsx"
hal_orphan_file = "orphan_final.xlsx"
bdd_multi_mensuelle_folder = "0 - BDD multi mensuelle"

# Path to the complementary employees file (independent of year_select)
hal_ext_docs_path = hal_path / Path(orphan_treat_root) / Path(adds_file_name_alias)

# Paths to the result files (dependent on year_select)
hal_submit_path = year_hal_path / bdd_multi_mensuelle_folder
hal_orphan_path = year_hal_path / bdd_multi_mensuelle_folder
hal_submit_file_path = hal_submit_path / hal_submit_file
hal_orphan_file_path = hal_orphan_path / hal_orphan_file


def _name_to_join_key(name):
    """Builds the join key of a name: standardized, lower-case, hyphens replaced by spaces."""
    return _standardize(name).lower().replace("-"," ")


# At start every author is an orphan; the join key is built from the co-author name
orphan_df = mod_hal_conf_df.copy()
orphan_df[co_author_join_alias] = mod_hal_conf_df[co_author_alias].apply(_name_to_join_key)
print(len(orphan_df))

# Years of the employees database to search, from the most recent one backwards:
# the corpus year when it has a sheet, else the year before, over 'search_depth' years
corpus_year_status = year_select in hal_all_effectifs_df
year_start = int(year_select) if corpus_year_status else int(year_select) - 1
year_stop = year_start - (search_depth - 1)
years = [str(year) for year in range(year_start, year_stop - 1, -1)]

submit_df = pd.DataFrame()
for step_num, empl_year in enumerate(years):
    # Employees of the year with their join key
    empl_df = hal_all_effectifs_df[empl_year].copy()
    empl_df[co_author_join_alias] = empl_df[full_name_col_alias].apply(_name_to_join_key)

    # Orphan authors found among the employees are added to the submit list
    submit_adds_df = orphan_df.merge(empl_df, how='inner', on=co_author_join_alias)
    submit_df = pd.concat([submit_df, submit_adds_df])
    submit_df.sort_values(by=[pub_id_alias, author_id_alias], inplace=True)

    # The submitted rows are removed from the orphan list: they are added twice
    # so that every row also present in the orphan list is dropped as duplicate
    to_drop_df = submit_df[orphan_df.columns]
    orphan_df = pd.concat([orphan_df, to_drop_df, to_drop_df]).drop_duplicates(keep=False)

    # The external PhD students are searched once, right after the first year
    if step_num == 0:
        submit_df, orphan_df = _add_ext_docs_hal(submit_df, orphan_df, hal_ext_docs_path,
                                                 co_author_join_alias, full_name_col_alias, verbose=False)

submit_df.to_excel(hal_submit_file_path, index=False)
orphan_df.to_excel(hal_orphan_file_path, index=False)

print("Cell run")

### Adding author job type and saving new 'submit_df'

In [None]:
def _add_author_job_type_hal(in_path, out_path):
    """Adds a new column containing the job type for each author
    of the publications list with one row per author.

    The job type is got from the employee information available
    in 3 columns which names are given by 'category_col_alias',
    'status_col_alias' and 'qualification_col_alias'.
    For each of these columns, a job type is selected when one of its
    values is contained in the cell text. When several job types are
    selected, the last one found is kept. The job type is '-' when none
    is selected.
    The name of the new column is given by 'author_type_col_alias'.
    The updated publications list is saved as Excel file.

    Args:
        in_path (path):  Full path to the Excel file of the publications list \
        with one row per author with attributes as Institute employee.
        out_path (path): Full path for saving the modified publications list.
    Returns:
        (str): End message recalling the full path to the saved file of \
        the modified publications list.
    """

    # internal functions:
    def _get_author_type(row):
        author_type = '-'
        for col_name, dic in author_types_dic.items():
            cell_value = row[col_name]
            if not isinstance(cell_value, str):
                # Empty cells are read back as NaN: there is no text to search in
                continue
            for key, values_list in dic.items():
                if any(value in cell_value for value in values_list):
                    author_type = key
        return author_type

    # Setting useful aliases
    category_col_alias      = eg.EMPLOYEES_USEFUL_COLS['category']
    status_col_alias        = eg.EMPLOYEES_USEFUL_COLS['status']
    qualification_col_alias = eg.EMPLOYEES_USEFUL_COLS['qualification']
    author_type_col_alias   = pg.COL_NAMES_BONUS['author_type']

    author_types_dic = {category_col_alias      : eg.CATEGORIES_DIC,
                        status_col_alias        : eg.STATUS_DIC,
                        qualification_col_alias : eg.QUALIFICATION_DIC}

    # Read of the excel file
    submit_df = pd.read_excel(in_path)

    submit_df[author_type_col_alias] = submit_df.apply(_get_author_type, axis=1)

    submit_df.to_excel(out_path, index = False)

    end_message = f"Column with author job type added in file: \n  '{out_path}'"
    return end_message

print("Cell run")

In [None]:
# Adding author job type and saving new submit_df.
# The same path is given as input and output, so the submit file is overwritten;
# the end message returned by the function is not used.
_add_author_job_type_hal(hal_submit_file_path, hal_submit_file_path)

print("Cell run")

### Adding full article reference and saving new submit_df

In [None]:
def _set_full_ref_hal(title, first_author, conf_name, conf_year, conf_country, pub_year, doi):
    """Builds the full reference of a publication.

    Args:
        title (str): Title of the publication.
        first_author (str): First author of the publication formated as 'NAME IJ' \
        with 'NAME' the lastname and 'IJ' the initials of the firstname of the author.
        journal_name (str): Name of the journal where the publication is published.
        pub_year (str): Publication year defined by 4 digits.
        doi (str): Digital identification of the publication.
    Returns:
        (str): Full reference of the publication.
    """
    full_ref  = f'{title}, '                     # add the reference's title
    full_ref += f'{first_author}. et al., '      # add the reference's first author
    full_ref += f'{conf_name}, '                 # add the reference's conference name
    full_ref += f'{conf_country}-{conf_year}, '       # add the reference's conference country and year
    full_ref += f'{pub_year}'                  # add the reference's publication year
    full_ref += f'{doi}'                         # add the reference's DOI if available
    return full_ref


def _add_biblio_list_hal(in_path, out_path):
    """Adds a new column containing the full reference of each publication
    of the publications list with one row per author.

    The full reference is built through the `_set_full_ref_hal` function
    from the following items of the first row of each publication:
    title, first author, conference name, conference year, conference
    country, publication year and DOI.
    These items are got from the columns "Titres", "Premier auteur",
    "Conference", "Année de conférence", "Pays", "Année de publication"
    and "DOI", respectively. The DOI is omitted when it is missing or
    equal to `bp.UNKNOWN`.
    The name of the new column is given by 'pub_full_ref_alias'.
    The updated publications list, grouped by "Pub_id", is saved as Excel file.

    Args:
        in_path (path): Full path to the Excel file of the publications list.
        out_path (path): Full path for saving the modified publications list.
    Returns:
        (str): End message recalling the full path to the saved file \
        of the modified publications list.
    """
    # Setting useful aliases
    unknown_alias = bp.UNKNOWN
    pub_id_alias           = 'Pub_id'
    pub_first_author_alias = "Premier auteur"
    pub_year_alias         = "Année de publication"
    pub_year_conf_alias    = "Année de conférence"
    pub_conf_alias         = "Conference"
    pub_country_alias      = "Pays"
    pub_doi_alias          = "DOI"
    pub_title_alias        = "Titres"
    pub_full_ref_alias     = pg.COL_NAMES_BONUS['liste biblio']

    # Read of the excel file
    submit_df = pd.read_excel(in_path)

    # Splitting the frame into subframes with same Pub_id;
    # the subframes are gathered in a list and concatenated once at the end
    pub_frames_list = []
    for _, pub_id_group in submit_df.groupby(pub_id_alias):
        # Select the first row and build the full reference
        pub_id_first_row = pub_id_group.iloc[0]
        doi_value = pub_id_first_row[pub_doi_alias]
        doi = ""
        if not pd.isna(doi_value) and doi_value != unknown_alias:
            doi = ", " + str(doi_value)
        full_ref = _set_full_ref_hal(str(pub_id_first_row[pub_title_alias]),
                                     str(pub_id_first_row[pub_first_author_alias]),
                                     str(pub_id_first_row[pub_conf_alias]),
                                     str(pub_id_first_row[pub_year_conf_alias]),
                                     str(pub_id_first_row[pub_country_alias]),
                                     str(pub_id_first_row[pub_year_alias]),
                                     doi)

        # Working on a copy avoids writing into a slice of 'submit_df'
        pub_id_df = pub_id_group.copy()
        pub_id_df[pub_full_ref_alias] = full_ref
        pub_frames_list.append(pub_id_df)

    if pub_frames_list:
        conf_plus_full_ref_df = pd.concat(pub_frames_list)
    else:
        # No publication: the list is kept with an empty full-reference column
        conf_plus_full_ref_df = submit_df.copy()
        conf_plus_full_ref_df[pub_full_ref_alias] = pd.Series(dtype=object)

    conf_plus_full_ref_df.to_excel(out_path, index = False)

    end_message = f"Column with full reference of conference added in file: \n  '{out_path}'"
    return end_message

print("Cell run")

In [None]:
# Adding full article reference and saving new submit_df.
# The same path is given as input and output, so the submit file is overwritten;
# the end message returned by the function is not used.
_add_biblio_list_hal(hal_submit_file_path, hal_submit_file_path)

print("Cell run")

### Adding list of Institute authors with attributes

In [None]:
def _add_authors_name_list_hal(institute, org_tup, in_path, out_path):
    """Adds two columns to the dataframe got from the Excel file pointed by 'in_path'.

    The columns contain respectively the full name of each author as "NAME, Firstname" 
    and the institute co-authors list with attributes of each author in a string as follows:

        - "NAME1, Firstame1 (matricule,job type,department affiliation, \
        service affiliation,laboratoire affiliation);
        - NAME2, Firstame2 (matricule,job type,department affiliation, \
        service affiliation,laboratoire affiliation);
        - ...".

    Args:
        institute (str): The Intitute name.
        org_tup (tup): Contains Institute parameters.
        in_path (path): Fullpath of the excel file of the publications list \
        with a row per Institute author and their attributes columns.
        out_path (path): Fullpath of the processed dataframe as an Excel file \
        saved after going through its treatment.
    Returns:
        (str): End message recalling out_path.
    """
    # Internal functions
    def _get_dpt_key(dpt_raw):
        return_key = None
        for key, values in dpt_label_dict.items():
            if dpt_raw in values:
                return_key = key
        return return_key

    # Setting institute parameters
    dpt_label_dict = org_tup[1]

    # Setting useful aliases
    pub_id_alias         = "Pub_id"
    idx_authors_alias    = "Auteur_id"
    nom_alias            = "Nom"
    prenom_alias         = "Prénom"
    matricule_alias      = "Matricule"
    full_name_alias      = "Nom complet"
    author_type_alias    = "Type de l'auteur"
    full_name_list_alias = "Liste ordonnée des auteurs Liten"
    dept_alias           = "Dpt/DOB (lib court)"
    serv_alias           = "Service (lib court)"
    lab_alias            = "Laboratoire (lib court)"

    # Reading the excel file
    df_in = pd.read_excel(in_path)

    # Adding the column 'full_name_alias' that will be used to create the authors fullname list
    df_in[prenom_alias]    = df_in[prenom_alias].apply(lambda x: x.capitalize())
    df_in[full_name_alias] = df_in[nom_alias] + ', ' + df_in[prenom_alias]

    df_out = pd.DataFrame()
    for _, pub_id_df in df_in.groupby(pub_id_alias):
        raw_depts_list = pub_id_df[dept_alias].to_list()
        depts_list = [_get_dpt_key(x) for x in raw_depts_list]
        for dept in dpt_label_dict.keys():
            pub_id_df[dept] = 0
            if dept in depts_list:
                pub_id_df[dept] = 1
        
        authors_tup_list = sorted(list(set(zip(pub_id_df[idx_authors_alias],
                                               pub_id_df[full_name_alias],
                                               pub_id_df[matricule_alias],
                                               pub_id_df[author_type_alias],
                                               pub_id_df[dept_alias],
                                               pub_id_df[serv_alias],
                                               pub_id_df[lab_alias]))))

        authors_str_list = [(f'{x[1]} ({x[2]},'
                             f'{x[3]},{_get_dpt_key(x[4])},{x[5]},{x[6]})')
                            for x in authors_tup_list]
        authors_full_str = "; ".join(authors_str_list)
        pub_id_df[full_name_list_alias] = authors_full_str

        df_out = pd.concat([df_out, pub_id_df])

    # Saving 'df_out' in an excel file 'out_path'
    df_out.to_excel(out_path, index = False)

    end_message = f"Column with co-authors list is added to the file: \n  '{out_path}'"
    return end_message, df_out

print("Cell run")

In [None]:
# Adding list of Institute authors with attributes.
# The same path is given as input and output, so the submit file is overwritten;
# the end message is dropped and the returned dataframe replaces 'submit_df'.
_, submit_df = _add_authors_name_list_hal(institute, org_tup, hal_submit_file_path, hal_submit_file_path)

print("Cell run")

### Setting unique pub-ID

In [None]:
def _unique_pub_id(df, pub_id_col, year_col, shift):
    """Transforms the column 'Pub_id' of df y adding "yyyy_" 
    (year in 4 digits) to the values.

    Args:
        df (pandas.DataFrame): data that we want to modify.
    Returns:
        (pandas.DataFrame): df with its changed column.
    """
    year_df = df[year_col].iloc[0]

    def _rename_pub_id(old_pub_id, year):
        pub_id_str = str(int(old_pub_id) + shift)
        while len(pub_id_str)<3:
            pub_id_str = "0" + pub_id_str
        new_pub_id = str(int(year)) + '_' + pub_id_str
        return new_pub_id

    df[pub_id_col] = df[pub_id_col].apply(lambda x: _rename_pub_id(x, year_df))
    return df

print("Cell run")

In [None]:
# Setting unique pub-ID: the 'Pub_id' values are shifted and prefixed
# with the publication year (submit_df is modified in place and returned).
# NOTE: run this cell only once per loaded submit_df, otherwise
# the already renamed IDs are renamed again
shift = 500
submit_df = _unique_pub_id(
    submit_df,
    pub_id_col='Pub_id',
    year_col='Année de publication',
    shift=shift,
)

print("Cell run")

### Selecting columns for final conferences list

In [None]:
# Selecting columns for final list
final_col_list = [
    'Pub_id',
    'Année de publication',
    'Année de conférence',
    'Premier auteur',
    'Liste ordonnée des auteurs Liten',
    'Titres',
    'Conference',
    'Type de document',
    'DOI',
    'Référence bibliographique complète',
    'Pays',
    'DEHT',
    'DTCH',
    'DTNM',
    'DTS',
    'DIR',
    'Comite de lecture',
    'Acte de conference',
    'Lien url',
]
sub_submit_df = submit_df[final_col_list]

# Dropping the rows that are identical on the columns below
# (the first occurrence is kept, which is the pandas default)
dedup_col_list = [
    'Pub_id',
    'Premier auteur',
    'Année de conférence',
    'Conference',
    'Pays',
    'Type de document',
    'Comite de lecture',
    'Titres',
]
conf_list_df = sub_submit_df.drop_duplicates(subset=dedup_col_list)

# Renaming the titles column for the final list
final_conf_list_df = conf_list_df.rename(columns={'Titres': 'Titre'})

print("Cell run")

### Formatting and saving final conferences list as openpyxl workbook

In [None]:
# 3rd party imports
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows \
    as openpyxl_dataframe_to_rows
from openpyxl.utils import get_column_letter \
    as openpyxl_get_column_letter
from openpyxl.styles import Font as openpyxl_Font
from openpyxl.styles import PatternFill as openpyxl_PatternFill
from openpyxl.styles import Alignment as openpyxl_Alignment
from openpyxl.styles import Border as openpyxl_Border
from openpyxl.styles import Side as openpyxl_Side


def hal_set_col_attr():
    """Builds the attributes (width and horizontal alignment) of the columns
    of the final conferences list, to be used for formatting
    dataframes before openpyxl save.

    The returned dict also holds the key 'else' that gives
    the default attributes of the columns that are not listed.

    Args:
        None
    Returns:
        (tup): (dict keyed by the column names and valued by \
        [width, horizontal alignment] lists, including the 'else' key, \
        list of the column names that have specific attributes).
    """
    # One row per column: (column name, width, horizontal alignment)
    named_col_attr = (
        ('Pub_id', 20, "center"),
        ('Année de publication', 15, "center"),
        ('Année de conférence', 15, "center"),
        ('Premier auteur', 20, "center"),
        ('Liste ordonnée des auteurs Liten', 40, "left"),
        ('Titre', 40, "left"),
        ('Conference', 40, "left"),
        ('Type de document', 15, "center"),
        ('DOI', 20, "center"),
        ('Référence bibliographique complète', 55, "left"),
        ('Pays', 15, "center"),
        ('DEHT', 10, "center"),
        ('DTCH', 10, "center"),
        ('DTNM', 10, "center"),
        ('DTS', 10, "center"),
        ('DIR', 10, "center"),
        ('Comite de lecture', 10, "center"),
        ('Acte de conference', 15, "center"),
        ('Lien url', 50, "left"),
    )

    # The list of set columns does not include the 'else' key added afterwards
    col_set_list = [name for name, _, _ in named_col_attr]
    col_attr = {name: [width, align] for name, width, align in named_col_attr}
    col_attr['else'] = [15, "center"]

    return col_attr, col_set_list


def _hal_mise_en_page(df, wb = None):
    """Formats a worksheet of an openpyxl workbook using
    the columns attributes got through the `hal_set_col_attr` function.

    The rows of the dataframe (header included) are appended
    to the active worksheet, then the worksheet is formatted:
    alternate fill colors for the data rows, alignment and white
    side borders for each column, bold and centered wrapped text
    for the first row and the first column, columns width
    (except for the first column) and height of the first row.

    When the workbook wb is not None, this is applied
    to the active worksheet of the passed workbook.
    If the workbook wb is None, then the workbook is created.

    Args:
        df (dataframe): The dataframe to be formatted.
        wb (openpyxl workbook): Workbook of the worksheet \
        to be formatted (default = None).
    Returns:
        (tup): (workbook of the formatted worksheet (openpyxl workbook), \
        formatted active sheet).
    """

    # Setting columns attributes; the columns that have no specific
    # attributes use the 'else' ones
    col_attr, _ = hal_set_col_attr()
    default_attr = col_attr['else']
    columns_list = list(df.columns)

    # Setting list of cell colors
    cell_colors = [openpyxl_PatternFill(fgColor = pg.ROW_COLORS['odd'], fill_type = "solid"),
                   openpyxl_PatternFill(fgColor = pg.ROW_COLORS['even'], fill_type = "solid")]

    # Initialize wb as a workbook and ws its active worksheet
    if wb is None:
        wb = Workbook()
    ws = wb.active

    # Appending rows to ws and coloring alternately the data rows
    # (the header row of index 0 is not colored)
    ws_rows = openpyxl_dataframe_to_rows(df, index=False, header=True)
    for idx_row, row in enumerate(ws_rows):
        ws.append(row)
        if idx_row >= 1:
            cell_color = cell_colors[idx_row % 2]
            for cell in ws[ws.max_row]:
                cell.fill = cell_color

    # Setting the style objects that do not depend on the cell;
    # openpyxl styles are shared between cells, so one instance is enough
    white_side = openpyxl_Side(border_style='thick', color='FFFFFF')
    cell_border = openpyxl_Border(left=white_side, right=white_side)

    # Setting cell alignment and border, and columns width
    # using dict of column attributes col_attr
    for idx_col, col in enumerate(columns_list):
        col_width, col_align = col_attr.get(col, default_attr)
        column_letter = openpyxl_get_column_letter(idx_col + 1)
        col_alignment = openpyxl_Alignment(horizontal=col_align,
                                           vertical="center")
        for cell in ws[column_letter]:
            cell.alignment = col_alignment
            cell.border = cell_border

        # The width of the first column is kept at the worksheet default
        if idx_col >= 1:
            ws.column_dimensions[column_letter].width = col_width

    # Setting the format of the columns heading and of the first column;
    # this overrides the alignment set above for these cells
    heading_font = openpyxl_Font(bold=True)
    heading_alignment = openpyxl_Alignment(wrap_text=True, horizontal="center",
                                           vertical="center")
    for cell in ws['A'] + ws[1]:
        cell.font = heading_font
        cell.alignment = heading_alignment

    # Setting height of first row
    first_row_num = 1
    ws.row_dimensions[first_row_num].height = 50

    return wb, ws

print("Cell run")

In [None]:
# Formatting the final list as an openpyxl workbook
wb, _ = _hal_mise_en_page(final_conf_list_df)

# Saving the workbook in the final-results folder of the selected year
hal_results_path = year_hal_path / "3 - Résultats Finaux"
conf_list_file_name = f"Liste consolidée {year_select}_Conférences.xlsx"
conf_list_path = hal_results_path / conf_list_file_name
wb.save(conf_list_path)

print("Cell run")