In [2]:
# =============================================
# IMPORT LIBRARIES
# =============================================

import numpy as np
import pandas as pd
import os
os.chdir('..')
import pickle
import json

from citations_lib.create_fig_helper_functions import *
from citations_lib.utils import *
from citations_lib.author_vs_group_layout import *
from citations_lib.group_vs_group_layout import *
from citations_lib.author_vs_author_layout import *
from citations_lib.metric_tab_layout import *

import dash
import dash.html as html
import dash_leaflet as dl
from jupyter_dash import JupyterDash
#from dash_extensions.javascript import arrow_function, assign, Namespace
import country_converter as coco
cc = coco.CountryConverter()

# =============================================
# Data Preparation
# =============================================

# Unpack the eight collections returned by load_standardized_data, in the order it returns them.
(
    dfs_career,
    dfs_singleyr,
    dfs_career_log,
    dfs_singleyr_log,
    dfs_career_text,
    dfs_singleyr_text,
    dfs_career_yrs,
    dfs_singleyr_yrs,
) = load_standardized_data('/Users/agah/Desktop/neuropoly/no_cite-isfaction/data/')

In [3]:
import numbers

def sum_stuff(stuff):
    """Summarise a sequence of numbers as ``[min, q1, median, q3, max, count]``.

    Parameters
    ----------
    stuff : sequence of numbers
        Values to summarise. Must support ``len()``.

    Returns
    -------
    list
        Six entries: minimum, 25th/50th/75th percentiles (as computed by
        ``np.percentile``), maximum, and the number of input values.
        For an empty input the five statistics are NaN and the count is 0.
    """
    count = len(stuff)
    if count == 0:
        # min()/max() raise on zero-size arrays; report "no data" explicitly
        # so one empty group does not abort a whole aggregation run.
        return [np.nan] * 5 + [0]

    values = np.array(stuff)
    q1, median, q3 = np.percentile(values, [25, 50, 75])
    return [values.min(), q1, median, q3, values.max(), count]


# NOTE(review): presumably one dataset version number per loaded frame
# (five career frames, four single-year frames) - TODO confirm.
career_versions = [1, 1, 2, 3, 5]
singleyr_versions = [1, 2, 3, 5]

def process_data_by_country(dfs, dfs_log, aggmetric, type):
    """Summarise every citation metric per group and per year.

    Despite the name, the grouping column is whatever ``aggmetric`` names
    (the notebook calls this with 'cntry', 'sm-field' and 'inst_name').

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One frame per year, in the same order as the year labels below.
    dfs_log : sequence of pandas.DataFrame
        Frames paired positionally with ``dfs``; summarised under the
        ``*_log`` keys.
    aggmetric : str
        Name of the column to group by. Rows where it is missing are ignored.
    type : str
        ``'career'`` selects the career year labels and key prefix; any other
        value selects the single-year ones. (The name shadows the builtin but
        is kept so existing callers keep working.)

    Returns
    -------
    dict
        ``result[group][f'{prefix}_{year}'][metric]`` and
        ``result[group][f'{prefix}_{year}_log'][metric]``, each holding the
        six-element list produced by ``sum_stuff``. Groups appear in order of
        first appearance in the data.
    """
    if type == 'career':
        yrs = [2017, 2018, 2019, 2020, 2021]
        prefix = 'career'
    else:
        yrs = [2017, 2019, 2020, 2021]
        prefix = 'singleyr'

    # Columns to summarise: each base metric, its ' (ns)' variant, plus two
    # extras. Built once (it does not depend on the year) and kept as a set so
    # the per-column membership test is O(1).
    base_metrics = ['rank', 'c', 'nc', 'h', 'hm', 'ncs', 'ncsf', 'ncsfl', 'nps', 'cpsf', 'npsfl', 'npciting']
    wanted_metrics = set(base_metrics)
    wanted_metrics.update(name + ' (ns)' for name in base_metrics)
    wanted_metrics.update(['np', 'self%'])

    result_dict = {}

    for yr, (df, df_log) in enumerate(zip(dfs, dfs_log)):
        # Rows without a grouping value cannot be attributed to any group.
        df = df.dropna(subset=[aggmetric])
        df_log = df_log.dropna(subset=[aggmetric])

        # One pass over each frame instead of one boolean-mask scan per group.
        # observed=True keeps categorical columns from yielding empty groups.
        grouped = df.groupby(aggmetric, sort=False, observed=True)
        grouped_log = df_log.groupby(aggmetric, sort=False, observed=True)
        log_group_labels = grouped_log.groups
        no_log_rows = df_log.iloc[0:0]  # stand-in when a group has no rows in df_log

        print(grouped.ngroups)
        curkey = f'{prefix}_{yrs[yr]}'
        curkey_log = f'{prefix}_{yrs[yr]}_log'
        print(curkey)

        # Only columns actually present in this year's frame are summarised.
        metrics = [column for column in df.columns if column in wanted_metrics]

        for field, filtered_data in grouped:
            if field in log_group_labels:
                filtered_data_log = grouped_log.get_group(field)
            else:
                filtered_data_log = no_log_rows

            per_field = result_dict.setdefault(field, {})
            summary = per_field.setdefault(curkey, {})
            summary_log = per_field.setdefault(curkey_log, {})

            for metric in metrics:
                summary.setdefault(metric, []).extend(
                    sum_stuff(filtered_data[metric].tolist()))
                summary_log.setdefault(metric, []).extend(
                    sum_stuff(filtered_data_log[metric].tolist()))

    print("DONE")
    return result_dict

# Example usage:



## Career aggregate data

In [10]:
# COUNTRY
result_dict_cntry = process_data_by_country(dfs_career, dfs_career_log, 'cntry','career')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/career_aggregate_cntry.p", "wb") as fp:
    pickle.dump(result_dict_cntry, fp)

In [31]:
# FIELD
result_dict_field = process_data_by_country(dfs_career, dfs_career_log, 'sm-field','career')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/career_aggregate_field.p", "wb") as fp:
    pickle.dump(result_dict_field, fp)

In [32]:
# INSTITUTION (TAKES LONGER)
result_dict_inst = process_data_by_country(dfs_career, dfs_career_log, 'inst_name','career')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/career_aggregate_inst2.p", "wb") as fp:
    pickle.dump(result_dict_inst, fp)

## Single-year (`singleyr`) aggregate data

In [33]:
# INSTITUTION (TAKES LONGER)
result_dict_inst = process_data_by_country(dfs_singleyr, dfs_singleyr_log, 'inst_name','singleyr')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/singleyr_aggregate_inst.p", "wb") as fp:
    pickle.dump(result_dict_inst, fp)

In [34]:
# COUNTRY
# NOTE(review): the variable name says "inst" but this holds the per-country
# aggregate; name kept in case later cells read it.
result_dict_inst = process_data_by_country(dfs_singleyr, dfs_singleyr_log, 'cntry','singleyr')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/singleyr_aggregate_cntry.p", "wb") as fp:
    pickle.dump(result_dict_inst, fp)

In [35]:
# FIELD
# NOTE(review): the variable name says "inst" but this holds the per-field
# aggregate; name kept in case later cells read it.
result_dict_inst = process_data_by_country(dfs_singleyr, dfs_singleyr_log, 'sm-field','singleyr')
# Context manager guarantees the pickle file is flushed and closed.
with open("/Users/agah/Desktop/neuropoly/no_cite-isfaction/singleyr_aggregate_field.p", "wb") as fp:
    pickle.dump(result_dict_inst, fp)

# Generating `.pkl` files that cache aggregate info (to speed up loading)

### Resources: dash leaflet
* Map from tutorial [here](https://github.com/WestHealth/pydataglobal-2021)
* Other cool library [here](https://github.com/plotly/dash-deck)

### Resources: custom GeoJSON Map
* Make custom.geo.json from [here](https://geojson-maps.ash.ms/)
* Code below based on [this](https://towardsdatascience.com/how-to-create-outstanding-custom-choropleth-maps-with-plotly-and-dash-49ac918a5f05)

In [10]:
def create_info_dict(career, yr, out_dir='/Users/agah/Desktop/neuropoly/no_cite-isfaction/data/aggregate_info'):
    """Pickle the name lists and basic metric statistics for one dataset.

    Reads the module-level ``dfs_career`` / ``dfs_singleyr`` lists without
    modifying them.

    Parameters
    ----------
    career : bool
        True to use ``dfs_career``, False to use ``dfs_singleyr``.
    yr : int
        Position of the frame inside the selected list (an index, not a
        calendar year). Also used in the output file name.
    out_dir : str, optional
        Directory the pickle is written to. Defaults to the location the
        notebook has always used.

    Returns
    -------
    None
        Writes ``{out_dir}/info_new_{career|singleyr}_{yr}.pkl`` holding a
        dict with ``'<metric> min/max/mean/std'`` entries and the keys
        ``'authfull'``, ``'cntry'``, ``'inst_name'``, ``'sm-field'`` and
        ``'cntry_full'``.
    """
    metrics = ['nc', 'h', 'hm',  'ncs', 'ncsf','ncsfl', 'nc (ns)', 'h (ns)', 'hm (ns)',  'ncs (ns)', 'ncsf (ns)','ncsfl (ns)', 'self%']

    # Only read from the shared frame. Assigning stringified columns back into
    # it would turn missing values into the literal 'nan' for every other cell
    # that uses the same DataFrame.
    df = (dfs_career if career else dfs_singleyr)[yr]

    def _sorted_titles(column):
        # str() first so non-string entries (e.g. NaN) can be sorted with the rest;
        # sort on the raw strings, then title-case for display.
        return [name.title() for name in sorted(df[column].apply(str).unique())]

    author_names = _sorted_titles('authfull')
    field_names = _sorted_titles('sm-field')
    institution_names = _sorted_titles('inst_name')

    # Country codes (lower-cased), with missing values and 'csk' dropped and a
    # few legacy codes folded into current ones.
    # NOTE(review): 'scg' -> 'cze' looks suspicious (ISO 3166-3 lists SCG as
    # Serbia and Montenegro and CSK as Czechoslovakia) - confirm before changing.
    dropped_codes = {'nan', 'csk'}
    replaced_codes = {'sux': 'rus', 'ant': 'nld', 'scg': 'cze'}
    raw_codes = {str(code).lower() for code in df['cntry'].apply(str).unique()}
    country_names = sorted(
        {replaced_codes.get(code, code) for code in raw_codes if code not in dropped_codes}
    )

    # Same order as country_names, so do NOT sort these alphabetically!
    country_names_full = coco.convert(names=country_names, to='name_short')

    # Dictionary with all of this information: basic stats per metric first,
    # then the name lists.
    info = dict()
    for metric in metrics:
        column = df[metric]
        info[metric + ' min'] = int(column.min())
        info[metric + ' max'] = int(column.max())
        info[metric + ' mean'] = round(column.mean(),2)
        info[metric + ' std'] = round(column.std(),2)
    info['authfull'] = author_names
    info['cntry'] = country_names
    info['inst_name'] = institution_names
    info['sm-field'] = field_names
    info['cntry_full'] = country_names_full

    # Save to disk.
    f_out = 'career' if career else 'singleyr'
    with open(f'{out_dir}/info_new_{f_out}_{yr}.pkl', 'wb') as fp:
        pickle.dump(info, fp)
        print('dictionary saved successfully to file')
        
# Write one info pickle per frame: indices 0-3 of the single-year data first,
# then indices 0-4 of the career data.
for use_career, n_frames in ((False, 4), (True, 5)):
    for idx in range(n_frames):
        create_info_dict(career = use_career, yr = idx)

dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
dictionary saved successfully to file
