# __8.2 Considering country impact__

Goal:
- Assess the impact of publications over time (1999-2020) across different:
  - Topics in 8.1
  - Countries

Approach
- The averaged impact metric is calculated for:
  - Each year, combining all countries
    - See `impact_overall.xlsx`
  - Each year, each country
    - See `impact_country.xlsx` for original values
    - See `impact_country_MOD.xlsx` for values normalized in two schemes
      - Against all country average each year
      - Against top 10 country average each year 

Thoughts
- In all metrics, GBR has consistently high impacts
  - Meanwhile, CHN and IND have lower-than-average impact throughout the years
    - Nonetheless, their impact in all areas is approaching the global average


## ___Setup___

### Module import

In conda env `base`

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from urllib import request
from time import sleep

### Key variables

In [107]:
# Seed kept for reproducibility
seed = 20220609

# Working directories: project root -> 8_impact -> 8_2_country (created if absent)
proj_dir = Path.home() / "projects/plant_sci_hist"
parent_dir = proj_dir / "8_impact"
work_dir = parent_dir / "8_2_country"
work_dir.mkdir(parents=True, exist_ok=True)

# Plant science corpus with topic assignment info
dir42 = proj_dir / "4_topic_model/4_2_outlier_assign"
corpus_file = dir42 / "table4_2_corpus_with_topic_assignment.tsv.gz"
#corpus_file = dir42 / "test.tsv"

# Country info per pmid, and per-country publication counts
dir75 = proj_dir / "7_countries/7_5_country_over_time"
ci_file = dir75 / "ci_pmid_topic.tsv"
c_npub_file = dir75 / "country_npub_raw.csv"

# pdjity table (pmid, date, journal, issn, topic, year)
pdjity_file = parent_dir / "table_pdjity.tsv"

# Font settings so that text in saved PDFs is stored in a usable format
# (plt.rcParams and mpl.rcParams are the same object)
mpl.rcParams["pdf.fonttype"] = 42
mpl.rcParams["font.family"] = "sans-serif"

## ___Country, ISSN, and SJR metric data___

### Read country, pmid, topic dataframe

In [7]:
# Country table read from the tab-separated ci_file; the first column is used
# as the index. Columns: A3, confidence, date, topic, year
df_acdty = pd.read_csv(ci_file, sep='\t', index_col=0)
df_acdty.shape

(330328, 5)

In [8]:
# Preview the first two rows
df_acdty.head(2)

Unnamed: 0_level_0,A3,Confidence,Date,Topic,Year
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
400957,CAN,3,1978-01-01,50,1978
1279107,FRA,3,1992-11-01,12,1992


### Read country count to get ranking

In [108]:
# Per-country publication counts; the order of the country column is reused
# later to order the rows of the output sheets
df_npub = pd.read_csv(c_npub_file)
df_npub.head(2)

Unnamed: 0,country,n_pub
0,CHN,60298
1,USA,59503


### Read pdjity table

pmid, date, journal, issn, topic, year

In [11]:
# pdjity table read from the tab-separated pdjity_file; the first column is
# used as the index
df_pdjity = pd.read_csv(pdjity_file, sep='\t', index_col=0)
df_pdjity.shape

(421307, 5)

In [12]:
# Preview the first two rows
df_pdjity.head(2)

Unnamed: 0_level_0,Date,Journal,ISSN,Topic,Year
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
61,1975-12-11,Biochimica et biophysica acta,6300218782434,52,1975
67,1975-11-20,Biochimica et biophysica acta,6300218782434,48,1975


### Add ISSN to df_acdty

In [30]:
# Index labels of df_acdty as a plain list, used below to look up ISSNs
pmid_acdty = df_acdty.index.tolist()

# Spot check: count, first five, last five
len(pmid_acdty), pmid_acdty[:5], pmid_acdty[-5:]


(330328,
 [400957, 1279107, 1279650, 1280064, 1280162],
 [11277426, 28674549, 29736697, 28307190, 17175550])

In [33]:
# ISSN entry of every label in pmid_acdty, in that order
issns = df_pdjity.loc[pmid_acdty, 'ISSN']

# Spot check: count, first five, last five
len(issns), issns.head(5), issns.tail(5)

(330328,
 PMID
 400957              03050491
 1279107    00221317,14652099
 1279650    07248741,1573904X
 1280064    00039861,10960384
 1280162    00063002,18782434
 Name: ISSN, dtype: object,
 PMID
 11277426             08940282
 28674549             1664462X
 29736697    13403443,18610293
 28307190    00298549,14321939
 17175550    00220957,14602431
 Name: ISSN, dtype: object)

In [34]:
# Add issns to df_acdty as a new ISSN column; issns carries the index labels
# taken from df_acdty, and pandas matches the values to rows by those labels
df_acdty['ISSN'] = issns
df_acdty.head(2)

Unnamed: 0_level_0,A3,Confidence,Date,Topic,Year,ISSN
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
400957,CAN,3,1978-01-01,50,1978,3050491
1279107,FRA,3,1992-11-01,12,1992,22131714652099


### Read SJR metric dictionary

A dictionary with year as key, a dictionary as value
- d_d_metric = {year:{ISSN:[4 metrics], ...}, ...}

In [40]:
# Pickled metric dictionary produced elsewhere in this project.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# load a file you generated yourself.
file_d_d_metric = parent_dir / 'sjr_metric_dicts.pkl'
with file_d_d_metric.open('rb') as pkl_handle:
    d_d_metric = pickle.load(pkl_handle)

In [97]:
# Years available in the metric dictionary
d_d_metric.keys()

dict_keys([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

## ___Country impact___

### Function for getting list of impact values

Modified from 8.1

In [82]:
# NOTE(review): this cell was originally tagged "Deprecated, issue with list",
# but get_m_lst_lst still calls get_m_list, so it is documented as live code.
def get_m_list(pmid, issn, d_metric):
  '''Look up the metrics of one record and average them over its ISSNs

  Args:
    pmid: record identifier; not used in the lookup, kept so that existing
      callers do not need to change
    issn (str or float): comma-separated ISSN string, or NaN when missing
    d_metric (dict): {issn: [4 metrics]} for a single year
  Return:
    m_list (list): metric lists of the ISSN tokens found in d_metric, in
      token order
    m_list2 (list): the four per-metric means over m_list, [] if no token
      was found
  '''

  # isinstance, unlike type(issn) == float, also catches numpy float scalars
  # such as np.float64('nan'), which would otherwise reach .split() and fail
  if isinstance(issn, float):
    if not np.isnan(issn):
      print("ERR: float but not nan", issn)
    tokens = []
  else:
    tokens = issn.split(",")

  # keep only the ISSN tokens that have metrics in this year's dictionary
  m_list = [d_metric[token] for token in tokens if token in d_metric]

  # average each of the four metrics over the ISSNs that were found
  m_list2 = []
  if m_list:
    n_found = len(m_list)
    m_list2 = [sum(ms[idx] for ms in m_list) / n_found for idx in range(4)]

  return m_list, m_list2

In [95]:
def get_m_lst_lst(a3, yr):
  '''Collect the metrics of the records of one country in one year
  Args:
    a3 (str): country a3 code, if "", then records of all countries are used
    yr (int): year, must be a key of d_d_metric
  Return:
    m_lst_lst (list): [m_lst], where m_lst = [pmid, issn, m_list2, m_list],
      with m_list2 and m_list being the two values returned by get_m_list.
      Records for which get_m_list finds no ISSN are left out.
  '''

  # Rows of the requested year, optionally restricted to one country
  year_mask = df_acdty['Year'] == yr
  if a3 == "":
    sub_df = df_acdty.loc[year_mask]
  else:
    sub_df = df_acdty.loc[(df_acdty['A3'] == a3) & year_mask]

  # {issn: [Prank, SJR, Hidx, Cite]} of the requested year
  year_metrics = d_d_metric[yr]

  records = []
  for rec_pmid, rec_issn in zip(sub_df.index.values, sub_df['ISSN'].values):
    found, averaged = get_m_list(rec_pmid, rec_issn, year_metrics)

    # skip records without any matching ISSN
    if found == []:
      continue

    # only the averaged metrics are needed, the rest is kept for debugging
    records.append([rec_pmid, rec_issn, averaged, found])

  return records

### Go through each country and year

In [91]:
# Distinct country codes, in order of first appearance in df_acdty
c_list = df_acdty['A3'].unique().tolist()
len(c_list)

165

In [98]:
c_y_avg = {} # {country: {year: array([Prank, SJR, Hidx, Cite])}}
yr_range = range(1999, 2021)

# For each country
for a3 in tqdm(c_list):
  c_y_avg[a3] = {}

  # For each year
  for yr in yr_range:
    # one [prank, sjr, hidx, cite] row per record in this country-year
    m_2d = np.array([m_lst[2] for m_lst in get_m_lst_lst(a3, yr)],
                    dtype=float)

    # No record with journal metrics at all: every average is undefined
    if m_2d.size == 0:
      c_y_avg[a3][yr] = np.full(4, np.nan)
      continue

    # number of non-NaN values per metric, i.e., the denominator of each mean
    n_pub = np.sum(~np.isnan(m_2d), axis=0)
    m_sum = np.nansum(m_2d, axis=0)

    # Divide only where there is something to average. A metric without any
    # valid value stays NaN on its own instead of turning all four metrics
    # into NaN, and no division-by-zero RuntimeWarning is raised.
    c_y_avg[a3][yr] = np.divide(m_sum, n_pub, out=np.full(4, np.nan),
                                where=n_pub > 0)
    

  0%|          | 0/165 [00:00<?, ?it/s]

100%|██████████| 165/165 [00:47<00:00,  3.51it/s]


In [100]:
# Spot check: averaged [Prank, SJR, Hidx, Cite] of two countries in 2020
c_y_avg['CHN'][2020], c_y_avg['USA'][2020]

(array([  0.79310982,   1.60306774, 177.93186373,   4.74823425]),
 array([  0.84167973,   2.20986465, 207.40929878,   5.10214558]))

### Generate output

Sorted according to:
- df_npub: country - number of pubs
- Decided not to normalize within each country, since the point is between-country comparison.

In [112]:
excel_file   = work_dir / 'impact_country.xlsx'
metric_names = ['prank', 'sjr', 'hidx', 'cite']
c_order      = df_npub['country'].values

# One sheet per metric. The with-block closes the writer, and with it the
# workbook, even when writing one of the sheets fails.
with pd.ExcelWriter(excel_file, engine='xlsxwriter') as excel_writer:
  for metric_idx, metric_nm in enumerate(metric_names):
    # a 2D list: a3, then year
    metric_2d = [[c_y_avg[a3][yr][metric_idx] for yr in yr_range]
                 for a3 in c_list]
    df_metric = pd.DataFrame(metric_2d, index=c_list, columns=yr_range)

    # No per-country normalization: the sheets are meant for between-country
    # comparison, so the averaged values are written as they are.

    # sort by the order of the number of publications
    df_metric = df_metric.reindex(c_order)

    print(metric_nm, df_metric.shape)

    df_metric.to_excel(excel_writer, sheet_name=metric_nm)
  

prank (165, 22)
sjr (165, 22)
hidx (165, 22)
cite (165, 22)


## ___Overall impact___

I realized that the impact per year over all countries should also be calculated, so that it can potentially be used as a normalizing factor.

### Get yearly average impact

In [103]:
y_avg = {} # {year: array([Prank, SJR, Hidx, Cite])}

# For each year
for yr in tqdm(yr_range):
  # one [prank, sjr, hidx, cite] row per record of any country in this year
  m_2d = np.array([m_lst[2] for m_lst in get_m_lst_lst("", yr)], dtype=float)

  # No record with journal metrics at all: every average is undefined
  if m_2d.size == 0:
    y_avg[yr] = np.full(4, np.nan)
    continue

  # number of non-NaN values per metric, i.e., the denominator of each mean
  n_pub = np.sum(~np.isnan(m_2d), axis=0)
  m_sum = np.nansum(m_2d, axis=0)

  # Divide only where there is something to average. A metric without any
  # valid value stays NaN on its own instead of turning all four metrics into
  # NaN, and no division-by-zero RuntimeWarning is raised.
  y_avg[yr] = np.divide(m_sum, n_pub, out=np.full(4, np.nan),
                        where=n_pub > 0)
    

100%|██████████| 22/22 [00:02<00:00,  9.37it/s]


### Generate output

In [105]:
excel_file2  = work_dir / 'impact_overall.xlsx'
metric_names = ['prank', 'sjr', 'hidx', 'cite']

# a 2D list: year, then metrics
metric_2d = [y_avg[yr] for yr in yr_range]
df_metric = pd.DataFrame(metric_2d, index=yr_range, columns=metric_names)

# The with-block closes the writer, and with it the workbook, even when
# writing the sheet fails
with pd.ExcelWriter(excel_file2, engine='xlsxwriter') as excel_writer2:
  df_metric.to_excel(excel_writer2, sheet_name="yearly overall")

## ___Test___

### Deprecated functions

In [None]:
# Replaced (presumably by get_m_lst_lst together with get_m_list - confirm)
def get_m_lst_lst_v1(a3, yr):
  '''Collect the metrics of the records of one country in one year
  Args:
    a3 (str): country a3 code
    yr (int): year, must be a key of d_d_metric
  Return:
    m_lst_lst (list): [m_lst], where m_lst = [pmid, issn tokens, m_list2,
      m_list]; m_list holds the metric list of every ISSN token found for
      that year and m_list2 the four per-metric means over m_list. Records
      without any matching ISSN are left out.
  '''

  df        = df_acdty.loc[(df_acdty['A3']==a3) & (df_acdty['Year']==yr)]
  d_metric  = d_d_metric[yr] # {issn: [Prank, SJR, Hidx, Cite]}

  m_lst_lst = []  # [m_lst]
  for pmid, issn in zip(df.index.values, df['ISSN'].values):

    # isinstance, unlike type(issn) == float, also catches numpy float
    # scalars such as np.float64('nan'), which would otherwise reach .split()
    if isinstance(issn, float):
      if not np.isnan(issn):
        print("ERR: float but not nan", issn)
      issn = []
    else:
      issn = issn.split(",")

    # keep only the ISSN tokens that have metrics in this year's dictionary
    m_list = [d_metric[token] for token in issn if token in d_metric]

    if m_list:
      # average each metric over the ISSNs found; the comprehension variable
      # no longer reuses the name of the outer loop index
      m_list2 = [sum(ms[m_idx] for ms in m_list) / len(m_list)
                 for m_idx in range(4)]

      # need m_list2, but add more info for debugging
      m_lst_lst.append([pmid, issn, m_list2, m_list])

  return m_lst_lst

In [None]:
# Replaced (presumably by get_m_lst_lst together with get_m_list - confirm)
def get_m_lst_lst_v1(a3, yr):
  '''Collect the metrics of the records of one country in one year
  Args:
    a3 (str): country a3 code
    yr (int): year, must be a key of d_d_metric
  Return:
    m_lst_lst (list): [m_lst], where m_lst = [pmid, issn tokens, m_list2,
      m_list]; m_list holds the metric list of every ISSN token found for
      that year and m_list2 the four per-metric means over m_list. Records
      without any matching ISSN are left out.
  '''

  df        = df_acdty.loc[(df_acdty['A3']==a3) & (df_acdty['Year']==yr)]
  d_metric  = d_d_metric[yr] # {issn: [Prank, SJR, Hidx, Cite]}

  m_lst_lst = []  # [m_lst]
  for pmid, issn in zip(df.index.values, df['ISSN'].values):

    # isinstance, unlike type(issn) == float, also catches numpy float
    # scalars such as np.float64('nan'), which would otherwise reach .split()
    if isinstance(issn, float):
      if not np.isnan(issn):
        print("ERR: float but not nan", issn)
      issn = []
    else:
      issn = issn.split(",")

    # keep only the ISSN tokens that have metrics in this year's dictionary
    m_list = [d_metric[token] for token in issn if token in d_metric]

    if m_list:
      # average each metric over the ISSNs found; the comprehension variable
      # no longer reuses the name of the outer loop index
      m_list2 = [sum(ms[m_idx] for ms in m_list) / len(m_list)
                 for m_idx in range(4)]

      # need m_list2, but add more info for debugging
      m_lst_lst.append([pmid, issn, m_list2, m_list])

  return m_lst_lst

In [None]:
# Replaced (presumably by get_m_lst_lst together with get_m_list - confirm)
def get_m_lst_lst_v1(a3, yr):
  '''Collect the metrics of the records of one country in one year
  Args:
    a3 (str): country a3 code
    yr (int): year, must be a key of d_d_metric
  Return:
    m_lst_lst (list): [m_lst], where m_lst = [pmid, issn tokens, m_list2,
      m_list]; m_list holds the metric list of every ISSN token found for
      that year and m_list2 the four per-metric means over m_list. Records
      without any matching ISSN are left out.
  '''

  df        = df_acdty.loc[(df_acdty['A3']==a3) & (df_acdty['Year']==yr)]
  d_metric  = d_d_metric[yr] # {issn: [Prank, SJR, Hidx, Cite]}

  m_lst_lst = []  # [m_lst]
  for pmid, issn in zip(df.index.values, df['ISSN'].values):

    # isinstance, unlike type(issn) == float, also catches numpy float
    # scalars such as np.float64('nan'), which would otherwise reach .split()
    if isinstance(issn, float):
      if not np.isnan(issn):
        print("ERR: float but not nan", issn)
      issn = []
    else:
      issn = issn.split(",")

    # keep only the ISSN tokens that have metrics in this year's dictionary
    m_list = [d_metric[token] for token in issn if token in d_metric]

    if m_list:
      # average each metric over the ISSNs found; the comprehension variable
      # no longer reuses the name of the outer loop index
      m_list2 = [sum(ms[m_idx] for ms in m_list) / len(m_list)
                 for m_idx in range(4)]

      # need m_list2, but add more info for debugging
      m_lst_lst.append([pmid, issn, m_list2, m_list])

  return m_lst_lst