# __Step 6__

Extract author-institution information over time:
- Particularly for HBCUs

## ___Set up___

### Module import

In [None]:
import pickle, re, multiprocessing
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix, dok_matrix
from time import time
from collections import OrderedDict

### Key variables

In [None]:
# Fixed seed for reproducibility (use sites, if any, are outside this section)
seed = 20220609

# Project and step-specific working directories; the working directory is
# created (including missing parents) if it does not exist yet
proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "6_institution_over_time/"
work_dir.mkdir(parents=True, exist_ok=True)

# Figure settings: pdf.fonttype 42 makes matplotlib embed TrueType fonts in
# saved PDFs, and the default font family is set to sans-serif
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams["font.family"] = "sans-serif"

## ___Functions___

In [None]:
def split_records(pubmed_file):
  '''Split a PubMed-format text file into one string per record.

  The first line of the file is always discarded. The remaining lines are
  grouped into records separated by blank (whitespace-only) lines. Runs of
  blank lines and a blank line at the end of the file do not produce empty
  records, and an empty file yields an empty list.

  Args:
    pubmed_file (str or Path): path of the file to read
  Return:
    records (list): a list of strings; each string holds the lines of one
      record with their line endings kept
  '''
  records = []  # finished records
  current = []  # lines of the record being assembled

  with open(pubmed_file, 'r') as f:
    # Drop the first line; the None default copes with an empty file
    next(f, None)
    for line in f:
      if line.strip():
        current.append(line)
      elif current:
        # A blank line closes the record. Checking `current` skips repeated
        # separators so no empty record is emitted.
        records.append("".join(current))
        current = []

  # The last record has no closing separator when the file does not end with
  # a blank line
  if current:
    records.append("".join(current))

  print("Number of records=", len(records))

  return records

In [None]:
def get_tag_info(records):
  '''Find, for every tag, the largest number of times it occurs in one record.

  A line counts as a tag line when it is longer than 5 characters and has "-"
  at index 4; its tag is the first 4 characters (padding spaces included).

  Args:
    records (list): a list of record strings with lines separated by "\\n"
  Return:
    tag_max (dict): {tag: max_count_in_any_single_record}
  '''
  tag_max = {}

  for rec in records:
    # Tally the tags of this record only
    seen = {}
    for text in rec.split('\n'):
      is_tag_line = len(text) > 5 and text[4] == "-"
      if not is_tag_line:
        continue
      key = text[:4]
      seen[key] = seen.get(key, 0) + 1

    # Raise the running maximum wherever this record beats it; a tag not seen
    # before compares against 0 and is therefore always stored
    for key, n_local in seen.items():
      if tag_max.get(key, 0) < n_local:
        tag_max[key] = n_local

  return tag_max

In [None]:
def get_rec_dict(records):
  '''Parse a list of records to get FAU (full author), AU, AD (address), TA
    (journal title), PMID, grouped by FAU.

  Only the first line of each tagged field is read; continuation lines of
  multi-line values are ignored. AU and AD lines are attributed to the most
  recent FAU line of the same record and are skipped if no FAU precedes them.

  Args:
    records (list): a list of strings from split_records
  Return:
    rec_dict (dict): {FAU: [{AU:1}, {AD:1}, {TA:count}, {PMID:1}]} where the
      TA count is incremented each time a record listing that FAU reaches its
      TA line, and the PMID dict gets one key for every record listing that
      FAU
  Raises:
    ValueError: if the value of a PMID line is not an integer
  '''

  rec_dict = {}

  for record in tqdm(records):
    fau      = None # most recent FAU of this record, None until one is seen
    pmid     = ""   # PMID of this record, "" if no PMID line came first
    fau_list = []   # all FAU values of this record, used when TA is reached

    for line in record.split('\n'):
      # A tag line is a 4-character tag followed by "-" at index 4. At least
      # 5 characters are required so line[4] cannot raise IndexError.
      if len(line) < 5 or line[4] != "-":
        continue

      tag = line[:4]
      val = line[6:] # tag value

      if tag == "PMID":
        pmid = int(val)
      elif tag == "FAU ":
        fau = val
        fau_list.append(fau)
        # Initialize the four empty dicts the first time this FAU is seen
        entry = rec_dict.setdefault(fau, [{}, {}, {}, {}])
        # Record the PMID for every record of this FAU, not only the first
        entry[3][pmid] = 1
      elif tag == "AU  ":
        if fau is not None:
          rec_dict[fau][0][val] = 1
      elif tag == "AD  ":
        if fau is not None:
          rec_dict[fau][1][val] = 1
      # When getting to TA, FAU/AU/AD info is already in, so go through all
      # FAUs of this record and add the TA (title abbreviated) info
      elif tag == "TA  ":
        for a_fau in fau_list:
          ta_counts = rec_dict[a_fau][2]
          ta_counts[val] = ta_counts.get(val, 0) + 1

  return rec_dict

## ___Test case___

In [None]:
# Load the test file from the working directory, split it into records, and
# collect the per-record maximum count of each tag
pubmed_file = work_dir / "pubmed-test.out"
records     = split_records(pubmed_file)
tags_global = get_tag_info(records)

# Print each tag with the largest number of times it occurs in one record
for tag in tags_global:
  print(tag, tags_global[tag])

In [12]:
# Build the per-author dictionary from every record in `records` (as loaded
# above from the test file)
rec_dict = get_rec_dict(records)