## ___Set up___

In [9]:
import pickle, umap
import numpy as np
import pandas as pd
#import umap.plot
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib import colors

In [3]:
# Project root; every input and output location below is derived from it.
proj_dir     = Path.home() / "projects/plant_sci_hist"

# Step 4_2 directory holding the pickled `probs` object and the corpus tables.
dir42        = proj_dir / "4_topic_model/4_2_outlier_assign"
prob_file    = dir42 / "probs.pickle"

# Two candidate corpus tables. The plotting is based on the topic assignments
# that come with the prob file, so the corpus used here must have the same
# number of records as the prob file. Which of the two corresponds to it is
# checked after loading.
corpus_file1 = dir42 / "table4_2_corpus_with_topic_assignment.tsv.gz"
corpus_file2 = dir42 / "table7_5_corpus_with_topic_assignment_nodup.tsv.gz"

# Pickled UMAP mapper object from step 4_6.
dir46       = proj_dir / "4_topic_model/4_6_umap"
mapper_file = dir46 / '_umap_nn40/mapper_topics_all.pickle'

# Working directory of this notebook. parents=True so that a missing
# "9_wrap_up" parent does not make the cell fail on a fresh checkout.
work_dir    = proj_dir / "9_wrap_up/9_4_umap"
work_dir.mkdir(parents=True, exist_ok=True)

# Directory the per-year figures are written to. It has to exist before
# plt.savefig is called, because savefig does not create missing directories.
mapper_dir = work_dir / "_umap_nn40_year"
mapper_dir.mkdir(parents=True, exist_ok=True)

## ___Get the year array___

In [5]:
# first check the size of prob, corpus file 1 and corpus file 2
# Load the prob object and both candidate corpus tables so their sizes can be
# compared.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files produced by this project.
with open(prob_file, 'rb') as f:
    probs = pickle.load(f)

corpus1 = pd.read_csv(corpus_file1, sep='\t', compression='gzip')
corpus2 = pd.read_csv(corpus_file2, sep='\t', compression='gzip')

# The year array used for plotting is built from corpus1, so its records must
# line up one-to-one with probs. Enforce that instead of only eyeballing the
# shapes printed below.
assert probs.shape[0] == corpus1.shape[0], (
    f"probs has {probs.shape[0]} records but corpus1 has {corpus1.shape[0]}")

# Displayed for reference: sizes of probs, corpus file 1 and corpus file 2.
probs.shape, corpus1.shape, corpus2.shape

((421658, 90), (421658, 13), (421307, 13))

In [6]:
# The row counts above show that probs is based on corpus1, the original table
# that still contains some duplicates (corpus2 is the "nodup" table and has
# fewer rows).
# Also cross-check with 4_6 that the topic assignment is consistent.
corpus1.head()

Unnamed: 0.1,Unnamed: 0,Index_1385417,PMID,Date,Journal,Title,Abstract,Initial filter qualifier,Corpus,reg_article,Text classification score,Preprocessed corpus,Topic
0,0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,identification 120 mus phase decay delayed flu...,52
1,1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,cholinesterases plant tissues . vi . prelimina...,48
2,2,9,283,1975-01-01,Folia microbiologica,"Fructose 1,6-bisphosphate aldolase activity of...",FDP aldolase was found to be present in the ce...,sesbania,"Fructose 1,6-bisphosphate aldolase activity of...",1,0.90107,"fructose 1,6-bisphosphate aldolase activity rh...",48
3,3,14,380,1975-07-01,Journal of biochemistry,Studies on trypsin inhibitor in barley. I. Pur...,To clarify the properties and functions of a t...,barley,Studies on trypsin inhibitor in barley. I. Pur...,1,0.894069,studies trypsin inhibitor barley . i. purifica...,48
4,4,17,385,1975-11-10,The Journal of biological chemistry,Reconstitution of ion transport and respirator...,Reduced coenzyme Q-cytochrome c reductase from...,soybean,Reconstitution of ion transport and respirator...,1,0.827328,reconstitution ion transport respiratory contr...,47


In [7]:
# Year of every record: the part of the "Date" string before the first "-",
# converted to int, then collected into a NumPy array.
yr_list  = [int(date_str.partition("-")[0]) for date_str in corpus1["Date"]]
yr_array = np.array(yr_list)

# Peek at the first few values.
yr_array[:5]

array([1975, 1975, 1975, 1975, 1975])

## ___Read mapper obj___

In [11]:
# Unpickle the mapper object stored at mapper_file.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: code() argument 13 must be str, not int". Presumably the pickle
# was written under a different Python / package version than this kernel;
# TODO confirm and load it in the environment that created it.
with mapper_file.open("rb") as mapper_fh:
  mapper = pickle.load(mapper_fh)

TypeError: code() argument 13 must be str, not int

## ___Plotting functions___

The plotting functions below are lifted from notebook 4_6; here the points are labeled by year instead of by topic.

In [None]:
# Color lookup used by the plotting functions.
# References:
#  - coloring histogram bins individually:
#    https://stackoverflow.com/questions/69085926/have-each-histogram-bin-with-a-different-color
#  - Matplotlib colormap reference:
#    https://matplotlib.org/stable/gallery/color/colormap_reference.html
#  - RGB and RGBA colors:
#    https://matplotlib.org/stable/tutorials/colors/colors.html
#  - rgb/rgba tuples won't work for color_key later, hex strings are needed:
#    https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.colors.to_hex.html#matplotlib.colors.to_hex
def get_ckeys():
  '''Build a dictionary of hex colors sampled from the rainbow colormap.

  Returns:
    dict: maps every int from -1 to 90 to the hex string of the colormap
      evaluated at key/91
  '''
  # Other colormaps that were tried: plt.cm.turbo, plt.cm.hsv
  cmap = plt.cm.rainbow

  ckeys = {}
  for idx in range(-1, 91):
    rgba       = cmap(idx/91)
    ckeys[idx] = colors.to_hex(rgba)

  return ckeys

In [None]:
def umap_plot_each(mapper_dir, mapper, labels, year, c_fg, c_bg):
  '''Plot the UMAP embedding with one year highlighted and save it as a PDF.

  Lifted from 4.6.

  Args:
    mapper_dir (Path): directory where the plot will be saved
    mapper (umap.umap_.UMAP): mapper object
    labels (array-like): one label per point, f"year={year}" for records of
      the year of interest and "all_others" for everything else; these must
      match the keys of the color key built below
    year (int): year of interest, used in the color key, the plot title and
      the output file name
    c_fg (str): color for the year of interest
    c_bg (str): color for all other records
  '''
  # umap.plot is a submodule that a bare `import umap` does not load, so it is
  # imported explicitly here; without it umap.plot.points raises
  # AttributeError.
  import umap.plot

  color_key = {f"year={year}": c_fg, "all_others": c_bg}

  umap.plot.points(mapper, labels=labels, color_key=color_key)
  mapper_year_plot = mapper_dir / f'fig_9_4_mapper_year_{year}.pdf'
  plt.title(f'Year {year}')
  plt.savefig(mapper_year_plot)
  # Close the figure so that looping over many years does not pile up open
  # figures.
  plt.close()

In [None]:
def plot_each(mapper_dir, mapper, yr_array, ckeys, c_fg="", c_bg="lightgray",
              test_plot=0):
  '''Make one UMAP plot per year found in yr_array.

  The loop runs over the distinct years actually present in yr_array (in
  ascending order). The version lifted from 4_6 looped over range(-1, 90),
  which are topic ids and never match a calendar year.

  Args:
    mapper_dir (Path): directory passed on to umap_plot_each, where the plots
      are saved
    mapper (UMAP): mapper object
    yr_array (Numpy array): int array with the year of every record
    ckeys (dict or sequence): colors to pick from when c_fg is not set, e.g.
      the dict returned by get_ckeys(); a dict is used in the order of its
      sorted keys
    c_fg (str): foreground color; if set, every year is drawn in this color,
      if left empty each year gets its own color taken from ckeys
    c_bg (str): background color, default lightgray
    test_plot (int): if true, stop after the first six plots
  '''
  yr_array = np.asarray(yr_array)

  # np.unique returns the distinct years sorted ascending.
  years   = np.unique(yr_array)
  n_years = len(years)

  # Flatten ckeys into an ordered palette so that any number of years can be
  # spread over it, regardless of how the keys are numbered.
  if isinstance(ckeys, dict):
    palette = [ckeys[key] for key in sorted(ckeys)]
  else:
    palette = list(ckeys)

  for idx, year in enumerate(years):
    year = int(year)  # plain int keeps labels and file names clean

    # Use a local so that c_fg is not overwritten; overwriting it made every
    # iteration after the first reuse the first color.
    if c_fg == "":
      pos = (idx * (len(palette) - 1)) // max(n_years - 1, 1)
      fg  = palette[pos]
    else:
      fg = c_fg

    # Label array for umap.plot: records of this year vs. everything else.
    labels_array = np.where(yr_array == year, f"year={year}", "all_others")

    # Plotting
    umap_plot_each(mapper_dir, mapper, labels_array, year, fg, c_bg)

    # In test mode stop after six plots.
    if test_plot and idx == 5:
      break

## ___Plotting each year___

In [10]:
# Color lookup; only consulted by plot_each when c_fg is left empty.
ckeys = get_ckeys()

# `mapper` is the object unpickled from mapper_file in the "Read mapper obj"
# section (the name `mapper_def` used here before is not defined in this
# notebook). test_plot=1 limits the run to the first six plots.
plot_each(mapper_dir, mapper, yr_array, ckeys, c_fg="red", c_bg="lightgray",
          test_plot=1)

NameError: name 'get_ckeys' is not defined