# __Step 5.3: Species, topic, and time__

Questions:
- What species tend to be worked on in a topic?
- Are there different focal species for a topic over time?

Goals here:
- Species over/under-represented in a topic
- Species over/under-represented in a topic within each time bin

## ___Set up___

### Module import

In [5]:
import pickle, nltk, re, multiprocessing
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from zipfile import ZipFile
from tqdm import tqdm
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix, dok_matrix
from time import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import OrderedDict, Counter

### Key variables

In [17]:
# Fixed random seed kept for reproducibility (not referenced in the cells
# shown here; presumably used by later steps -- confirm)
seed = 20220609

# Project root and the working directory for this step. The working directory
# is created (with any missing parents) if it does not exist yet.
proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "5_species_over_time/5_3_sp_topic_time"
work_dir.mkdir(parents=True, exist_ok=True)

# Input from step 4.2: gzipped TSV of the corpus with a topic assigned to
# each record
dir42             = proj_dir / "4_topic_model/4_2_outlier_assign"
file_topic_assign = dir42 / "table4_2_corpus_with_topic_assignment.tsv.gz"

# Input from step 4.4: tab-delimited table mapping topic ids to topic names
dir44             = proj_dir / "4_topic_model/4_4_over_time"
file_topic_name   = dir44 / "fig4_4_tot_heatmap_weighted_xscaled_names.txt"

# Inputs from step 5.1 (species-time analysis folder)
dir51 = proj_dir / "5_species_over_time/5_1_sp_time"
# Pickled taxa count sparse matrices and the corresponding taxa names,
# at family and genus level
file_csr_fam      = dir51 / "match_csr_family.pickle"
file_csr_fam_nm   = dir51 / "match_csr_family_names.pickle"
file_csr_genus    = dir51 / "match_csr_genus.pickle"
file_csr_genus_nm = dir51 / "match_csr_genus_names.pickle"
# Taxa count time series (only the genus-level table is enabled; the
# family-level path is left commented out)
file_ts_genus     = dir51 / "Table5_1_ts_genusALL_count.txt"
#file_ts_fam       = dir51 / "Table5_1_ts_familyALL_count.txt"

# Matplotlib output settings: fonttype 42 makes the PDF backend write
# TrueType (Type 42) fonts instead of the default Type 3, so text in saved
# PDFs stays editable; figures use a sans-serif font family.
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams["font.family"] = "sans-serif"

## ___Process topic data___

### Read topic assignment

In [23]:
# Load the corpus-with-topic-assignment table (gzip-compressed, tab-delimited)
# into the topic data-frame. The first column of the file is used as the row
# index so it does not show up as an extra "Unnamed: 0" column.
# References:
#   reading .gz files directly with pandas
#     https://stackoverflow.com/questions/35101093/load-directly-gz-file-into-pandas-dataframe
#     https://www.delftstack.com/howto/python-pandas/pandas-read-gz-file/
#   dropping the "Unnamed: 0" column
#     https://stackoverflow.com/questions/36519086/how-to-get-rid-of-unnamed-0-column-in-a-pandas-dataframe-read-in-from-csv-fil
tdf = pd.read_csv(
    file_topic_assign,
    sep="\t",
    compression="gzip",
    index_col=[0],
)

# Notebook display: (number of rows, number of columns)
tdf.shape

(421658, 12)

In [19]:
# Notebook display: first record only, to inspect the available columns
tdf.head(n=1)

Unnamed: 0,Index_1385417,PMID,Date,Journal,Title,Abstract,Initial filter qualifier,Corpus,reg_article,Text classification score,Preprocessed corpus,Topic
0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,identification 120 mus phase decay delayed flu...,52


In [28]:
# Topic assignment of every record, pulled out of the data-frame as an array
toc_array = tdf["Topic"].values

# Notebook display: container type and length of the topic array
(type(toc_array), toc_array.shape)

(numpy.ndarray, (421658,))

### Read topic names

In [22]:
# Tab-delimited lookup table of topic ids and their names
toc_names = pd.read_csv(
    file_topic_name,
    sep="\t",
)

# Notebook display: first two rows
toc_names.head(n=2)

Unnamed: 0,Topic,Mod_name
0,22,enzyme | fatty acids | lipid | synthesis
1,18,protein | dna | rna | synthesis | mrna


## ___Species representation among topics___

### Genus level

In [26]:
# Genus-level count sparse matrix, unpickled from the step 5.1 output
with file_csr_genus.open("rb") as f:
    csr_genus = pickle.load(f)

# Genus names that go with the matrix
with file_csr_genus_nm.open("rb") as f:
    csr_genus_nm = pickle.load(f)

# Notebook display: matrix shape next to the number of names, so the two can
# be compared by eye
(csr_genus.shape, len(csr_genus_nm))

((421658, 16794), 16794)