In [59]:
import pickle, multiprocessing
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from scipy.sparse import csr_matrix, vstack

def get_match_csr(txt, processes=15):
  '''Build a sparse doc-by-name match matrix for a batch of docs.

  Every doc is passed to `task` in a worker process. The column indices of
  the names `task` flags as present are collected as results arrive, so the
  dense per-doc 0/1 lists are never all held in memory at once.

  Args:
    txt (pd.Series): docs to match; must support len(), .shape and iteration
    processes (int): number of worker processes; the default (15) is the
      value that used to be hard-coded
  Return:
    match_csr (csr_matrix): shape=(num_docs, num_names); entry is 1 where the
      name in that column of `offspring_names` was found in the doc
  '''
  print(txt.shape)

  row_idx = []
  col_idx = []
  with multiprocessing.Pool(processes=processes) as pool:
    results_ncbi_iter = tqdm(pool.imap(task, enumerate(txt), chunksize=1),
                             total=len(txt))
    # imap yields results in input order, so `row` is the position of the
    # doc within this batch.
    for row, results_ncbi in enumerate(results_ncbi_iter):
      non0_idx = np.nonzero(results_ncbi)[0].tolist()
      row_idx.extend([row]*len(non0_idx))
      col_idx.extend(non0_idx)

  # np.int0 was removed in NumPy 2.0; np.intp is the type it aliased.
  csr_val = np.ones(len(col_idx), dtype=np.intp)

  # create a sparse matrix with shape=(num_docs, num_names)
  match_csr = csr_matrix((csr_val, (row_idx, col_idx)),
                         shape=(txt.shape[0], len(offspring_names)),
                         dtype=np.intp)

  return match_csr

def task(item):
  '''Task to parallelize: flag which offspring names occur in one doc.

  Reads the module-level `common_names`, `cnames` and `offspring_names`.

  Args:
    item (tuple): (row_number, doc); row_number is not used here
  Return:
    results_ncbi (list): one entry per name in `offspring_names`, 1 if the
      name is present in the doc (as a space-delimited token sequence) and 0
      if not
  '''
  (_row, doc) = item

  # pad the doc so if qualified name is at the beginning or end will still match
  doc = f" {doc} "
  # Get the matching common names as a list
  # Get lower case because the common name can be the 1st word.
  # NOTE(review): assumes entries of common_names are lower case -- confirm.
  doc_lower = doc.lower()
  results_usda = [name for name in common_names if f" {name} " in doc_lower]

  # Add the results to doc
  for cname in results_usda:  # for each common name
    genus = cnames[cname][0]  # get the genus name
    # Pad on BOTH sides: without the trailing space the last appended genus
    # could never satisfy the f" {name} " test below.
    doc += f" {genus} "

  # Match to NCBI names
  results_ncbi = [1 if f" {name} " in doc else 0 for name in offspring_names]

  return results_ncbi

In [47]:
#---------------
# Project/work directories; all inputs and outputs of this notebook live in
# work_dir.
proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "5_species_over_time/"

print("Read saved objects...")
# Cleaned docs; the 'txt_clean' column holds the text that gets matched.
txt_clean  = pd.read_csv(work_dir / "txt_clean.csv", index_col=0)

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
# Names to match; the position in this sequence is the column index of the
# match matrix.
with open(work_dir / "viridiplantae_offspring_names.pickle", "rb") as f:
  offspring_names = pickle.load(f)

# Load the common names that are searched for in the lower-cased docs
with open(work_dir / "usda_common_names.pickle", "rb") as f:
  common_names = pickle.load(f)

# Load the mapping from common name to a sequence whose first element is used
# as the genus name.
with open(work_dir / "usda_common_names_dict.pickle", "rb") as f:
  cnames = pickle.load(f)

Read saved objects...


In [60]:
print("Get match_csr...")

# Send a subset of docs at a time to avoid memory issues
n_subset = 50000
csr_list = []
for idx in range(0, txt_clean.shape[0], n_subset):
  # Logged upper bound is idx+n_subset even when the last batch is shorter.
  print(f" [{idx}, {idx+n_subset})")
  # get subset of docs
  txt        = txt_clean['txt_clean'][idx:(idx+n_subset)]
  # get csr for this batch: shape=(docs in batch, num_names)
  match_csr  = get_match_csr(txt)
  csr_list.append(match_csr)

# stack the per-batch matrices row-wise, preserving doc order
match_csr_all = vstack(csr_list)
print("Final csr:", match_csr_all.shape)

# Save the full match matrix for later use
with open(work_dir / "match_csr.pickle", "wb") as f:
  pickle.dump(match_csr_all, f)

Get match_csr...
 [0, 50000)
(50000,)


100%|██████████| 50000/50000 [03:20<00:00, 248.90it/s]


 [50000, 100000)
(50000,)


100%|██████████| 50000/50000 [03:19<00:00, 250.65it/s]


 [100000, 150000)
(50000,)


100%|██████████| 50000/50000 [03:16<00:00, 254.54it/s]


 [150000, 200000)
(50000,)


100%|██████████| 50000/50000 [03:29<00:00, 238.28it/s]


 [200000, 250000)
(50000,)


100%|██████████| 50000/50000 [03:35<00:00, 232.47it/s]


 [250000, 300000)
(50000,)


100%|██████████| 50000/50000 [03:39<00:00, 228.21it/s]


 [300000, 350000)
(50000,)


100%|██████████| 50000/50000 [03:40<00:00, 226.46it/s]


 [350000, 400000)
(50000,)


100%|██████████| 50000/50000 [03:50<00:00, 216.84it/s]


 [400000, 450000)
(21658,)


100%|██████████| 21658/21658 [01:38<00:00, 220.87it/s]


Final csr: (421658, 26782)


In [49]:
def spot_check(idx):
  '''Print one doc and the names matched to it, for manual inspection.

  Reads the module-level `txt_clean`, `match_csr_all` and `offspring_names`.

  Args:
    idx (int): positional row number of the doc in `txt_clean`, which is
      also its row in `match_csr_all`
  Output (printed): the first column of that row, the matched column
    indices, and for each matched name whether it is literally a substring
    of the text. False means the match did not come from the name itself
    appearing in the text.
  '''
  # Explicit positional (row, column) lookup; Series[0] on a label-indexed
  # row relies on a deprecated positional fallback.
  txt = txt_clean.iloc[idx, 0]
  print(txt)

  taxa_idx = match_csr_all[idx].nonzero()[1]
  print(taxa_idx)
  # Separate loop name so the `idx` argument is not shadowed.
  for name_idx in taxa_idx:
    name = offspring_names[name_idx]
    print(f" {name} in txt:", name in txt)

In [61]:
# Matches whose genus names appear literally in the text.
spot_check(2)

Fructose 16bisphosphate aldolase activity Rhizobium specie FDP aldolase found present cellfree extract Rhizobium leguminosarum Rhizobium phaseoli Rhizobium trifolii Rhizobium meliloti Rhizobium lupini Rhizobium japonicum Rhizobium specie Arachis hypogaea Sesbania cannabina The enzyme 3 representative specie optimal activity pH 84 02M veronal buffer The enzyme activity completely lost treatment 60 degree C 15 min The Km value range 238 455 X 106M FDP Metal chelating agent inhibited enzyme activity monovalent bivalent metal ion failed stimulate activity Bivalent metal ion general rather inhibitory
[21477 21986]
 Sesbania in txt: True
 Arachis in txt: True


In [51]:
# The "in txt" test prints False when a match comes from a common name:
# the text contains the common name, not the genus name itself.
spot_check(4) 

Reconstitution ion transport respiratory control vesicle formed reduced coenzyme Qcytochrome c reductase phospholipid Reduced coenzyme Qcytochrome c reductase bovine heart mitochondrion complex III incorporated phospholipid vesicle cholate dialysis procedure Soybean phospholipid mixture purified phosphatidylcholine phosphatidylethanolamine cardiolipin could used Oxidation reduced coenzyme Q2 reconstituted vesicle cytochrome c oxidant showed following energycoupling phenomenon 1 Protons translocated outward coupling ratio H2e 19 02 Measurements mitochondrion similar condition showed H2e ratio 18 Proton translocation seen presence uncoupling agent addition net acidification medium overall oxidation reaction 2 Potassium ion taken reconstituted vesicle presence valinomycin reaction coupled electron transfer The coupling ratio K uptake K2e 20 vesicle approximately 15 mitochondrion 3 The rate oxidation reduced coenzyme Q2 reconstituted vesicle stimulated 10fold uncouplers valinomycin plus ni

In [52]:
# This is a false positive: "stock" is a common name of some Matthiola,
# but in this text it refers to a stock strain of Escherichia coli.
spot_check(5) 

Effect inorganic phosphate acridine inhibition plasmid curing Escherichia coli Some mutant stock strain Escherichia coli K12 sensitive acriflavine presence inorganic phosphate resistant acriflavine absence They mutated spontaneously resistance acriflavine plus phosphate The synergistic effect phosphate acriflavine sensitivity increased high pH value Genetic analysis suggested mutation occurred gene acrA Electron microscopic observation suggested presence acriflavine plus phosphate affected structure plasma membrane cytoplasm This structural alteration caused acriflavine alone Acridine orange plus phosphate effectively eliminate plasmid F8gal acridine orange alone
[22538]
 Matthiola in txt: False


In [62]:
# Armoracia is horseradish; the text mentions "horseradish peroxidase".
spot_check(10) 

Oxidaseperoxidase enzyme Datura innoxia Oxidation formylphenylacetic acid ethyl ester An enzyme system Datura innoxia root oxidizing formylphenylacetic acid ethyl ester purified 38fold conventional method NH42SO4 fractionation negative adsorption alumina Cy gel chromatography DEAEcellulose The purified enzyme shown catalyse stoicheiometric oxidation formylphenylacetic acid ethyl ester benzoylformic acid ethyl ester formic acid utilizing molecular O2 Substrate analogue phenylacetaldehyde phenylpyruvate oxidized low rate formylphenylacetonitrile inhilating agent cyanide thiol compound ascorbic acid This enzyme identical oxidaseperoxidase isoenzyme Another oxidaseperoxidase isoenzyme separated DEAEchromatography also showed formylphenylacetic acid ethyl ester oxidase activity albeit lesser extent The property two isoenzymes oxidase compared shown differ oxidation peroxidation property The oxidation formylphenylacetic acid ethyl ester also catalysed horseradish peroxidase The Datura isoenz

In [63]:
# Spot-check a doc from the last batches (row 400042).
spot_check(400042) 

Oxygen life form evolution sex multicellular eukaryote The evolutionary advantage different sexual system multicellular eukaryote still well understood differentiation male female individual half offspring production compared asexuality Here propose various physiological adaptation oxidative stress could forged sessility versus motility consequently evolution sexual system multicellular animal plant fungi Photosynthesis cause substantial amount oxidative stress photoautotrophic plant likewise oxidative chemistry polymer breakdown cellulose lignin saprotrophic fungi In case extent precludes motility additional source oxidative stress Sessile life form lack neuronal system however limit option mate recognition adult sexual selection resulting inefficient matesearching system Hence sessility requires individual produce offspring achieved hermaphroditism plant andor multiple mating type fungi In animal motility requires neuronal system muscle activity highly sensitive oxidative damage As c

In [64]:
# Spot-check a doc whose text mentions Arabidopsis.
spot_check(11245) 

cDNA sequence homology 57kDa nucleotidebinding subunit vacuolar ATPase Arabidopsis Functional structural similarity among wide variety endomembrane HATPases suggest form distinct class common origin Immunological study Manolson M F Percy J M Apps D K Xie X S Stone D K Poole R J 1987 Proceedings Membrane Protein Symposium Goheen S C ed pp 427434 BioRad Richmond CA M F Manolson J M Percy D K Apps X S Xie D K Stone M Harrison D J Clarke R J Poole unpublished data support idea suggest evolutionary relationship endomembrane F0F1 ATPases Further examination relationship necessitates comparison proteinnucleic acid sequence data To end cloned sequenced cDNA encoding 57kDa polypeptide Arabidopsis vacuolar membrane HATPase To knowledge first report sequence 57kDa subunit plant animal endomembrane HATPase This cDNA encodes hydrophilic polypeptide containing putative ATP binding site Lack secretion signal sequence suggests processed endoplasmic reticulum translated cytosolic ribosome Comparison pr