# __Minor revision__

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [15]:
# Project root; every directory below is derived from it.
proj_dir = Path.home() / "projects/plant_sci_hist/"

# Topic-model directories: corpus with topic assignment (dir42) and
# topic-name table (dir43) are read from here.
dir4  = proj_dir / "4_topic_model"
dir42 = dir4 / "4_2_outlier_assign"
dir43 = dir4 / "4_3_model_analysis"

# Text-classification directory: the prediction table is read from dir25.
dir2  = proj_dir / "2_text_classify"
dir25 = dir2 / "2_5_predict_pubmed"

# Output directory for everything this notebook writes.
# parents=True: without it mkdir raises FileNotFoundError when 9_wrap_up/
# has not been created yet (e.g. on a fresh checkout).
dir95 = proj_dir / "9_wrap_up/9_5_minor_rev"
dir95.mkdir(parents=True, exist_ok=True)

## ___Recover Figure 1B info___

The original BERTopic model could not be read back in due to a version change. However, the topic assignment info is already incorporated into the corpus file, so the counts are recovered from there.

In [4]:
# Load the gzipped, tab-separated corpus table (includes a "Topic" column)
corpus_file = dir42 / "table4_2_corpus_with_topic_assignment.tsv.gz"
corpus_df = pd.read_csv(
    corpus_file,
    compression="gzip",
    sep="\t",
)

In [11]:
# Show the first two records to check which columns were loaded
corpus_df.head(2)

Unnamed: 0.1,Unnamed: 0,Index_1385417,PMID,Date,Journal,Title,Abstract,Initial filter qualifier,Corpus,reg_article,Text classification score,Preprocessed corpus,Topic
0,0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,identification 120 mus phase decay delayed flu...,52
1,1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,cholinesterases plant tissues . vi . prelimina...,48


In [43]:
# Records per topic, ordered by topic id, as a one-column ("Count") frame
# whose index is named "Topic"
topic_counts = (
    corpus_df["Topic"]
    .value_counts()
    .sort_index()
    .rename("Count")
    .rename_axis("Topic")
    .to_frame()
)

topic_counts.head(10)

Unnamed: 0_level_0,Count
Topic,Unnamed: 1_level_1
-1,49228
0,895
1,2917
2,1098
3,751
4,4616
5,620
6,1977
7,1344
8,1057


In [45]:
# Read the header-less, tab-separated (topic id, name) table and put its
# rows in the same order as the rows of topic_counts
topic_name_file = dir43 / "fig4_3_topic_heatmap_seaborn_order.txt"

topic_name = pd.read_csv(
    topic_name_file,
    sep="\t",
    header=None,
    names=["Topic", "Name"],
    index_col="Topic",
).reindex(topic_counts.index)

topic_name.head(10)

Unnamed: 0_level_0,Name
Topic,Unnamed: 1_level_1
-1,|plant|plants|genes|cell|expression|gene|prote...
0,|allergen|allergens|pollen|ige|patients|bet|al...
1,|medium|callus|regeneration|mgl|ms|culture|som...
2,|dots|fluorescence|detection|carbon dots|carbo...
3,|glyphosate|resistance|herbicide|herbicides|re...
4,|uvb|stress|plants|radiation|leaves|increased|...
5,|bp|chloroplast genome|complete chloroplast|ge...
6,|cell|cells|imaging|microscopy|plant|proteins|...
7,|cells|cell|wall|pollen|cellulose|gravity|cyto...
8,|genome sequence|genome|draft|draft genome|str...


In [46]:
# Put counts and names side by side, matching rows on the shared Topic index
topic_count_name = topic_counts.merge(
    topic_name, how="inner", left_index=True, right_index=True
)

In [48]:
# Check the merged table: one row per topic with its Count and Name
topic_count_name.head(10)

Unnamed: 0_level_0,Count,Name
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,49228,|plant|plants|genes|cell|expression|gene|prote...
0,895,|allergen|allergens|pollen|ige|patients|bet|al...
1,2917,|medium|callus|regeneration|mgl|ms|culture|som...
2,1098,|dots|fluorescence|detection|carbon dots|carbo...
3,751,|glyphosate|resistance|herbicide|herbicides|re...
4,4616,|uvb|stress|plants|radiation|leaves|increased|...
5,620,|bp|chloroplast genome|complete chloroplast|ge...
6,1977,|cell|cells|imaging|microscopy|plant|proteins|...
7,1344,|cells|cell|wall|pollen|cellulose|gravity|cyto...
8,1057,|genome sequence|genome|draft|draft genome|str...


In [52]:
# Write the count/name table to the output directory; the Topic index is
# kept as the first CSV column
topic_count_name_file = dir95 / "fig1b_topic_count_name.csv"
topic_count_name.to_csv(topic_count_name_file, index=True)

## ___Figure S1___

### S1D

In [56]:
# Load the gzipped, tab-separated table holding y_prob / y_pred per record
pubmed_pred_prob_file = dir25 / "pubmed_qual_1384718_w2v_pred_prob_CORRECTED.tsv.gz"
pubmed_pred_prob_df = pd.read_csv(
    pubmed_pred_prob_file, compression="gzip", sep="\t"
)
pubmed_pred_prob_df.head()

Unnamed: 0.1,Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt,reg_article,y_prob,y_pred
0,0,36,1975-11-01,The British journal of nutrition,The effects of processing of barley-based supp...,1. In one experiment the effect on rumen pH of...,barley,The effects of processing of barley-based supp...,1,0.16214,0
1,1,52,1975-12-02,Biochemistry,Evidence of the involvement of a 50S ribosomal...,The functional role of the Bacillus stearother...,rose,Evidence of the involvement of a 50S ribosomal...,1,0.286834,0
2,2,60,1975-12-11,Biochimica et biophysica acta,The reaction between the superoxide anion radi...,1. The superoxide anion radical (O2-) reacts w...,tuna,The reaction between the superoxide anion radi...,1,0.248357,0
3,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,1
4,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,1


In [58]:
# Keep only the PMID and y_prob columns
pubmed_pred_prob_df_select = pubmed_pred_prob_df.loc[:, ["PMID", "y_prob"]]
pubmed_pred_prob_df_select.head(5)

Unnamed: 0,PMID,y_prob
0,36,0.16214
1,52,0.286834
2,60,0.248357
3,61,0.716394
4,67,0.894874


In [59]:
# Export the PMID / y_prob table for Figure S1D, without the row index
fig_s1d_file = dir95 / "fig_s1d_pubmed_pred_prob.csv"
pubmed_pred_prob_df_select.to_csv(fig_s1d_file, index=False)

## ___Additional analysis on 12 benchmark journals___

In [29]:
# The 12 benchmark journal names, spelled as they appear in the "Journal" column
j_positive_nt = [
    "Current opinion in plant biology",
    "Trends in plant science",
    "Functional plant biology : FPB",
    "Molecular plant pathology",
    "Molecular plant",
    "Journal of integrative plant biology",
    "Journal of plant research",
    # one long name, split over two adjacent literals for line length
    (
        "Physiology and molecular biology of plants : an international journal of "
        "functional plant biology"
    ),
    "Nature plants",
    "The plant pathology journal",
    "Annual review of plant biology",
    "The plant genome",
]

In [18]:
# Load the prediction table (the same file that the S1D section reads into
# pubmed_pred_prob_df); plain string literal, since nothing is interpolated
corpus_cand_df_file = dir25 / "pubmed_qual_1384718_w2v_pred_prob_CORRECTED.tsv.gz"
corpus_cand_df = pd.read_csv(
    corpus_cand_df_file, compression="gzip", sep="\t"
)

In [30]:
# (rows, columns) of the full prediction table
corpus_cand_df.shape

(1384717, 11)

In [31]:
# Restrict the table to records whose Journal is one of the names in j_positive_nt
in_j12 = corpus_cand_df["Journal"].isin(j_positive_nt)
corpus_df_j12 = corpus_cand_df[in_j12]

corpus_df_j12.shape

(12611, 11)

In [34]:
# Fraction of benchmark-journal records that were predicted positive.
# NOTE(review): calling this a true-positive rate presumes every record from
# these journals is a true positive — confirm that is the intended assumption.

# Boolean mask: records from the benchmark journals predicted as 0
corpus_df_j12_pred_false = corpus_df_j12["y_pred"] == 0

# Count of such records (pandas reduction; no mid-notebook numpy import needed)
num_fn = corpus_df_j12_pred_false.sum()

tpr = 1 - num_fn / corpus_df_j12.shape[0]
tpr

0.902862580287051

## ___Additional analysis on our own papers___

In [38]:
# PMIDs to look up in the prediction table (the "our own" set named in the
# section heading); order is kept exactly as originally listed
pmid_shiu_sh = [
    11526204,
    11752632,
    12169662,
    12805585,
    14657406,
    14963097,
    15105442,
    15653807,
    16166257,
    16461903,
    17010199,
    17091126,
    17098053,
    17189345,
    17284581,
    17395691,
    17466346,
    17530419,
    17555994,
    17652332,
    17720868,
    18079367,
    22303237,
    18369451,
    18715958,
    19136648,
    19321712,
    19641029,
    19649161,
    20008477,
    20152032,
    20935180,
    22629272,
    21297981,
    21511909,
    21549954,
    21849619,
    22025705,
    22072962,
    22383541,
    22443345,
    22496844,
    22889912,
    23132786,
    23166516,
    23431200,
    24082131,
    24306534,
    24876251,
    24903334,
    25354782,
    25384563,
    25880851,
    25918418,
    25986129,
    26063739,
    26103993,
    26216534,
    26286535,
    26291518,
    26586835,
    26826726,
    27098848,
    27288366,
    27522016,
    27935950,
    28295310,
    28373393,
    28398576,
    28542203,
    28761000,
    29288234,
    29554332,
    29743197,
    30239695,
    30674669,
    30718437,
    30866803,
    31431676,
    31533955,
    31551359,
    31641024,
    31937681,
    32054475,
    32170020,
    32396837,
    32613943,
    32817425,
    33344884,
    33530937,
    33575601,
    33681966,
    33749860,
    33871641,
    34354723,
    34716496,
    34865154,
    35078130,
    35218008,
    35832206,
    36001691,
    36156105,
    36299783,
    37126718,
    37918732,
]

In [39]:
# Number of PMIDs in the list
len(pmid_shiu_sh)

105

In [41]:
# Subset the prediction table to rows whose PMID appears in pmid_shiu_sh
is_shiu_sh = corpus_cand_df["PMID"].isin(pmid_shiu_sh)
corpus_df_shiu_sh = corpus_cand_df[is_shiu_sh]
corpus_df_shiu_sh.shape

(70, 11)

In [44]:
# Drop every column whose name contains 'Unnamed: 0', then write what is
# left to CSV without the row index
keep_col = ~corpus_df_shiu_sh.columns.str.contains('Unnamed: 0')
corpus_df_shiu_sh = corpus_df_shiu_sh.loc[:, keep_col]

shiu_sh_file = dir95 / "corpus_df_shiu_sh.csv"
corpus_df_shiu_sh.to_csv(shiu_sh_file, index=False)

## ___Reviewer suggested analysis___

Reviewer 1 suggested we:
- Randomly select 50 predicted positives and 50 predicted negatives,
- Provide only titles and abstracts,
- Have other colleagues annotate these records.

In [45]:
# Randomly select 50 predicted-positive and 50 predicted-negative records from
# corpus_cand_df and export them, once with and once without PMID / y_prob.
N_PER_CLASS = 50        # records drawn per predicted class
SAMPLE_SEED = 240330    # fixed seed so the draw is reproducible


def _sample_and_export(pred_df, label, n=N_PER_CLASS, seed=SAMPLE_SEED):
    """Sample records from one predicted class and write both CSV exports.

    Parameters
    ----------
    pred_df : pd.DataFrame
        Records of a single predicted class; must contain the columns
        PMID, Title, Abstract and y_prob.
    label : str
        Tag inserted into the output file names ("pos" or "neg").
    n : int
        Number of records to draw.
    seed : int
        random_state passed to DataFrame.sample.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The sampled records (PMID, Title, Abstract, y_prob) and the
        annotator view of the same rows (Title, Abstract only).
    """
    sample = pred_df.sample(n=n, random_state=seed)
    sample = sample[["PMID", "Title", "Abstract", "y_prob"]]
    sample.to_csv(dir95 / f"corpus_df_{label}_sample.csv", index=False)

    # Annotators get titles and abstracts only.
    # NOTE(review): the file name still carries the predicted class; rename or
    # pool the files before sharing if the annotation is meant to be blind.
    for_annotator = sample[["Title", "Abstract"]]
    for_annotator.to_csv(
        dir95 / f"corpus_df_{label}_sample_for_annotator.csv", index=False)

    return sample, for_annotator


corpus_df_pos = corpus_cand_df[corpus_cand_df["y_pred"] == 1]
corpus_df_pos_sample, corpus_df_pos_sample_for_annotator = \
    _sample_and_export(corpus_df_pos, "pos")

corpus_df_neg = corpus_cand_df[corpus_cand_df["y_pred"] == 0]
corpus_df_neg_sample, corpus_df_neg_sample_for_annotator = \
    _sample_and_export(corpus_df_neg, "neg")

In [46]:
# Display the sampled predicted-positive records.
# NOTE(review): the sampling cell reduces this frame to PMID / Title /
# Abstract / y_prob; a saved output listing more columns than that is
# presumably from an earlier run — re-run to refresh.
corpus_df_pos_sample

Unnamed: 0.1,Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt,reg_article,y_prob,y_pred
944456,1027528,26379678,2015-09-18,Frontiers in plant science,Physio-biochemical and morphological character...,"Acacia ampliceps (salt wattle), a leguminous s...",acacia,Physio-biochemical and morphological character...,1,0.941303,1
673928,751582,21642137,2006-05-01,American journal of botany,Inbreeding effect on male and female fertility...,Models of the evolution of gynodioecy assume t...,plant,Inbreeding effect on male and female fertility...,1,0.810928,1
1019550,1104213,27733112,2016-10-14,BMC plant biology,Transcriptomic comparison between two Vitis vi...,Predicted climate changes announce an increase...,plants,Transcriptomic comparison between two Vitis vi...,1,0.877239,1
1153083,1241192,30007612,2018-07-17,Carbohydrate polymers,"Molecular interactions between 3,4-dihydroxyph...",This study explored the interaction of pectin ...,olive,"Molecular interactions between 3,4-dihydroxyph...",1,0.66929,1
449867,515287,16667194,1989-12-01,Plant physiology,Proteolytic Activity at Alkaline pH in Oat Lea...,Proteolytic activity in oat leaf extracts was ...,oat,Proteolytic Activity at Alkaline pH in Oat Lea...,1,0.846221,1
1074530,1160472,28668976,2017-07-03,Planta,Abscisic acid-regulated protein degradation ca...,Whereas proline accumulates through de novo bi...,plants,Abscisic acid-regulated protein degradation ca...,1,0.993804,1
1081098,1167172,28775794,2017-08-05,Standards in genomic sciences,High-quality genome sequence of the radioresis...,The genetic platforms of Deinococcus species r...,mate,High-quality genome sequence of the radioresis...,1,0.752167,1
944128,1027195,26374125,2015-09-17,mBio,Insights into Substrate Specificity of NlpC/P6...,Bacterial SH3 (SH3b) domains are commonly fuse...,dock,Insights into Substrate Specificity of NlpC/P6...,1,0.738414,1
719521,798138,22504550,2012-04-17,The American naturalist,Mountain pine beetle develops an unprecedented...,The mountain pine beetle (MPB; Dendroctonus po...,mountain pine,Mountain pine beetle develops an unprecedented...,1,0.654604,1
704079,782349,22214659,2012-01-05,The Plant cell,Arabidopsis ubiquitin conjugase UBC32 is an ER...,Plants modify their growth and development to ...,plants,Arabidopsis ubiquitin conjugase UBC32 is an ER...,1,0.965505,1
