# __Minor revision__

In [9]:
import os
from pathlib import Path
import pandas as pd

In [53]:
# Project root; every directory used in this notebook is resolved from it.
proj_dir = Path.home() / "projects/plant_sci_hist/"

# Topic-model directories: dir42 holds the corpus table with topic
# assignments, dir43 holds the topic-name file used for Figure 1B.
dir4  = proj_dir / "4_topic_model"
dir42 = dir4 / "4_2_outlier_assign"
dir43 = dir4 / "4_3_model_analysis"

# Text-classification directories: dir25 holds the PubMed prediction table.
dir2  = proj_dir / "2_text_classify"
dir25 = dir2 / "2_5_predict_pubmed"

# Output directory for the files written by this notebook.
# parents=True: without it mkdir raises FileNotFoundError when the
# intermediate "9_wrap_up" directory does not exist yet (e.g. fresh checkout).
dir95 = proj_dir / "9_wrap_up/9_5_minor_rev"
dir95.mkdir(parents=True, exist_ok=True)

## ___Recover Figure 1B info___

The original BERTopic model could not be read back in due to a version change. However, the topic assignment info is already incorporated into the corpus file, so the counts are recovered from there.

In [10]:
# Load the gzipped, tab-separated corpus table that carries a "Topic" column
corpus_file = dir42.joinpath("table4_2_corpus_with_topic_assignment.tsv.gz")
corpus_df = pd.read_csv(
    corpus_file,
    sep="\t",
    compression="gzip",
)

In [11]:
# Preview the first two rows to check the loaded columns (incl. "Topic")
corpus_df.head(2)

Unnamed: 0.1,Unnamed: 0,Index_1385417,PMID,Date,Journal,Title,Abstract,Initial filter qualifier,Corpus,reg_article,Text classification score,Preprocessed corpus,Topic
0,0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,identification 120 mus phase decay delayed flu...,52
1,1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,cholinesterases plant tissues . vi . prelimina...,48


In [43]:
# Number of documents per topic, ordered by topic id, as a one-column frame
topic_counts = (
    corpus_df["Topic"]
    .value_counts()
    .sort_index()
    .rename("Count")       # column label of the resulting frame
    .to_frame()
    .rename_axis("Topic")  # index label
)

topic_counts.head(10)

Unnamed: 0_level_0,Count
Topic,Unnamed: 1_level_1
-1,49228
0,895
1,2917
2,1098
3,751
4,4616
5,620
6,1977
7,1344
8,1057


In [45]:
# Load topic names (headerless, tab-separated: topic id, name)
topic_name_file = dir43 / "fig4_3_topic_heatmap_seaborn_order.txt"

topic_name = (
    pd.read_csv(
        topic_name_file,
        sep="\t",
        header=None,
        names=["Topic", "Name"],
    )
    .set_index("Topic")
    # Put the rows in the same order as topic_counts (the file is not
    # necessarily ordered by topic id)
    .reindex(topic_counts.index)
)

topic_name.head(10)

Unnamed: 0_level_0,Name
Topic,Unnamed: 1_level_1
-1,|plant|plants|genes|cell|expression|gene|prote...
0,|allergen|allergens|pollen|ige|patients|bet|al...
1,|medium|callus|regeneration|mgl|ms|culture|som...
2,|dots|fluorescence|detection|carbon dots|carbo...
3,|glyphosate|resistance|herbicide|herbicides|re...
4,|uvb|stress|plants|radiation|leaves|increased|...
5,|bp|chloroplast genome|complete chloroplast|ge...
6,|cell|cells|imaging|microscopy|plant|proteins|...
7,|cells|cell|wall|pollen|cellulose|gravity|cyto...
8,|genome sequence|genome|draft|draft genome|str...


In [46]:
# Combine counts and names into one table, matching rows on the Topic index
topic_count_name = topic_counts.merge(
    topic_name,
    left_index=True,
    right_index=True,
)

In [48]:
# Preview the merged table: one row per topic with its Count and Name
topic_count_name.head(10)

Unnamed: 0_level_0,Count,Name
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,49228,|plant|plants|genes|cell|expression|gene|prote...
0,895,|allergen|allergens|pollen|ige|patients|bet|al...
1,2917,|medium|callus|regeneration|mgl|ms|culture|som...
2,1098,|dots|fluorescence|detection|carbon dots|carbo...
3,751,|glyphosate|resistance|herbicide|herbicides|re...
4,4616,|uvb|stress|plants|radiation|leaves|increased|...
5,620,|bp|chloroplast genome|complete chloroplast|ge...
6,1977,|cell|cells|imaging|microscopy|plant|proteins|...
7,1344,|cells|cell|wall|pollen|cellulose|gravity|cyto...
8,1057,|genome sequence|genome|draft|draft genome|str...


In [52]:
# Write the Figure 1B table (Topic index, Count, Name) to the output directory
topic_count_name_file = dir95.joinpath("fig1b_topic_count_name.csv")
topic_count_name.to_csv(topic_count_name_file)

## ___Figure S1___

### S1D

In [56]:
# Load the gzipped, tab-separated PubMed prediction table
pubmed_pred_prob_file = dir25.joinpath(
    "pubmed_qual_1384718_w2v_pred_prob_CORRECTED.tsv.gz"
)
pubmed_pred_prob_df = pd.read_csv(
    pubmed_pred_prob_file,
    sep="\t",
    compression="gzip",
)
pubmed_pred_prob_df.head()

Unnamed: 0.1,Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt,reg_article,y_prob,y_pred
0,0,36,1975-11-01,The British journal of nutrition,The effects of processing of barley-based supp...,1. In one experiment the effect on rumen pH of...,barley,The effects of processing of barley-based supp...,1,0.16214,0
1,1,52,1975-12-02,Biochemistry,Evidence of the involvement of a 50S ribosomal...,The functional role of the Bacillus stearother...,rose,Evidence of the involvement of a 50S ribosomal...,1,0.286834,0
2,2,60,1975-12-11,Biochimica et biophysica acta,The reaction between the superoxide anion radi...,1. The superoxide anion radical (O2-) reacts w...,tuna,The reaction between the superoxide anion radi...,1,0.248357,0
3,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,1
4,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,1


In [58]:
# Keep only the article id and its predicted probability
pubmed_pred_prob_df_select = pubmed_pred_prob_df.loc[:, ["PMID", "y_prob"]]
pubmed_pred_prob_df_select.head()

Unnamed: 0,PMID,y_prob
0,36,0.16214
1,52,0.286834
2,60,0.248357
3,61,0.716394
4,67,0.894874


In [59]:
# Write the Figure S1D data (PMID, y_prob) without the row index
fig_s1d_file = dir95 / "fig_s1d_pubmed_pred_prob.csv"
pubmed_pred_prob_df_select.to_csv(fig_s1d_file, index=False)

### S1A, B, and C