# __Step 5: Make predictions__

The best model is Word2Vec:
- Because performance is very similar across all models, for interpretability purposes we choose to focus on the Word2Vec-based model with `[min_count, window, n_gram] = [20, 8, 3]`.
- This way there is a smaller set of features (because min_count is high) that includes tri-grams (i.e., 3-word combinations that help with interpretation).

Goal
- Make predictions for the entire corpus


## ___Set up___

### Module import

In [23]:
import json
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

import script_2_3_text_classify_w2v as script23

### Key variables

In [2]:
# Reproducibility
# NOTE(review): `seed` is not referenced by any cell visible in this section.
seed = 20220609

# Setting paths
work_dir   = Path.home() / "projects/plant_sci_hist/2_text_classify"

# Make work_dir the current working directory for the rest of the notebook
os.chdir(work_dir)

# Training data for interpretation purpose
corpus_train = work_dir / "corpus_train.json"

# The column to focus on
target_col = 'txt'

# Trained Word2Vec model, tokenizer, and vocab for getting embeddings
# (plain string literals: the names contain no placeholders, so no f-string)
w2v_name   = work_dir / "model_cln_w2v_20-8-3"
tok_name   = work_dir / "model_cln_w2v_token_20-8-3"
vocab_name = work_dir / "model_cln_w2v_vocab_20-8-3"

# Getting ngrams
ngram = 3
min_count = 20

# DNN checkpoint path
cp_filepath = work_dir / "model_cln_w2v_20-8-3_dnn"

# Corpus to make predictions for
corpus_dir  = Path.home() / "projects/plant_sci_hist/1_obtaining_corpus"
corpus_file = corpus_dir / "pubmed_qualified.tsv"

## ___Analysis of prediction outcome___

### Load w2v model, tokenizer, and vocab

Need:
- W2V model
- Tokenizer and vocab
- Trained DNN model

In [3]:
# Deserialize the trained Word2Vec model from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with w2v_name.open("rb") as w2v_fh:
  model_w2v = pickle.load(w2v_fh)
model_w2v

<gensim.models.word2vec.Word2Vec at 0x7f7acfa60be0>

In [4]:
# Restore the tokenizer from its pickle file
with tok_name.open("rb") as tok_fh:
  tokenizer = pickle.load(tok_fh)

# Restore the vocabulary from its pickle file
with vocab_name.open("rb") as vocab_fh:
  vocab = pickle.load(vocab_fh)

### Get training/testing data

In [5]:
# Load the training corpus JSON (marked by the author as not needed here).
# Opened read-only: the file is only parsed, so read/write mode ("r+") would
# needlessly require write permission on it.
with corpus_train.open("r") as f:
  corpus_combo_json = json.load(f)

## __Make predictions on the whole dataset__

### Read corpus that needs to be predicted

In [6]:
# Read the tab-separated corpus that predictions will be made for
corpus_df_raw = pd.read_csv(corpus_file, sep="\t")

In [7]:
# Display (n_rows, n_columns) of the raw corpus table
corpus_df_raw.shape

(1497511, 6)

In [8]:
# Build the cleaned corpus in a single chain so that the new column is added to
# a fresh frame rather than assigned onto a filtered view of corpus_df_raw
# (avoids pandas' SettingWithCopyWarning). Original index labels are kept.
corpus_df = (
    corpus_df_raw
    # Drop duplicated rows (first occurrence is kept)
    .drop_duplicates()
    # Rid of all records with NAs
    .dropna(axis=0)
    # Create a new column 'txt' which is 'Title' and 'Abstract' joined by a space
    .assign(txt=lambda df: df['Title'] + " " + df['Abstract'])
)
corpus_df.shape

(1385417, 7)

In [9]:
# Preview the first three rows of the cleaned corpus
corpus_df.head(3)

Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt
0,36,1975-11-01,The British journal of nutrition,The effects of processing of barley-based supp...,1. In one experiment the effect on rumen pH of...,barley,The effects of processing of barley-based supp...
1,52,1975-12-02,Biochemistry,Evidence of the involvement of a 50S ribosomal...,The functional role of the Bacillus stearother...,rose,Evidence of the involvement of a 50S ribosomal...
2,60,1975-12-11,Biochimica et biophysica acta,The reaction between the superoxide anion radi...,1. The superoxide anion radical (O2-) reacts w...,tuna,The reaction between the superoxide anion radi...


In [10]:
# Show the first 100 characters of the first record's text.
# Positional access (.iloc) is used because the index is not reset after
# dropping rows, so a label lookup of 0 is not guaranteed to succeed.
corpus_df['txt'].iloc[0][:100]

'The effects of processing of barley-based supplements on rumen pH, rate of digestion of voluntary in'

### Get word embeddings, w2v feature matrix using corpus

In [11]:
# Get ngrams
# Author's note: this n-gram step is not necessary. X_ngrams is not used by any
# later cell shown in this section; X is reused when computing embeddings.
# NOTE(review): "train" is passed as the subset label even though X is the full
# corpus; presumably this makes get_ngram load previously saved n-grams (the
# cell prints "load ngrams") - confirm against script23.get_ngram.
X        = corpus_df[target_col]
X_ngrams = script23.get_ngram(X, ngram, min_count, "train", work_dir=work_dir)

    load ngrams


In [12]:
# Get embeddings
# Produces two arrays from the corpus text using the loaded Word2Vec model,
# tokenizer, and vocab; their shapes are inspected in the next cell.
embeddings, X_w2v = script23.get_embeddings(X, model_w2v, tokenizer, vocab)

In [13]:
# Display the shapes of the embedding matrix and of the corpus feature matrix
embeddings.shape, X_w2v.shape

((398412, 300), (1385417, 500))

In [14]:
# Peek at the first 20 values of the first row of the feature matrix
X_w2v[0][:20]

array([    1,     1,     1,   856,     1,   631,    65,     1,     1,
       22713,   493,    43,     1,  3786,     1,  8018,  1214,     1,
        4962,  1222], dtype=int32)

### Load model

In [24]:
# Load the trained DNN from the checkpoint path set in the key-variables cell
model_emb = tf.keras.models.load_model(cp_filepath)


### Make predictions 

In [25]:
# This takes a long time so save the prediction and prediction prob to a file
# ~52 min
y_pred_prob   = model_emb.predict(X_w2v)

# Class labels, and the column-index -> label mapping (kept for compatibility)
class_labels  = np.unique([0,1])
dic_y_mapping = {n:label for n,label in enumerate(class_labels)}

# Pick the most probable class per row with a single vectorized argmax over the
# class axis instead of a Python-level loop calling np.argmax once per row.
# Wrapped in list() so y_pred remains a list of labels, as before.
y_pred        = list(class_labels[np.argmax(y_pred_prob, axis=1)])

In [26]:
# Number of predictions made; the recorded output matches the corpus row count
len(y_pred)

1385417

In [27]:
# Append the probability of class 1 and the predicted label as new columns
corpus_df = corpus_df.assign(
    y_prob=y_pred_prob[:, 1],
    y_pred=y_pred,
)
corpus_df.shape

(1385417, 9)

In [28]:
# Count how many records were assigned to each predicted class
corpus_df['y_pred'].value_counts()

0    963677
1    421740
Name: y_pred, dtype: int64

In [29]:
# Save prediction file: the full corpus with y_prob and y_pred columns,
# written as a gzip-compressed, tab-separated file under work_dir.
# NOTE(review): the record count (1385417) is hard-coded in the file name.
corpus_df_file = work_dir / "pubmed_qual_1385417_w2v_pred_prob.tsv.gz"
corpus_df.to_csv(corpus_df_file, sep='\t', compression='gzip')

In [31]:
# Write only the records predicted as class 1 to a gzip-compressed,
# tab-separated file.
corpus_df_pos_file = work_dir / "corpus_plant_421740.gz"
is_positive = corpus_df['y_pred'].eq(1)
corpus_df.loc[is_positive].to_csv(corpus_df_pos_file, sep='\t', compression='gzip')

### Plot probability distribution

In [None]:
# Histogram of the predicted class-1 probability, colored by predicted label.
# The original call passed `data=` twice (a SyntaxError); the second value is
# interpreted here as a request for a density-normalized histogram.
# NOTE(review): confirm that stat="density" is the intended normalization.
ax = sns.histplot(data=corpus_df, x="y_prob", hue="y_pred", stat="density")

# Size, label, and save through the Axes/Figure that seaborn returns, so the
# cell does not depend on a separate pyplot import.
ax.figure.set_size_inches(8, 8)
ax.set(title="Distribution of predicted probabilities",
       xlabel="y_prob", ylabel="Density")
ax.figure.savefig(work_dir / 'figure_pubmed_qual_1385417_w2v_pred_prob.pdf')

### Get tSNE plot