# __Step 4: Topic model__

The kmean cluserting results are not particularly clear what's going on. So go stiraght to topic modeling.

## ___Set up___

### Module import

In [1]:
import os, re
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from bertopic import BERTopic

### Key variables

In [2]:
# Reproducibility
seed = 20220609

# Setting working directory
proj_dir   = Path.home() / "projects/plant_sci_hist"
work_dir   = proj_dir / "4_topic_model/4_1_get_topics"
work_dir.mkdir(parents=True, exist_ok=True)

os.chdir(work_dir)

# plant science corpus
dir25       = proj_dir / "2_text_classify/2_5_predict_pubmed"
corpus_file = dir25 / "corpus_plant_421658.tsv.gz"

# qualified feature names
dir31          = proj_dir / "3_key_term_temporal/3_1_pubmed_vocab"
X_vec_file     = dir31 / "tfidf_sparse_matrix_4542"
feat_name_file = dir31 / "tfidf_feat_name_and_sum_4542"


### Proprecess corpus

In [3]:
corpus_df = pd.read_csv(corpus_file, sep='\t', compression='gzip')
corpus_df.head(2)

Unnamed: 0.1,Unnamed: 0,PMID,Date,Journal,Title,Abstract,QualifiedName,txt,reg_article,y_prob,y_pred
0,3,61,1975-12-11,Biochimica et biophysica acta,Identification of the 120 mus phase in the dec...,After a 500 mus laser flash a 120 mus phase in...,spinach,Identification of the 120 mus phase in the dec...,1,0.716394,1
1,4,67,1975-11-20,Biochimica et biophysica acta,Cholinesterases from plant tissues. VI. Prelim...,Enzymes capable of hydrolyzing esters of thioc...,plant,Cholinesterases from plant tissues. VI. Prelim...,1,0.894874,1


In [4]:
def clean_text(x):
    x = str(x)
    x = x.lower()
    # Replace any non-alphanumric characters of any length
    # Q: Not sure what the # character do.
    x = re.sub(r'#[A-Za-z0-9]*', ' ', x)
    # tokenize and rid of any token matching stop words
    tokens = word_tokenize(x)
    x = ' '.join([w for w in tokens if not w in stop_words_dict])
    return x

In [5]:
docs       = corpus_df['txt']
stop_words = stopwords.words('english')
stop_words_dict = {}
for i in stop_words:
  stop_words_dict[i] = 1

In [7]:
docs_clean = []
for doc_idx in tqdm(range(len(docs))):
  doc = docs[doc_idx]
  docs_clean.append(clean_text(doc))
len(docs_clean)

100%|██████████| 421658/421658 [13:21<00:00, 526.01it/s]


421658

In [8]:
# Try mp
import multiprocessing as mp
docs_clean = []

#  for dict_results in tqdm(pool.imap(ks_test, range(cluster_num)), 
#                           total=cluster_num):
#    dict_results_list.append(dict_results)

pool = mp.Pool(mp.cpu_count())
for dict_results in tqdm(pool.imap(clean_text, range(docs)), 
#                           total=cluster_num):


for doc_idx in tqdm(range(len(docs))):
  doc = docs[doc_idx]
  docs_clean.append(clean_text(doc, stop_words_dict))

pool.close()

SyntaxError: invalid syntax (3263796803.py, line 14)

In [9]:
timestamps = corpus_df.Date

## ___Run BERTopic___

### Initialize

- language: str = 'english'
- top_n_words: int = 10
  - The number of words per topic to extract. __Setting this too high can negatively impact topic embeddings__ as topics are typically best represented by at most __10 words__.
- n_gram_range: Tuple[int, int] = (1, 1)
  - The n-gram range for the CountVectorizer, between 1 and 3, otherwise memory issue.
- min_topic_size: int = 10
  - The minimum size of the topic.
- nr_topics: Union[int, str] = None
  - Specifying the number of topics will reduce the initial number of topics to the value specified.
  - Use __"auto"__ to automatically reduce topics using HDBSCAN
- calculate_probabilities: bool = False
  - Whether to calculate the probabilities of all topics per document instead of the probability of the assigned topic per document.
  - Will significantly increase computing time if True.
- diversity: float = None
  - Whether to use MMR to diversify the resulting topic representations.
  - Value between 0 (no divresity) and 1 (very diverse).
    - __Q: What does diversity mean here?__
- seed_topic_list: List[List[str]] = None
  - A list of seed words per topic to converge around.
- embedding_model=None
  - SentenceTransformers, Flair, Spacy, Gensim, USE (TF-Hub), or [these](https://www.sbert.net/docs/pretrained_models.html).
  - Try to use `allenai-specter`.
- umap_model: umap.umap_.UMAP = None
- hdbscan_model: hdbscan.hdbscan_.HDBSCAN = None
- vectorizer_model: sklearn.feature_extraction.text.CountVectorizer = None
- verbose: bool = False

In [10]:
topic_model = BERTopic(calculate_probabilities=False,
                       n_gram_range=(1,2),
                       min_topic_size=1000, 
                       nr_topics='auto',
                       embedding_model='allenai-specter',
                       verbose=True)

### Fit_transform

Long run time. Switch to HPC.

In [11]:
topics = topic_model.fit_transform(docs_clean)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/622 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/462k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/222k [00:00<?, ?B/s]

KeyboardInterrupt: 