In [None]:
# INSTALL
!pip install bertopic --upgrade bertopic

Collecting bertopic
  Downloading bertopic-0.16.3-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Downloading Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collec

# 1. Set up - Import libraries, mount drive and check directory

In [None]:
# Libraries used throughout the notebook
import os  # working-directory handling
import pandas as pd  # tabular data (DataFrames)

# Colab helper that makes Google Drive available inside the runtime
from google.colab import drive

# Attach Google Drive under /content/gdrive
drive.mount('/content/gdrive')

# Work from the project folder so the relative file names used in later cells resolve
project_dir = '/content/gdrive/MyDrive/RDS_Bert_2024/BERT_20240704'
os.chdir(project_dir)

# Confirm the working directory and list the files it contains
print(os.getcwd())
print(os.listdir())

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/RDS_Bert_2024/BERT_20240704
['RawData.csv', 'Example_2.csv', 'Example_2.gsheet', 'RawData_without_stopwords.csv', 'Screening_Model_v1', 'RawData.gsheet', 'RawData_without_stopwords.gsheet', 'Topicmodelling1.ipynb']


# 2. Preprocessing - Import the NLTK library, download the stopwords, load the CSV file and remove the stopwords from the text


In [None]:
# Preprocessing - strip English stopwords from every abstract and save the result.
# NLTK (Natural Language Toolkit) is an NLP library; it supplies the stopword list
# (common words such as "and", "it", "is").

import nltk
from nltk.corpus import stopwords

# Download the stopwords
nltk.download('stopwords')

# Define the stop words to remove (the NLTK English list is lowercase)
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is split on whitespace, every word whose lowercase form is in
    `stop_words` is dropped, and the remaining words are re-joined with single
    spaces. Comparing the lowercase form means capitalised stopwords
    ("The", "This", "We") are removed as well.

    Values that are not strings (for example a missing abstract read as NaN)
    are returned unchanged instead of raising on `.split()`.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


# Load the CSV file into a DataFrame
data = pd.read_csv('RawData.csv')

# Replace each abstract with its stopword-free version
# NOTE(review): this changes the saved text compared with earlier runs, so topic
# numbering produced by the model cells below may shift - re-check the hand-written
# topic labels after re-running.
data['Scientific_Abstract'] = data['Scientific_Abstract'].apply(remove_stopwords)

# Save the modified DataFrame to a new CSV file
data.to_csv('RawData_without_stopwords.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# 3. Train and fit the model - import libraries (e.g., BERTopic), extract the abstracts as a list of strings, define the count vectorizer, define a UMAP model and create a topic model

In [None]:
# Libraries for topic modelling
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Read the stopword-free data and pull the abstracts out as a list of strings
data = pd.read_csv("RawData_without_stopwords.csv")
abstracts = data['Scientific_Abstract'].astype(str).tolist()

# CountVectorizer converts text into numerical count vectors.
# ngram_range=(1, 2) means both unigrams (single words) and bigrams
# (pairs of consecutive words) are considered.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
)

# UMAP (Uniform Manifold Approximation and Projection) is a dimensionality-reduction
# technique: it simplifies complex data by reducing the number of variables.
# The random state is fixed so repeated runs give the same reduction.
# NOTE(review): passing a bare UMAP(random_state=42) replaces the UMAP settings
# BERTopic would otherwise choose itself - confirm that is intended.
umap_model = UMAP(random_state=42)

# Create the BERTopic model and fit it on the abstracts
topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    language="english",
)
topics, probs = topic_model.fit_transform(abstracts)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 4. Save the model and view results

In [None]:
# Persist the fitted topic model in the current working directory
from bertopic import BERTopic

model_path = "Screening_Model_v1"
topic_model.save(model_path)



In [None]:
# Topic overview: show the first 50 rows of the topic summary table
topic_info = topic_model.get_topic_info()
topic_info.head(50)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,901,-1_patients_care_health_people,"[patients, care, health, people, research, stu...",[Research question We plan evaluate feasibilit...
1,0,147,0_women_babies_pregnancy_birth,"[women, babies, pregnancy, birth, maternity, c...",[Aims This research assess new ways improve ac...
2,1,119,1_patients_heart_blood_study,"[patients, heart, blood, study, ulcers, risk, ...",[Aim(s) research: The trial testing whether be...
3,2,117,2_asthma_patients_copd_lung,"[asthma, patients, copd, lung, breathing, stud...",[Asthma commonest long-term disease children U...
4,3,66,3_research_health_local_public,"[research, health, local, public, public healt...",[What ‘Public Health Intervention Responsive S...
5,4,58,4_antibiotics_antibiotic_infection_infections,"[antibiotics, antibiotic, infection, infection...",[Aims research This research aims find shortes...
6,5,56,5_kidney_transplant_dialysis_ckd,"[kidney, transplant, dialysis, ckd, patients, ...",[Aims research This study investigate whether ...
7,6,54,6_vaccine_vaccination_vaccines_covid19,"[vaccine, vaccination, vaccines, covid19, covi...","[Research question: Influenza, MenACWY, HPV CO..."
8,7,46,7_health_research_systems_community,"[health, research, systems, community, countri...",[Background: Climate change projected increase...
9,8,45,8_care_social care_social_older,"[care, social care, social, older, carers, res...",[Background There little evidence Local Author...


In [None]:
# First 10 (word, score) pairs for topic 2
topic_2_words = topic_model.get_topic(2)
topic_2_words[:10]

[('asthma', 0.02120914237822243),
 ('patients', 0.014267967606431292),
 ('copd', 0.011531489261033165),
 ('lung', 0.011252867624491068),
 ('breathing', 0.010133808065807059),
 ('study', 0.009904345529261628),
 ('breathlessness', 0.009698755212654515),
 ('trial', 0.008856635458841794),
 ('oxygen', 0.0075593345507311),
 ('children', 0.007194941817355812)]

In [None]:
# Bar charts of the top 20 topics, 10 words per topic
topic_model.visualize_barchart(
    width=280,
    height=330,
    top_n_topics=20,
    n_words=10,
)

In [None]:
# Add topic labels

# Step 1: generate a label for every topic (the arguments below set the number of
# words, the topic prefix, the word length and the separator) and register them.
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=True,
    word_length=10,
    separator=", ",
)
topic_model.set_topic_labels(topic_labels)

# Step 2: replace the labels of topics 0-2 with hand-written names
manual_labels = {
    0: "0 - Women & Pregnancy",
    1: "1 - Heart & Blood",
    2: "2 - Asthma & breathing",
}
topic_model.set_topic_labels(manual_labels)

# Re-run the bar chart visualisation, this time with the custom labels (top 40 topics)
topic_model.visualize_barchart(
    width=280,
    height=330,
    top_n_topics=40,
    n_words=10,
    custom_labels=True,
)

In [None]:
# Heat map across topics, shown with the custom topic labels
heatmap_fig = topic_model.visualize_heatmap(n_clusters=20, custom_labels=True)
heatmap_fig

In [None]:
# Document-level visualisation, restricted to topics 0-29 and using the custom labels
shown_topics = list(range(30))
topic_model.visualize_documents(
    abstracts,
    topics=shown_topics,
    custom_labels=True,
    height=600,
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Hierarchical clustering of the topics, shown with the custom topic labels
hierarchy_fig = topic_model.visualize_hierarchy(custom_labels=True)
hierarchy_fig

In [None]:
# Intertopic distance map, shown with the custom topic labels
intertopic_fig = topic_model.visualize_topics(custom_labels=True)
intertopic_fig

In [None]:
# Refine the results by removing the outliers (EXCLUDE -1 UNKNOWN OUTLIERS)

# Load the data
abstract_data = pd.read_csv("RawData_without_stopwords.csv")

# Create a dataframe with project_ID, topic_ID and topic_probability.
# `topics` and `probs` come from the fit_transform call on the abstracts of this
# same file, so they line up row-for-row with `abstract_data`.
topic_df = pd.DataFrame({
    "ProjectID": abstract_data["ProjectID"],
    "Topic_ID": topics,
    "Topic_Prob": probs
})

# Remove outliers (Topic_ID = -1)
filtered_df = topic_df[topic_df['Topic_ID'] != -1]

# Display a sample of up to 30 rows. The size is capped at the number of rows
# available because DataFrame.sample raises when asked for more rows than exist,
# and the seed is fixed so the same rows are shown on every run.
sample_size = min(30, len(filtered_df))
print(filtered_df.sample(n=sample_size, random_state=42))


             ProjectID  Topic_ID  Topic_Prob
2149        NIHR205894        33    1.000000
2729  PB-PG-0817-20004         0    0.699548
1961        NIHR204706        24    0.822661
120      AI_AWARD01723        17    0.664356
1279        NIHR200490        16    0.816598
1312        NIHR200731        12    0.795619
2268        NIHR300504         2    1.000000
928         NIHR135278        26    1.000000
2769  PB-PG-1217-20018         2    0.758597
1183        NIHR156535        25    1.000000
891         NIHR134942         9    1.000000
391         NIHR128768        15    0.547581
1606        NIHR202753        52    1.000000
1283        NIHR200510        49    0.678455
2408        NIHR301634        51    1.000000
762         NIHR133168        46    0.945218
1573        NIHR202625        32    0.775402
1795        NIHR203681        36    1.000000
2209        NIHR206524        34    1.000000
470         NIHR129848        17    0.796308
314         NIHR127773        11    1.000000
2441      

In [None]:
# Refine the results for a particular topic (TOPIC 0 - Women & Pregnancy)

# Load the source data that the topic IDs and probabilities will be combined with
abstract_data = pd.read_csv("RawData_without_stopwords.csv")

# Extract the ProjectID column
ProjectID = abstract_data["ProjectID"]

# Create a new data frame with the project ID, topic ID, probability and the abstract.
# `topics`, `probs` and `abstracts` come from the model-fitting cell and are in the
# same row order as this file.
topic_df = pd.DataFrame({
    "ProjectID": ProjectID,
    "Topic_ID": topics,
    "Topic_Prob": probs,
    "Document": abstracts
})

# Keep only the documents assigned to topic ID 0 (women and baby)
filtered_df = topic_df[topic_df['Topic_ID'] == 0]

# Display a sample of up to 30 rows. The size is capped at the number of rows
# available because DataFrame.sample raises when asked for more rows than exist,
# and the seed is fixed so the same rows are shown on every run.
sample_size = min(30, len(filtered_df))
print(filtered_df.sample(n=sample_size, random_state=42))

                     ProjectID  Topic_ID  Topic_Prob  \
89                    17/89/07         0    1.000000   
1744                NIHR203474         0    1.000000   
1635                NIHR202920         0    0.843806   
1656                NIHR203024         0    1.000000   
238   ICA-CDRF-2018-04-ST2-020         0    0.836748   
1624                NIHR202850         0    1.000000   
1394                NIHR201424         0    1.000000   
2707          PB-PG-0418-20005         0    1.000000   
1325                NIHR200791         0    0.913488   
455                 NIHR129715         0    0.717452   
2610                NIHR302993         0    1.000000   
2537                NIHR302513         0    0.713743   
1337                NIHR200869         0    1.000000   
607                 NIHR131339         0    0.685927   
337                 NIHR127976         0    1.000000   
847                 NIHR134293         0    0.654456   
611                 NIHR131352         0    1.00

In [None]:
# Can the fitted model predict the topic and probability for new abstracts (e.g. Example 2)?

# Read the example file and take its abstracts as a list of strings
example_df = pd.read_csv("Example_2.csv")
new_abstracts = example_df["Scientific_Abstract"].astype(str).tolist()

# Predict a topic and a probability for each abstract in the example file
# NOTE(review): the model was fitted on stopword-stripped abstracts, whereas this
# text is passed in as read from Example_2.csv - confirm whether the same
# preprocessing should be applied here.
predicted_topics, predicted_probs = topic_model.transform(new_abstracts)

# Print results - in the recorded run the output was [0, 39] with probabilities
# of about 0.66 and 0.43, i.e. one prediction per input row (first row -> topic 0,
# second row -> topic 39), not two topics for a single abstract.
print(predicted_topics, predicted_probs)


[0, 39] [0.65714371 0.42680116]
