In [1]:
import pandas as pd
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel
import os
import re
import time
import unicodedata
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, models
# Load model directly
from transformers import AutoModelForMaskedLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from umap import UMAP
from hdbscan import HDBSCAN
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

In [2]:
# Load the merged data
input_path = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\Merged Data\Merged_data_ML.csv"
df_ml = pd.read_csv(input_path, sep=',', encoding='utf-8')



Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.



In [3]:

# Define the normalization function
def normalize_text(text):
    # 1. Unicode standardization
    text = unicodedata.normalize("NFKC", text)
    # 2. Lowercase conversion
    text = text.lower()
    # 3. Remove HTML tags and references
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\[\d+\]', '', text)
    # 4. Replace non-alphanumeric characters with space
    text = re.sub(r'[^\w\s.!?]', ' ', text)
    # 5. Merge multiple spaces into one
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply to the DataFrame
df_ml['clean_text'] = df_ml['patent_text'].fillna("").apply(normalize_text)

In [4]:
print(df_ml.columns.tolist())


['pub_dt', 'predict93_ml', 'ai_score_ml', 'predict93_evo', 'ai_score_evo', 'predict93_nlp', 'ai_score_nlp', 'predict93_speech', 'ai_score_speech', 'predict93_vision', 'ai_score_vision', 'predict93_planning', 'ai_score_planning', 'predict93_kr', 'ai_score_kr', 'predict93_hardware', 'ai_score_hardware', 'patent_id', 'patent_type', 'patent_title', 'num_claims', 'disambig_assignee_organization', 'assignee_type', 'patent_abstract', 'patent_text', 'clean_text']


In [5]:
# High Frequency Words Extraction

# 1. Self-contained stop words for domain-specific filtering
domain_stop_words = {
    'method', 'system', 'invention', 'apparatus',
    'device', 'provide', 'include', 'comprise', 'includes',
    'comprising', 'comprises', 'wherein', 'whereby',
    'embodiment', 'disclosure', 'application', 'patent',
    'user', 'information', 'processing',
    'set', 'based','gain','data'
}
custom_stop_words = list(ENGLISH_STOP_WORDS.union(domain_stop_words))

# 2. Load the dataset
df_ml['pub_dt'] = pd.to_datetime(
    df_ml['pub_dt'],
    errors='raise',              
    infer_datetime_format=True   
)
texts = df_ml['clean_text'].astype(str).tolist()
timestamps = df_ml['pub_dt'].to_list()

# 3. Identify the high-frequency words using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=custom_stop_words, max_df=0.9, min_df=1, ngram_range=(1, 5))
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
word_scores = dict(zip(feature_names, tfidf_scores))

# 4. Sort and display the top 20 high-frequency words
sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
print("Top 20 High frequency words：", sorted_words[:20])

# # Generate a word cloud for visualization, used for future reference
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_scores)
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

Top 20 High frequency words： [('image', np.float64(824.2106871660712)), ('model', np.float64(728.8277564517223)), ('network', np.float64(643.9653815249948)), ('plurality', np.float64(598.5892419474391)), ('learning', np.float64(558.1022608280825)), ('second', np.float64(540.318411762814)), ('using', np.float64(519.7525756753726)), ('input', np.float64(489.6430006511739)), ('machine', np.float64(461.70585446899304)), ('neural', np.float64(448.9299799688211)), ('content', np.float64(445.1731601385512)), ('training', np.float64(434.30426884814676)), ('systems', np.float64(427.07692674879223)), ('methods', np.float64(423.1146390849231)), ('computer', np.float64(415.3510781972855)), ('associated', np.float64(412.21876217530996)), ('machine learning', np.float64(394.8253492041094)), ('feature', np.float64(389.79874815057417)), ('neural network', np.float64(375.1116120876223)), ('object', np.float64(366.8138859259728))]


In [6]:
from nltk import sent_tokenize
import nltk
from tqdm import tqdm
import random
import psutil
import numpy as np
# Sentence analysis and embedding generation test
# Download punkt_tab to fix LookupError
nltk.download('punkt_tab')

# 1. Split sentences from the cleaned text and keep the index of the document
sentences = []
doc_ids = []
for doc_id, text in enumerate(tqdm(df_ml['clean_text'].astype(str), desc="Splitting sentences")):
    doc_sentences = sent_tokenize(text)
    for sentence in doc_sentences:
        if len(sentence.split()) >= 8:
            sentences.append(sentence)
            doc_ids.append(doc_id)

# Validate the length of sentences and doc_ids
print(f"Length of sentences: {len(sentences)}")
print(f"Length of doc_ids: {len(doc_ids)}")
sentence_df = pd.DataFrame({'sentence': sentences, 'doc_id': doc_ids})
end_preprocess = time.time()
# print(f"▶ Preprocessing time cost: {end_preprocess - start_preprocess:.0f} seconds")
# print(f"▶ Number of sentences: {len(sentences)}")
# Validate the split sentences
sentence_counts = [len(sent_tokenize(text)) for text in df_ml['clean_text'].astype(str)]
print("Sentence count per document:")
print(pd.Series(sentence_counts).describe())

# 2. Generate embeddings for the sentences using SentenceTransformer
start_embed = time.time()
embed_model = SentenceTransformer("AI-Growth-Lab/PatentSBERTa", device="cuda")
sentence_embeddings = embed_model.encode(
    sentences,
    batch_size=64,  
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
end_embed = time.time()
print(f"▶ Embedding time cost: {end_embed - start_embed:.0f} seconds")


# 3. Configure UMAP, HDBSCAN models, amd representation models
umap_model = UMAP(
    n_neighbors=10, 
    n_components=5, 
    min_dist=0.02,
    metric='cosine', 
    low_memory=True,
    random_state=42
    )

hdbscan_model = HDBSCAN(
    min_cluster_size=20, 
    min_samples=5,
    cluster_selection_method="leaf",
    cluster_selection_epsilon=0.02,
    prediction_data=True)

# Representation models for BERTopic
# KeyBERT
keybert_model = KeyBERTInspired()
# Part-of-Speech    
pos_model = PartOfSpeech("en_core_web_sm")
# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)
# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}
# 4. Optional: Batch UMAP
batch_size = 100000
umap_embeddings = []
start_umap = time.time()
for i in range(0, len(sentence_embeddings), batch_size):
    batch_embeddings = sentence_embeddings[i:i+batch_size]
    batch_umap = umap_model.fit_transform(batch_embeddings)
    umap_embeddings.append(batch_umap)
    
umap_embeddings = np.vstack(umap_embeddings)
end_umap = time.time()
print(f"▶ UMAP time: {end_umap - start_umap:.0f} seconds")

# 5. Define the CountVectorizer used for c-TF-IDF
vectorizer_model = CountVectorizer(
    stop_words=custom_stop_words,
    max_df=0.8,  # More lenient filtering to match c-TF-IDF settings
    min_df=5,
    ngram_range=(1, 3),
    max_features=10000  # Set the maximum number of features to 10,000
)
# 6. Define seed topics for BERTopic
seed_topic_list = [
    ["neural", "network", "deep", "learning", "convolutional", "recurrent"],
    ["support", "vector", "machine", "svm", "kernel"],
    ["decision", "tree", "random", "forest", "gradient", "boosting"],
    ["reinforcement", "learning", "agent", "policy", "reward"],
    ["natural", "language", "processing", "nlp", "transformer", "attention", "bert"],
    ["clustering", "kmeans", "hierarchical", "dbscan"],
    ["graph", "node", "edge", "gnn", "embedding"],
    ["generative", "adversarial", "network", "gan"],
    ["transfer", "learning", "domain", "adaptation"],
    ["bayesian", "inference", "probabilistic", "model"],
    ["computer", "vision", "image", "segmentation"],
    ["unsupervised", "learning", "autoencoder", "vae"],
    ["time", "series", "forecasting", "lstm"],
    ["pca", "tsne", "umap", "reduction"],
    ["anomaly", "outlier", "detection", "rare"],
    ["gradient", "descent", "optimizer", "adam", "sgd"],
    ["fairness", "bias", "explainable", "interpretability", "shap", "lime"],
    ["federated", "privacy", "secure", "decentralized"],
    ["automl", "nas", "search", "hyperparameter", "tuning"],
    ["system", "module", "processor", "memory", "plurality"],              
    ["data", "processing", "input", "output", "feature", "extraction"],   
    ["model", "training", "learning", "fine-tuning", "optimization"],     
    ["image", "segmentation", "detection", "reconstruction", "processing"],
    ["automl", "nas", "search", "hyperparameter", "tuning"],
    ["patent", "invention", "claim", "method", "apparatus", "system"]
]

# 7. Initialize and fit the BERTopic model
topic_model = BERTopic(
    embedding_model=embed_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,  # Use CountVectorizer
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    min_topic_size=20,
    # nr_topics=80,
    seed_topic_list=seed_topic_list,
    calculate_probabilities=False,
    nr_topics=200
)



[nltk_data] Downloading package punkt_tab to C:\Users\Yue
[nltk_data]     Qiao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Splitting sentences: 100%|██████████| 87678/87678 [00:05<00:00, 16984.92it/s]


Length of sentences: 388970
Length of doc_ids: 388970
Sentence count per document:
count    87678.000000
mean         4.929960
std          2.091476
min          1.000000
25%          3.000000
50%          5.000000
75%          6.000000
max         32.000000
dtype: float64


Batches:   0%|          | 0/6078 [00:00<?, ?it/s]

▶ Embedding time cost: 600 seconds
▶ UMAP time: 282 seconds


In [7]:
# Save the sentence embeddings to a file
import numpy as np, os
save_dir = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\Embeddings"
os.makedirs(save_dir, exist_ok=True)

emb_path = os.path.join(save_dir, "embeddings.npy")
np.save(emb_path, sentence_embeddings)

print("✅ Embedding is saved to:", emb_path)

✅ Embedding is saved to: E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\Embeddings\embeddings.npy


In [9]:
# When to load the embeddings later, uncomment the following line:
#embeddings = np.load(r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\Embeddings\embeddings.npy")
# Start the topic modeling process
start_tm = time.time()
topics = topic_model.fit_transform(
    sentences,
    sentence_embeddings  
)
end_tm = time.time()
print(f"▶ BERTopic Cludester and topic extraction cost：{end_tm - start_tm:.0f} seconds")

# # Time prediction (training phase) 
# fit_time_per_sentence = (end_tm - start_tm) / len(sentences)
# estimated_fit_time = fit_time_per_sentence * full_sentence_estimate
# print(f"▶ Estimated training time for full dataset ({full_sentence_estimate:.0f} sentences): {estimated_fit_time:.0f} seconds")
# print(f"▶ Estimated total time (embedding + training): {estimated_embed_time + estimated_fit_time:.0f} seconds")  

▶ BERTopic Cludester and topic extraction cost：841 seconds


In [10]:
# Save the model and topics

import pickle
save_dir = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model"
os.makedirs(save_dir, exist_ok=True)

model_path  = os.path.join(save_dir, "my_bertopic_model")  
topics_path = os.path.join(save_dir, "topics.pkl")
#probs_path  = os.path.join(save_dir, "probs.npy")

topic_model.save(model_path, serialization="pickle")
# with open(topics_path, "wb") as f:
#     pickle.dump(topics, f)
# np.save(probs_path)

print("✅ Model is saved to：", model_path)
print("✅ topics.pkl Save completed：", topics_path)
#print("✅ probs.npy  Save completed", probs_path, f"(shape: {probs.shape})")



✅ Model is saved to： E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\my_bertopic_model
✅ topics.pkl Save completed： E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\topics.pkl


In [11]:
# Load the model and transform the sentences
topics, probs = topic_model.transform(
  sentences,
  embeddings=sentence_embeddings
)


In [12]:
# Reduce outliers (Topic -1) using c-TF-IDF

new_topics = topic_model.reduce_outliers(   
    sentences,
    topics,
    strategy="c-tf-idf",
    threshold=0.05  
)


In [13]:
# Update topics with the new topics
topic_model.update_topics(
    sentences,      # docs 列表
    new_topics,     # topics 列表
    vectorizer_model=vectorizer_model
)

topics = new_topics
# from collections import Counter

# print("new_topics distributions:", Counter(new_topics))              
# print(topic_model.get_topic_info().head())  


In [14]:
# save the new topics and probs
save_dir = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model"
os.makedirs(save_dir, exist_ok=True)
topics_path = os.path.join(save_dir, "topics.pkl")
probs_path  = os.path.join(save_dir, "probs.npy")
with open(topics_path, "wb") as f:
    pickle.dump(topics, f)

# 5. Save the probabilities
np.save(probs_path, probs)

print(f"✅ topics.pkl is saved to：{topics_path}")
print(f"✅ probs.npy is saved to: {probs_path} (shape: {probs.shape})")

doc_ids_path = os.path.join(save_dir, "doc_ids.pkl")
with open(doc_ids_path, "wb") as f:
    pickle.dump(doc_ids, f)
print(f"✅ doc_ids.pkl is saved to:{doc_ids_path}")

✅ topics.pkl is saved to：E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\topics.pkl
✅ probs.npy is saved to: E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\probs.npy (shape: (388970,))
✅ doc_ids.pkl is saved to:E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\doc_ids.pkl


In [15]:
# #For loading the model and topics later, you can use the following code:
from bertopic import BERTopic
import pickle, numpy as np
import pandas as pd
import os


In [16]:
model_dir   = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\my_bertopic_model"
topics_path = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\topics.pkl"
probs_path  = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\probs.npy"
docs_path   = r"E:\Waterloo-onedrive\OneDrive - University of Waterloo\MSE 641\BERTopic model\doc_ids.pkl"


In [17]:

topic_model = BERTopic.load(model_dir)
with open(topics_path, "rb") as f:
    topics = pickle.load(f)  # list of topic_ids


probs = np.load(probs_path, allow_pickle=True)  # shape: (n_docs, n_topics)

with open(docs_path, "rb") as f:
    doc_ids = pickle.load(f)


# 1. Extract and export topic information (improve deduplication)
out_dir = os.path.dirname(model_dir)
topic_info = topic_model.get_topic_info()


def dedup_phrases(phrase_list):
    cleaned = []
    for p in phrase_list:
        if any(p in q.split() for q in cleaned if p != q):
            continue
        cleaned = [q for q in cleaned if not (q in p.split() and q != p)]
        cleaned.append(p)
    return cleaned

# 2. Transform the Name column to a list of phrases and deduplicate
def clean_name(name_str):
    phrases = name_str.split('_')        # The original phrases are separated by underscores
    deduped = dedup_phrases(phrases)

    return "_".join(deduped[:5])
# Output the cleaned names
topic_info['clean_name'] = topic_info['Name'].apply(clean_name)

# Save the topic_info
topic_info.to_csv(os.path.join(out_dir, "topic_info.csv"),
                  index=False, encoding="utf-8-sig")
# Save ID→Name mapping
topic_info[['Topic','clean_name']].to_csv(
    os.path.join(out_dir, "topic_id_name.csv"),
    index=False, encoding="utf-8-sig"
)

# 3. Create a DataFrame for documents and their topics
df_docs = pd.DataFrame({
    "doc_id": doc_ids,
    "topic_id": topics,
    "max_prob": [p.max() for p in probs]
})
df_docs = df_docs.merge(
    topic_info[['Topic','clean_name']],
    left_on='topic_id', right_on='Topic',
    how='left'
).drop(columns=['Topic']).rename(columns={'clean_name':'topic_name'})
df_docs.to_csv(os.path.join(out_dir, "docs_topics.csv"),
               index=False, encoding="utf-8-sig")

print("Remove repulication and the result is saved：")
print("   - topic_info.csv")
print("   - topic_id_name.csv ）")
print("   - docs_topics.csv ")

Remove repulication and the result is saved：
   - topic_info.csv
   - topic_id_name.csv ）
   - docs_topics.csv 
