In [2]:
from bertopic import BERTopic

from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance
from sklearn.datasets import fetch_20newsgroups
import pandas as pd  # Import pandas after reinstalling it
from sklearn.model_selection import train_test_split

In [3]:
# Documents to train on
df = pd.read_csv("output.csv")
df = df.loc[(df.label == "standard") |(df.label == "requirement")  ]
print(df.groupby("label").count())
print(df.head())
requirements = df.text.tolist()
labels = df.label.map({'standard': 1, 'requirement': 0}).tolist()

             text  document
label                      
requirement  1303      1303
standard      204       204
                                                text        label  \
0  CSP_Mid.CBF shall have a Maintenance Down Time...  requirement   
1  When commanded, CSP_Mid.CBF shall perform auto...  requirement   
2   Each box end end blast station paddle lift sh...  requirement   
3   Each pipe shall be transferred into the box e...  requirement   
4   Each of the vrollers shall be used on many st...  requirement   

         document  
0          SKAMid  
1          SKAMid  
2  JCanadaWelding  
3  JCanadaWelding  
4  JCanadaWelding  


In [4]:
train_requs , test_requs , train_labels , test_labels = train_test_split (
requirements , labels , random_state =500 , test_size =.2)

In [5]:
#print(train_requs)

In [6]:
from bertopic import BERTopic
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(hdbscan_model=hdbscan_model)


In [7]:

# The main representation of a topic
main_representation = KeyBERTInspired()


# Additional ways of representing a topic
aspect_model1 = PartOfSpeech("en_core_web_sm")
aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]

# Add all models together to be run in a single `fit`
representation_model = {
   "Main": main_representation,
   "Aspect1":  aspect_model1,
   "Aspect2":  aspect_model2 
}
topic_model = BERTopic(representation_model=representation_model, hdbscan_model=hdbscan_model).fit(train_requs)




In [8]:
topic_model.visualize_documents(train_requs)


In [9]:
topic_model.visualize_heatmap()


In [10]:
topic_model.visualize_hierarchical_documents()

TypeError: BERTopic.visualize_hierarchical_documents() missing 2 required positional arguments: 'docs' and 'hierarchical_topics'

In [11]:
import pandas as pd

topic_embeddings = topic_model.get_topic_info()

# Create a DataFrame from the topic embeddings
df = pd.DataFrame(topic_embeddings)

# Save the DataFrame to a CSV file
df.to_csv('topic_embeddings.csv', index=False)

In [11]:
hierarchical_topics = topic_model.hierarchical_topics(train_requs)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

import matplotlib.pyplot as plt
import networkx as nx

# Assuming `tree` is the topic tree
G = nx.DiGraph(tree)
pos = nx.nx_agraph.graphviz_layout(G, prog='dot')

plt.figure(figsize=(10, 10))
nx.draw(G, pos, with_labels=True)
plt.savefig('topic_tree.png')
plt.show()

  0%|          | 0/28 [00:00<?, ?it/s]

 11%|█         | 3/28 [00:00<00:00, 25.94it/s]

 21%|██▏       | 6/28 [00:00<00:00, 25.54it/s]

 32%|███▏      | 9/28 [00:00<00:00, 24.67it/s]

 43%|████▎     | 12/28 [00:00<00:00, 24.46it/s]

 54%|█████▎    | 15/28 [00:00<00:00, 24.24it/s]

 64%|██████▍   | 18/28 [00:00<00:00, 24.59it/s]

 75%|███████▌  | 21/28 [00:00<00:00, 24.27it/s]

 86%|████████▌ | 24/28 [00:00<00:00, 23.84it/s]

 96%|█████████▋| 27/28 [00:01<00:00, 22.61it/s]

100%|██████████| 28/28 [00:01<00:00, 23.70it/s]




.
├─mhz_frequency_range_precision_mid
│    ├─csp_cspmidcbf_mhz_cspmidlmc_frequency
│    │    ├─beamchannel_csp_mhz_cspmidcbf_rfi
│    │    │    ├─■──mhz_pulsar_beamforming_frequency_beam ── Topic: 16
│    │    │    └─csp_cspmidcbf_mhz_frequency_rfi
│    │    │         ├─■──beamchannels_beamchannel_rfi_mhz_bandwidths ── Topic: 12
│    │    │         └─■──csp_cspmidcbf_cspmidlmc_frequency_receptor ── Topic: 4
│    │    └─iec_cspmidcbf_csp_cpf_compliant
│    │         ├─■──cspmidcbf_csp_icd_sdp_is1msdpcsp001 ── Topic: 6
│    │         └─■──iec_cspmidcbf_csp_cpf_mid ── Topic: 2
│    └─sensitivity_ska1mid_range_rms_precision
│         ├─ska1mid_sensitivity_polarization_dish_range
│         │    ├─■──dsh_tm_monitoring_mode_modes ── Topic: 1
│         │    └─ska1mid_sensitivity_polarization_dish_range
│         │         ├─■──dish_pointing_ranging_tracking_coordinates ── Topic: 9
│         │         └─■──ska1_ska1mid_sensitivity_polarization_dish ── Topic: 10
│         └─■──precision_pointing

NetworkXError: Input is not a correct scipy sparse array type.

In [12]:
import pandas as pd

# Call the get_topic_info() method and save the output
topic_info = topic_model.get_topic_info()


# Convert the output to a pandas DataFrame
df = pd.DataFrame(topic_info)


# Save the DataFrame to a CSV file
df.to_csv('topic_info.csv', index=False)

In [13]:
topic_model.visualize_topics()

In [14]:


df = topic_model.get_document_info(train_requs)
df.to_csv('document info', index=False)

In [15]:
topic_model.visualize_barchart()



In [17]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Aspect1,Aspect2,Representative_Docs
0,-1,232,-1_ghz_rf_frequency_signal,"[ghz, rf, frequency, signal, station, band, re...","[band, frequency, number, rf, part, power, dat...","[ghz, rf, frequency, signal, receptor, languag...",[When commanded and processing data from Band ...
1,0,148,0_pipe_lift_lowered_lifting,"[pipe, lift, lowered, lifting, lifted, conveyo...","[pipe, station, end, vrollers, box, stop, thre...","[pipe, lift, conveyor, adjustable, mechanism, ...",[ Each pipe shall gravity roll to the adjustab...
2,1,86,1_dsh_tm_monitoring_mode,"[dsh, tm, monitoring, mode, modes, capability,...","[conditions, lightning, mode, state, equipment...","[dsh, tm, monitoring, modes, capability, comma...",[TM shall control DSH using the DSH external s...
3,2,86,2_iec_cspmidcbf_csp_cpf,"[iec, cspmidcbf, csp, cpf, mid, compliant, ele...","[latest, version, accordance, equipment, air, ...","[iec, cspmidcbf, radiated, emiemc, standards, ...",[CSP_Mid.CBF equipment connected to the CPF el...
4,3,58,3_retests_testing_retest_test,"[retests, testing, retest, test, specimen, spe...","[test, castings, specimen, specification, cast...","[retests, specification, specimens, casting, h...",[If the results of a valid test fail to confor...
5,4,48,4_csp_cspmidcbf_cspmidlmc_frequency,"[csp, cspmidcbf, cspmidlmc, frequency, recepto...","[subarray, receptor, time, delay, receptors, p...","[cspmidlmc, frequency, receptor, delay, utc, s...","[When commanded, CSP_Mid.CBF shall report at l..."
6,5,47,5_pipelines_pipe_tests_specimen,"[pipelines, pipe, tests, specimen, trench, tes...","[pipe, in, concrete, mm, diameter, test, lengt...","[pipelines, tests, trench, thickness, flatteni...",[The flattening test of Specification A530/A53...
7,6,47,6_cspmidcbf_csp_icd_sdp,"[cspmidcbf, csp, icd, sdp, is1msdpcsp001, inte...","[interface, packet, section, compliant, packet...","[cspmidcbf, icd, sdp, is1msdpcsp001, protocol,...",[The visibility interface between SDP and CSP_...
8,7,38,7_safety_equipment_personnel_ergonomics,"[safety, equipment, personnel, ergonomics, gua...","[personnel, equipment, safety, floor, system, ...","[equipment, personnel, ergonomics, guards, pro...",[The Supplier's equipment and system shall be ...
9,8,36,8_threading_line_inspection_pipe,"[threading, line, inspection, pipe, stations, ...","[threading, station, end, line, box, pin, dope...","[threading, inspection, pipe, stations, pin, a...",[ Each threading line shall have a box end end...


In [16]:
topic_model.visualize_heatmap()

In [14]:
topics, probs = topic_model.transform(test_requs)

In [9]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(train_requs, calculate_tokens=True)



In [19]:
topic_model.visualize_distribution(topic_distr[3])


In [20]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(test_requs, calculate_tokens=True)

# Visualize the token-level distributions
df = topic_model.visualize_approximate_distribution(test_requs[3], topic_token_distr[3])
df


Unnamed: 0,Steel,cast,in,ingots,or,strand,cast.1,is,permissible,When,...,material,by,any,estab_x0002_lished,procedure,that,positively,separates,the,grades
6_warranties_warranty_warrants_repair,0.0,0.0,0.139523,0.139523,0.139523,0.139523,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11_mixture_density_mixing_cement,0.0,0.0,0.100273,0.100273,0.100273,0.100273,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21_tooling_tools_equipment_components,0.0,0.0,0.110853,0.110853,0.110853,0.110853,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24_cspmidcbf_cspmidlmc_faults_maintenance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.213313,0.213313,0.108013,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28_cspmidcbf_cspmidlmcprovided_phasebinned_phase,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111937,0.111937,0.111937,0.111937,0.0
33_hardness_steel_alloys_iron,0.134431,0.134431,0.134431,0.134431,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50_criteria_fat_equipment_procedure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.131718,0.261752,0.397334,0.532916,0.401198,0.271164,0.135582,0.0,0.0
57_nameplate_equipment_stainless_supplier,0.102535,0.102535,0.102535,0.102535,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58_cspmidcbf_markings_packaging_labelled,0.0,0.0,0.127303,0.127303,0.127303,0.127303,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62_steel_metallic_metal_materials,0.126843,0.126843,0.126843,0.126843,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
topic_model.visualize_distribution(topic_distr[3])

In [12]:
# Preprocess your test data (test_requs) if necessary


# Use the trained BERTopic model for inference
test_topic_assignments, _ = topic_model.transform(test_requs)

# Retrieve the topic labels mapping from the BERTopic model
topic_labels = topic_model.get_topic_info()

# Create a dictionary to map numeric topic IDs to text labels
topic_id_to_label = {id: label for id, label in zip(topic_labels["Topic"], topic_labels["Name"])}

# Print the actual text and the assigned topic labels as text labels
for i, (test_doc, test_doc_topic) in enumerate(zip(test_requs, test_topic_assignments)):
    print(f"Test Document {i}:")
    print("Text:", test_doc)
    print("Assigned Topic:", topic_id_to_label[test_doc_topic])
    print("\n")


Test Document 0:
Text:  We shall provide engineering drawings in AutoCAD format computer software and documentation user manual maintenance manual preventive maintenance schedule bill of material configuration and setup information safety information and troubleshooting information and all documentation shall be provided on three CDs and printed upon request from the customer. 
Assigned Topic: 30_requirements_specifications_documents_manuals


Test Document 1:
Text: If the purchaser requires soundness tests to be performed, it shall be so stated in the purchase agreement, and the method and soundness requirements shall be detailed
Assigned Topic: -1_rf_ghz_mhz_frequency


Test Document 2:
Text: CSP_Mid.CBF shall require less than 18 Direct Maintenance Hours (DMH) per month at the Organizational Level (O-Level).
Assigned Topic: -1_rf_ghz_mhz_frequency


Test Document 3:
Text: Steel cast in ingots or strand cast is permissible. When steels of different grades are sequentially strand cast