## Install the required libraries

In [1]:
# Install the required libraries
! pip install numpy==1.23.5
! pip install pandas==1.5.3
! pip install bertopic==0.14.1
! pip install umap-learn==0.5.3

[0m

## Import Libraries

In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np

from bertopic import BERTopic
from umap.umap_ import UMAP

# Set all columns and rows to be displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Import Data

In [3]:
# Import the data for modelling
journals = pd.read_csv('/kaggle/input/capstone-data/journals_processed.csv')

## Topic Modeling using BERTopic

In [4]:
# Initiate UMAP
umap_model = UMAP(n_neighbors=15, 
                  n_components=5, 
                  min_dist=0.0, 
                  metric='cosine', 
                  random_state=42)
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True, n_gram_range=(1,3))

# Run BERTopic model
topics, probabilities = topic_model.fit_transform(journals['tokens'])

In [5]:
# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,298,-1_design_approach_paper_development
1,0,115,0_sysml_language_modeling_simulation
2,1,68,1_product_development_process_approach
3,2,57,2_safety_analysis_fault_reliability
4,3,56,3_mission_cubesat_approach_space
5,4,49,4_research_industry_adoption_practice
6,5,33,5_requirement_design_process_paper
7,6,26,6_ontology_process_semantic_information
8,7,20,7_vehicle_electric_electric vehicle_energy
9,8,19,8_security_design_threat_cps


In [6]:
# Print the keywords for each topic
for i in range(17):
    print(f"Topic: {i}")
    print(topic_model.get_topic(i))
    print()

Topic: 0
[('sysml', 0.026879232499111897), ('language', 0.021349722094603667), ('modeling', 0.020878416585180283), ('simulation', 0.01408367617524561), ('modeling language', 0.012800376385211773), ('design', 0.011486957773409603), ('approach', 0.010843375583161645), ('tool', 0.00999464957567643), ('process', 0.009648720706141763), ('paper', 0.008662405067752172)]

Topic: 1
[('product', 0.03239532027861697), ('development', 0.0200671918827323), ('process', 0.014181376570230166), ('approach', 0.012299876689239313), ('product development', 0.0116290868859484), ('production', 0.011229632331798212), ('manufacturing', 0.009275072044726464), ('industrial', 0.008807265015920086), ('paper', 0.00845321078196325), ('industry', 0.008426784814174044)]

Topic: 2
[('safety', 0.03675404184755206), ('analysis', 0.021835374960850643), ('fault', 0.014469418021845908), ('reliability', 0.01404489445334496), ('design', 0.014032629041676676), ('failure', 0.013830569731769674), ('safety analysis', 0.011822609

In [7]:
# Visualize the top 10 keywords for each topic
topic_model.visualize_barchart(n_words=10, top_n_topics=25, height = 400)

In [8]:
# Visualize intertopic distance
topic_model.visualize_topics()

**Summary of each topic**

|Topic|Summary|
|-----|-------|
|0| How to use MBSE|
|1| Product Development Process|
|2| Safety and Reliability|
|3| Aerospace|
|4| Adoption of MBSE|
|5| Requirements|
|6| Ontology| 
|7| Electric Vehicles|
|8| Physical and Cyber Security|
|9| Evaluation of MBSE Implementation and Verification and Validation|
|10| Military Navel Systems|
|11| Physical Human Systems such as Unmanned Ops|
|12| Verification and Validation|
|13| Adoption of MBSE, especially in the context of complexity, reuseability and cost effectiveness|
|14| Digital Twin|
|15| Verification and Validation|
|16| Aviation|



**Topics to Manually Merge in BERTopic**

|S/N|Topics to Merge|Reason|
|---|---------------|------|
|1| 9, 12, 15| Both topics are about verification and validation|
|2| 3, 7, 10, 11 16| Infrastructure and domain specific projects. Conceptually, they are similar|
|3| 4, 6, 13| Defining the engineering ontology is a key portion of adoption, hence these three topics are related|
|4| 8, 14| Combining the topics that are related to the digital realm like cybersecurity and digital twin|

In [9]:
#  Define the topics to merge together
topics_to_merge = [[9, 12, 15], 
                   [3, 7, 10, 11, 16],
                   [4, 6, 13],
                   [8, 14]]

# Manually merge the similar topics in BERTopic
topics_reduced = topic_model.merge_topics(journals['tokens'], topics_to_merge)

In [10]:
# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,298,-1_design_approach_paper_development
1,0,117,0_design_mission_approach_development
2,1,115,1_sysml_language_modeling_simulation
3,2,88,2_ontology_paper_approach_research
4,3,68,3_product_development_process_approach
5,4,57,4_safety_analysis_design_fault
6,5,42,5_approach_development_design_method
7,6,33,6_requirement_design_process_paper
8,7,32,7_twin_digital twin_digital_security


In [11]:
# Visualize the top 10 keywords for each topic
topic_model.visualize_barchart(n_words=10, top_n_topics=25, height = 400)

In [12]:
# Visualize intertopic distance
topic_model.visualize_topics()

In [13]:
# Update the topics
topics_updated = topic_model.topics_

In [14]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(journals['tokens'], topics_updated)
topic_model.update_topics(journals['tokens'], topics=new_topics)

In [15]:
# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,221,0_design_approach_architecture_development
1,1,185,1_sysml_modeling_language_simulation
2,2,135,2_paper_approach_research_design
3,3,109,3_product_development_process_approach
4,4,66,4_safety_analysis_design_fault
5,5,46,5_approach_development_design_inconsistency
6,6,49,6_requirement_process_design_paper
7,7,39,7_digital_twin_digital twin_security


In [16]:
# Visualize the top 10 keywords for each topic
topic_model.visualize_barchart(n_words=10, top_n_topics=25, height = 400)

In [17]:
# Visualize intertopic distance
topic_model.visualize_topics()

In [18]:
# Define the seed topic list
seed_topic_list = [['mission', 'architecture', 'project'],
                   ['sysml', 'language', 'modeling', 'simulation', 'modeling language'],
                   ['ontology', 'research', 'practice', 'method'],
                   ['product', 'production', 'manufacturing', 'mechatronic', 'management' ,'inconsistency'],
                   ['safety' ,'analysis', 'fault', 'reliability', 'failure', 'safety analysis'],
                   ['cubestat', 'engineer', 'specification', 'set', 'theory'],
                   ['twin', 'digital twin', 'digital', 'security', 'service', 'competency', 'data', 'critical']]

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, language="english", 
                       calculate_probabilities=True, n_gram_range=(1,3), 
                       seed_topic_list=seed_topic_list, nr_topics=8)

# Run BERTopic model
topics, probabilities = topic_model.fit_transform(journals['tokens'])

# Reduce outliers
new_topics = topic_model.reduce_outliers(journals['tokens'], topics_updated)
topic_model.update_topics(journals['tokens'], topics=new_topics)

In [19]:
# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,344,0_design_approach_architecture_development
1,1,166,1_sysml_modeling_language_simulation
2,2,97,2_ontology_research_approach_paper
3,3,73,3_product_development_process_approach
4,4,57,4_safety_analysis_fault_reliability
5,5,48,5_digital_approach_method_development
6,6,33,6_requirement_design_process_paper
7,7,32,7_twin_digital twin_digital_security


In [20]:
# Visualize intertopic distance
topic_model.visualize_topics()

In [21]:
# Visualize the top 10 keywords for each topic
topic_model.visualize_barchart(n_words=10, top_n_topics=25, height = 400)

In [22]:
# Update the topics
topics_final = topic_model.topics_

# Include the topics into the journals dataframe
journals['topic'] = topics_final

In [23]:
# Check the journals dataframe
journals.head()

Unnamed: 0,title,abstract,year,tokens,topic
0,Model-based Design Process for the Early Phase...,This paper presents an approach for a model-ba...,2017,paper present approach planning process early ...,3
1,Model Based Systems Engineering using VHDL-AMS,The purpose of this paper is to contribute to ...,2013,purpose paper contribute definition ( ) approa...,6
2,Code Generation Approach Supporting Complex Sy...,Code generation is an effective way to drive t...,2022,code generation effective way drive complex de...,1
3,Model based systems engineering as enabler for...,"Product complexity is steadily increasing, cus...",2021,"product complexity steadily increasing , custo...",3
4,Electric Drive Vehicle Development and Evaluat...,To reduce development time and introduce techn...,2014,reduce development time introduce technology f...,0


In [24]:
# Export the journals dataframe as a csv file
journals.to_csv('/kaggle/working/journals_topics.csv', index=False)