In [2]:
import pickle, os, json

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that were produced by this project.
with open('clusters_topics_all.pkl', 'rb') as f:
    clusters = pickle.load(f)

# The topics file is only read here, so plain "r" is used: "r+" would also
# require write permission and fail on a read-only file.
with open("AI4AM_topics3.json", "r") as f:
    topics = json.load(f)

with open('optimal_clusters.pkl', 'rb') as f:
    optimal_clusters = pickle.load(f)

# One "<topic name> : <description>" string per topic, in the iteration order
# of `topics`.
# NOTE(review): later cells pair these strings positionally with the 'labels'
# arrays in `optimal_clusters` - presumably both follow the same order; confirm.
list_topics = [f"{k} : {v['description']}" for k, v in topics.items()]

In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from utils import OPENAI_API_KEY

# Chat model used by the first evaluation chain; temperature=0 is requested
# for every call.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [4]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks the LLM to (1) score how coherent one cluster of topics is
# and (2) give the cluster a short name.
# The answer must be a JSON object with exactly ONE key, because the calling
# code only reads the first key of the parsed answer.
# Literal braces in the example are written as "{{" / "}}" so that the template
# engine does not mistake them for input variables; {list_topics} is the only
# variable.
topics_estimate = PromptTemplate.from_template("""
You are an expert in analyzing scientific papers.
You have received a list of titles of research and study topics:
###
{list_topics}
###
Your tasks:
Step 1. Rate on a scale from 0 to 10 how consistent this list of topics is,
i.e. how well the topics can be grouped into one common theme or cluster.
A score of 0 means the topics have absolutely nothing in common, 10 means they are almost identical.

Step 2. Give this list a common name or topic title, a summary.

The output should be in JSON format as a dictionary with exactly one key-value pair:
The key is the short name of these topics. The value is a score from 0 to 10 indicating how consistent this list of topics is.
Example of output:
{{"Advanced Computational Techniques in Materials and Manufacturing": 7}}
""")

In [5]:
import numpy as np

# Display the keys of `optimal_clusters`; the cells below iterate over these
# entries and read each entry's 'labels'.
optimal_clusters.keys()

dict_keys(['gauss_OpenAI', 'kmeans_OpenAI', 'gauss_HuggingFace', 'kmeans_HuggingFace', 'hdbscan_OpenAI', 'hdbscan_HuggingFace', 'hdbscan_HuggingFace + gauss_HF', 'hdbscan_HuggingFace + kmeans_HF'])

### LLM evaluation of clustering algorithms and naming of clusters

In [6]:
from langchain_core.output_parsers import JsonOutputParser, SimpleJsonOutputParser

# Chain that summarizes one cluster of topics: prompt -> chat model -> JSON parser.
# The cells below call `.invoke({"list_topics": ...})` on it and read the
# result as a dict.
title_sum = topics_estimate | llm | JsonOutputParser()
# Alternative parser, currently disabled:
#title_sum = topics_estimate | llm | SimpleJsonOutputParser()

In [7]:
def get_topic_list(topics_names, cl_labels, cluster):
    """Return the topics whose label equals ``cluster``.

    ``topics_names`` and ``cl_labels`` are paired positionally with ``zip``
    (so the shorter of the two limits the result). Every selected topic is
    converted to a string; the original order is preserved.

    Parameters:
    - topics_names: iterable of topics.
    - cl_labels: iterable of cluster labels, one per topic.
    - cluster: the label to select.

    Returns:
    - list of str: the topics assigned to ``cluster``.
    """
    return [
        f"{name}"
        for name, label in zip(topics_names, cl_labels)
        if label == cluster
    ]

def remove_topic(ltopics, rtopics):
    """Remove from ``ltopics``, in place, every item listed in ``rtopics``.

    For each item of ``rtopics`` only its first occurrence in ``ltopics`` is
    removed; items that are not present are skipped silently.

    Parameters:
    - ltopics (list): list that is modified in place.
    - rtopics (iterable): items to remove.

    Returns:
    - None
    """
    for unwanted in rtopics:
        if unwanted not in ltopics:
            continue
        ltopics.remove(unwanted)
    return None

def get_outliers(outliers):
    """Turn every outlier topic into its own single-topic cluster.

    Each entry is expected to look like ``"<name>:<description>"``. It is split
    at the FIRST colon only, so a description that itself contains colons is
    kept intact (splitting on every colon used to throw such descriptions
    away). An entry without any colon gets a single space as its description.

    Parameters:
    - outliers (iterable of str): outlier topic strings.

    Returns:
    - dict: ``{name: {"score": 10, "list": ["<name>:<description>"]}}``; when
      two entries share a name, the later one wins.
    """
    singles = {}
    for entry in outliers:
        name, separator, description = entry.partition(':')
        if not separator:
            # No colon at all: keep the historical placeholder description.
            description = " "
        singles[name] = {"score": 10,
                         "list": [f"{name}:{description}"]}
    return singles

def get_answer(model, text):
    """Placeholder that is not implemented yet.

    Both arguments are currently ignored and ``None`` is returned.
    """
    return None

In [120]:
def print_dict(d, indent=0):
    """
    Print a (possibly nested) dictionary to stdout.

    Every key is printed on its own line, prefixed by three spaces per nesting
    level and followed by ": ". A nested dictionary is introduced by an empty
    line and printed recursively one level deeper; any other value is printed
    on the following line after two leading spaces.
    """
    prefix = '   ' * indent
    for name, content in d.items():
        print(prefix + str(name) + ": ")
        if not isinstance(content, dict):
            print(" ", str(content))
            continue
        print()  # blank line before the nested dictionary
        print_dict(content, indent + 1)

In [8]:
# For every clustering result, ask the LLM to name and score each cluster.
# Result layout: {algorithm: {cluster title: {"score": ..., "list": [topics]}}}
clusters_topic = dict()

for algo_name, clustering in optimal_clusters.items():
    print("Cluster Algorithm:", algo_name)
    labels = clustering['labels']
    named_clusters = dict()
    for label in np.unique(labels):
        print("Cluster:", label)
        members = get_topic_list(list_topics, labels, cluster=label)
        if label != -1:
            answer = title_sum.invoke({"list_topics": members})
            # The prompt asks for one "title: score" pair; only the first key is used.
            title = list(answer.keys())[0]
            score = answer[title]
            # The LLM may give two clusters the same title. Without a distinct
            # key the later cluster would silently overwrite the earlier one,
            # which loses a cluster and skews the cluster count and mean score.
            cluster_key = title
            if cluster_key in named_clusters:
                cluster_key = f"{title} (cluster {label})"
            named_clusters[cluster_key] = {"score": score,
                                           "list": members}
            print(answer, "\n")
            print("\n".join(members))
        else:
            # Label -1 is handled as outliers: every topic becomes its own entry.
            print("Outlaier Topics:\n", "\n".join(members))
            named_clusters.update(get_outliers(members))
        print(20*'-')
    print(10*'-*-')
    clusters_topic[algo_name] = named_clusters

Cluster Algorithm: gauss_OpenAI
Cluster: 0
{'Quantum Computing and Computational Methods in Quantum Mechanics': 8} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Density Functional Theory (DFT) : A quantum mechanical method used to investigate the electronic structure of many-body systems, particularly atoms, molecules, and the condensed phases.
Self-Consistent Field (SCF) : A computational method used in quantum chemistry to find the approximate solution of the Schrödinger equation for many-electron systems.
Quantum Circuit Optimization : The process of improving the efficiency and performance of quant

{'Machine Learning and Its Subfields': 9} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Active Learning Algorithm (ALA) : A machine learning approach that iteratively selects the most informative data points to improve model performance, particularly in the context of materials design.
Reinforcement Learning : A type of machine learning where an agent learns to make decisions by receiving rewards or penalties based on its actions.
Active Learning (

{'Machine Learning Techniques in Data Analysis': 8} 

Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful for predicting properties of materials based on their atomic structure.
Convolutional Neural Networks : A class of deep learning algorithms used for analyzing visual imagery, particularly effective in extracting features from data such as X-ray diffraction patterns.
Neural Networks : A computational approach that mimics the way human brains operate, used here to generate potential energy surfaces for modeling reactions.
Graph Neural Networks (GNNs) : A type of neural network designed to work directly with graph-structured data, useful for modeling complex relationships in materials science.
Artificial Neu

{'Advanced Computational Techniques in Materials Science and Machine Learning': 8} 

Synthetic Image Generation : The process of creating synthetic images that mimic real experimental conditions for training machine learning models.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful for predicting properties of materials based on their atomic structure.
Convolutional Neural Networks : A class of deep learning algorithms used for analyzing visual imagery, particularly effective in extracting features from data such as X-ray diffraction patterns.
Genetic Algorithm (GA) : An optimization technique inspired by the process of natural selection, used to generate novel structures in the context of materials researc

{'Catalytic Processes and Energy Conversion': 8} 

CO2 Reduction : The process of converting carbon dioxide into useful products, such as hydrocarbons, often using catalysts to facilitate the reaction.
Nanoparticles in Catalysis : The study of nanoparticles and their role in enhancing catalytic processes, often leading to increased reaction rates and selectivity.
Catalysis by Clusters : Research focused on the catalytic properties of atomic clusters, which can exhibit unique behaviors compared to bulk materials due to their size and composition.
Photoinduced Reactions : Chemical reactions that are initiated or accelerated by the absorption of light, particularly relevant in the study of surface dynamics.
Catalysis : The process of increasing the rate of a chemical reaction by adding a substance known as a catalyst, which is not consumed in the reaction.
Fuel Cells : Devices that convert chemical energy from fuels into electricity through electrochemical reactions, often using hydrogen 

{'Statistical Mechanics and Computational Methods in Materials Science': 8} 

Statistical Mechanics and Thermodynamics : A branch of theoretical physics that uses statistical methods to explain the behavior of systems with a large number of particles, particularly at finite temperatures, and includes the study of thermodynamic potentials and phase behavior.
Surface Dynamics : Curves that describe how molecules adhere to surfaces at constant temperature, relevant for materials like metal-organic frameworks (MOFs).
Automated Crystal Structure Solution : A hybrid approach combining computational methods and experimental techniques to identify crystal structures efficiently.
Molecular Dynamics : A computer simulation method for analyzing the physical movements of atoms and molecules, allowing for the study of the time-dependent behavior of a molecular system.
Vibrational Spectroscopy : A technique used to study the vibrational modes of molecules, providing insights into molecular structure

{'Quantum and Advanced Materials Science': 8} 

Quantum Materials : Materials that exhibit quantum mechanical properties, which are of interest for their potential applications in advanced technologies such as quantum computing and superconductivity.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Crystallography : The study of crystals and their structures, including the analysis of crystal formation and properties.
Materials Informatics : An interdisciplinary field that combines materials science and data science to discover, design, and screen new materials using computational methods.
Organic Electronics : A field of study focused on the use of organic materials in electronic devices, including organic semiconductors and organic photovoltaics.
Oxide Electronics : The study and application 

{'Crystal Structure and Material Analysis Techniques': 9} 

Crystal Structure Determination : The process of identifying the arrangement of atoms within a crystal, often using techniques such as X-ray diffraction.
Noise Reduction : The process of removing unwanted disturbances from signals, in this case, electron diffraction patterns, to enhance the quality of the data.
X-ray Diffraction : A technique used to study the structure of materials by observing the scattering of X-rays, providing information about the arrangement of atoms in a crystal.
Strain Profile Analysis : The study of how materials deform under stress, which can be inferred from X-ray diffraction data to understand material properties.
--------------------
-*--*--*--*--*--*--*--*--*--*-
Cluster Algorithm: kmeans_HuggingFace
Cluster: 0
{'Materials Data Science and Interoperability': 8} 

Semantic Interoperability : The ability of different systems to exchange and make use of information in a meaningful way, particularly 

{'Materials Science and Engineering': 8} 

Hydrogen Bond Network : The network of hydrogen bonds that connect water molecules, which plays a crucial role in determining the properties of liquid water.
Noise Reduction : The process of removing unwanted disturbances from signals, in this case, electron diffraction patterns, to enhance the quality of the data.
Fusion Structural Materials : Materials specifically designed to withstand the extreme conditions in fusion reactors, including high radiation doses.
2D Materials : Materials that are one or two atoms thick, which exhibit unique physical and chemical properties.
Graphene : A single layer of carbon atoms arranged in a two-dimensional honeycomb lattice, known for its exceptional electrical, thermal, and mechanical properties.
Band Gap Directness : A property of semiconductors that describes whether the top of the valence band and the bottom of the conduction band occur at the same or different momentum k-wavevectors.
Symmetry in Mater

{'Materials Science and Characterization Techniques': 8} 

Statistical Mechanics and Thermodynamics : A branch of theoretical physics that uses statistical methods to explain the behavior of systems with a large number of particles, particularly at finite temperatures, and includes the study of thermodynamic potentials and phase behavior.
Nanoparticle Imaging : The study of the structural properties and shapes of nanoparticles using advanced imaging techniques, including high resolution transmission electron microscopy.
Vibrational Spectroscopy : A technique used to study the vibrational modes of molecules, providing insights into molecular structure and interactions, particularly in the context of hydrogen bonding in water.
4D Scanning Transmission Electron Microscopy (4D-STEM) : A technique for studying nanoscale materials that combines scanning transmission electron microscopy with electron diffraction patterns to provide local structural imaging.
Raman Spectroscopy : A spectroscopi

{'Nanoparticle and Cluster Catalysis': 8} 

Nanoparticles in Catalysis : The study of nanoparticles and their role in enhancing catalytic processes, often leading to increased reaction rates and selectivity.
Catalysis by Clusters : Research focused on the catalytic properties of atomic clusters, which can exhibit unique behaviors compared to bulk materials due to their size and composition.
Catalysis : The process of increasing the rate of a chemical reaction by adding a substance known as a catalyst, which is not consumed in the reaction.
--------------------
Cluster: 1
{'Quantum Computing and Optimization Techniques': 9} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently 

{'Advanced Machine Learning Techniques': 9} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful for predicting properties of

{'Image Processing Techniques for Machine Learning': 8} 

Synthetic Image Generation : The process of creating synthetic images that mimic real experimental conditions for training machine learning models.
Image Denoising : Techniques used to remove noise from images, improving their clarity and fidelity for better analysis.
Image Quality Assessment : The evaluation of the quality of images, often in terms of how closely they resemble the original or ideal images, using various metrics and methodologies.
Image Analysis : The process of examining and interpreting images to extract meaningful information, often using computational techniques.
--------------------
-*--*--*--*--*--*--*--*--*--*-
Cluster Algorithm: hdbscan_HuggingFace + gauss_HF
Cluster: -1
Outlaier Topics:
 Anomaly Detection in EEG : The identification of unusual patterns or outliers in EEG data, which can indicate potential issues or abnormalities in brain activity.
Optical Anisotropy : The property of a material to have 

{'Advanced Computational Techniques in Materials Science and Neuroscience': 8} 

Statistical Mechanics and Thermodynamics : A branch of theoretical physics that uses statistical methods to explain the behavior of systems with a large number of particles, particularly at finite temperatures, and includes the study of thermodynamic potentials and phase behavior.
Machine Learning in Chemistry : The application of machine learning techniques to predict material properties and enhance computational chemistry methods, including interatomic interaction models.
Neuroscience Applications : The application of advanced computational techniques to understand brain activity and improve diagnostics and neurorehabilitation methods.
Vibrational Spectroscopy : A technique used to study the vibrational modes of molecules, providing insights into molecular structure and interactions, particularly in the context of hydrogen bonding in water.
Computational Materials Science : The use of computational metho

{'Materials and Technologies for Energy Conversion and Storage': 8} 

Tunnel Magnetoresistive (TMR) Sensors : Devices that measure the change in electric resistance in magnetic materials, used for sensing magnetic fields.
CO2 Reduction : The process of converting carbon dioxide into useful products, such as hydrocarbons, often using catalysts to facilitate the reaction.
Electrochemical Sensors : Devices that use electrochemical reactions to detect and measure the concentration of specific ions or molecules in a solution.
Metal-Organic Frameworks (MOFs) : Porous materials composed of metal ions coordinated to organic ligands, used in sensing applications for their ability to selectively interact with different ions.
Semiconductors : Materials that have electrical conductivity between that of a conductor and an insulator, crucial for electronic devices and applications.
Solid Electrolyte Interphase (SEI) : A layer that forms on the surface of electrodes in batteries, crucial for understa

{'Machine Learning and Its Advanced Techniques': 9} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Active Learning Algorithm (ALA) : A machine learning approach that iteratively selects the most informative data points to 

{'Quantum Computing and Simulation Techniques': 9} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Molecular Dynamics : A computer simulation method for analyzing the physical movements of atoms and molecules, allowing for the study of the time-dependent behavior of a molecular system.
Quantum State Tomography : A process used to reconstruct the quantum state of a system based on measurement data, crucial for understanding quantum systems.
Density Functional Theory (DFT) : A quantum mechanical method used to investigate the electronic structure of many-body systems, particularly atoms, molecules, and the

In [121]:
# Topic names (the text before the first ':') of every kmeans_HuggingFace
# cluster, with the largest clusters first.
_km_HF = {
    title: [entry.split(":")[0] for entry in info["list"]]
    for title, info in sorted(
        clusters_topic['kmeans_HuggingFace'].items(),
        key=lambda pair: len(pair[1]['list']),
        reverse=True,
    )
}
print_dict(_km_HF)

Machine Learning and Deep Learning Techniques: 
  ['Deep Learning ', 'Generative Adversarial Networks (GANs) ', 'Machine Learning ', 'Explainable Artificial Intelligence (XAI) ', 'Graph Neural Networks ', 'Convolutional Neural Networks ', 'Active Learning Algorithm (ALA) ', 'Reinforcement Learning ', 'Graph Neural Networks (GNNs) ', 'Active Learning (AL) ', 'Symbolic Regression ', 'Physically-Informed Machine Learning ', 'Artificial Neural Networks ', 'Active Learning ', 'Bayesian Optimization ', 'Multi-modal learning ', 'Transferability in Machine Learning ']
Advanced Computational Techniques in Materials Science and Neuroscience: 
  ['Machine Learning in Chemistry ', 'Neuroscience Applications ', 'Automated Crystal Structure Solution ', 'Molecular Dynamics ', 'Micromagnetic Simulations ', 'Gaussian Approximation Potentials ', 'Digital Twins ', 'Finite-Element Simulations ', 'Machine Learning Potential (MLP) ', 'Inverse Materials Design ', 'First Principles Calculations ', 'Simulation

In [12]:
# Persist the named clusters as JSON. The file is only written, so write-only
# mode "w" is used instead of "w+" (which additionally opens it for reading).
with open("AI4AM_topics_clusters.json", "w") as f:
    json.dump(clusters_topic, f)

#### Estimate the quality of clustering

In [67]:
# Per algorithm: mean of the stored cluster scores and the number of entries.
for k, v in clusters_topic.items():
    print(
        k,
        'mean score:',
        np.mean([entry['score'] for entry in v.values()]),
        'N clusters:',
        len(v),
    )

gauss_OpenAI mean score: 7.916666666666667 N clusters: 12
kmeans_OpenAI mean score: 8.25 N clusters: 12
gauss_HuggingFace mean score: 8.083333333333334 N clusters: 12
kmeans_HuggingFace mean score: 8.0 N clusters: 10
hdbscan_OpenAI mean score: 9.835820895522389 N clusters: 67
hdbscan_HuggingFace mean score: 9.555555555555555 N clusters: 9
hdbscan_HuggingFace + gauss_HF mean score: 8.894736842105264 N clusters: 19
hdbscan_HuggingFace + kmeans_HF mean score: 8.894736842105264 N clusters: 19


In [66]:
def extended_clustering_quality_metric(consensus_scores, num_clusters, k_opt, alpha=1, beta=1):
    """
    Calculate the extended quality of clustering based on consensus scores, number of clusters,
    and deviation from the optimal number of clusters.

    The metric is

        mean(consensus_scores) / (num_clusters ** alpha * (1 + beta * |num_clusters - k_opt|))

    Parameters:
    - consensus_scores (float, list or np.ndarray): Consistency score(s) of the clusters
      (0-10 scale). A single, already averaged score is accepted as well.
    - num_clusters (int): Number of clusters; must be positive.
    - k_opt (int): Optimal number of clusters.
    - alpha (float): Exponent applied to the number of clusters in the denominator (default=1).
    - beta (float): Weight of the absolute deviation from the optimal number of clusters (default=1).

    Returns:
    - float: Extended quality metric score.

    Raises:
    - ValueError: if ``consensus_scores`` is empty or ``num_clusters`` is not positive.
      Previously these inputs produced NaN or inf without any error.
    """
    scores = np.asarray(consensus_scores, dtype=float)
    if scores.size == 0:
        raise ValueError("consensus_scores must contain at least one score")
    if num_clusters <= 0:
        raise ValueError("num_clusters must be a positive number")

    # Average consensus score
    avg_consensus = scores.mean()

    # Deviation penalty: 1 when num_clusters == k_opt, growing linearly with the distance
    deviation_penalty = 1 + beta * abs(num_clusters - k_opt)

    # Calculate quality metric
    quality = avg_consensus / (num_clusters ** alpha * deviation_penalty)

    return float(quality)

In [65]:
# Extended quality metric per algorithm; k_opt=12 and beta=2 are used for all of them.
for k, v in clusters_topic.items():
    print(
        k,
        ':',
        extended_clustering_quality_metric(
            np.mean([entry['score'] for entry in v.values()]),
            len(v),
            12,
            beta=2,
        ),
    )

gauss_OpenAI : 0.6597222222222222
kmeans_OpenAI : 0.6875
gauss_HuggingFace : 0.6736111111111112
kmeans_HuggingFace : 0.16
hdbscan_OpenAI : 0.0013225522247576158
hdbscan_HuggingFace : 0.15167548500881833
hdbscan_HuggingFace + gauss_HF : 0.031209602954755312
hdbscan_HuggingFace + kmeans_HF : 0.031209602954755312


### LLM evaluation of clustering, naming of clusters, and re-clustering to increase quality

In [22]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks the LLM to score one cluster of topics, to list the topics
# that do not fit (only when the score is below 8), and to name the cluster.
# The answer must be a JSON object with exactly ONE key (the calling code reads
# only the first key) whose value holds 'score' and 'excluded'.
# Excluded topics must be copied verbatim, because the calling code removes
# them from the cluster by exact string comparison.
# Literal braces in the examples are written as "{{" / "}}" so that the template
# engine does not mistake them for input variables; {list_topics} is the only
# variable.
topics_estimate = PromptTemplate.from_template("""
You are an expert in analyzing scientific papers.
You have received a list of titles of research and study topics:
###
{list_topics}
###
Your tasks:
Step 1. Rate on a scale from 0 to 10 how consistent this list of topics is,
i.e. how well the topics can be grouped into one common theme or cluster.
A score of 0 means the topics have absolutely nothing in common, 10 means they are nearly identical.

Step 2. If the score from step 1 is below 8,
exclude topics from the list that you think are not consistent with the rest of the list.
If there are no such topics, or the score is 8 or higher, do not exclude anything.

Step 3. Give this list a general name or topic title, a summary name of the cluster.

The output should be in JSON format as a dictionary with exactly one key-value pair:
The key is the summary name. The value is a dictionary where 'score' is a score from 0 to 10 indicating how consistent this list of topics is,
and 'excluded' is a list of the excluded topics, each copied verbatim from the list above.

Examples of output (one example per line):
{{"Advanced Computational Techniques in Materials and Manufacturing": {{"score": 8, "excluded": []}}}}
{{"Advanced Computational Techniques in Quantum and Neuroscience Applications": {{"score": 7, "excluded": ["Synthetic Image Generation : The process of creating synthetic images that mimic real experimental conditions for training machine learning models."]}}}}
{{"Magnetic and Optical Properties of Materials": {{"score": 6, "excluded": ["Anomaly Detection in EEG : The identification of unusual patterns or outliers in EEG data, which can indicate potential issues or abnormalities in brain activity."]}}}}
""")

In [31]:
# Rebuild the chain for the exclusion prompt, this time with the "gpt-4o"
# model: prompt -> chat model -> JSON parser.
title_sum = (
    topics_estimate
    | ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=OPENAI_API_KEY)
    | JsonOutputParser()
)

In [32]:
# For every clustering result, ask the LLM to name and score each cluster and
# to point out topics that do not fit; those topics become single-topic
# outlier entries.
# Result layout: {algorithm: {cluster title: {"score": ..., "list": [topics]}}}
clusters_topic = dict()


def _ask_cluster_summary(chain, members, attempts=2):
    """Invoke ``chain`` on ``members`` until it returns a usable answer.

    A usable answer is a non-empty dict whose first value is a dict with a
    "score" entry. The chain is invoked at most ``attempts`` times (two by
    default: the first call plus one retry).

    Raises:
    - ValueError: if no usable answer was obtained.
    """
    answer = None
    for _ in range(attempts):
        answer = chain.invoke({"list_topics": members})
        if isinstance(answer, dict) and answer:
            details = next(iter(answer.values()))
            if isinstance(details, dict) and "score" in details:
                return answer
    raise ValueError(f"Unusable LLM answer after {attempts} attempts: {answer!r}")


for algo_name, clustering in optimal_clusters.items():
    print("Cluster Algorithm:", algo_name)
    labels = clustering["labels"]
    named_clusters = dict()
    rejected = list()  # topics the LLM excluded from their cluster
    for label in np.unique(labels):
        print("Cluster:", label)
        members = get_topic_list(list_topics, labels, cluster=label)
        if label != -1:
            # Explicit validation and a bounded retry replace the former bare
            # "except:", which also swallowed KeyboardInterrupt.
            answer = _ask_cluster_summary(title_sum, members)
            title = list(answer.keys())[0]
            details = answer[title]
            proposed = details.get("excluded") or []
            # Only topics that really are in this cluster can be moved out of it.
            # Anything else (e.g. a reworded entry) would become an outlier while
            # the original topic stayed in the cluster.
            excluded = [topic for topic in proposed if topic in members]
            unmatched = [topic for topic in proposed if topic not in members]
            if unmatched:
                print("Ignored excluded entries that are not in the cluster:", unmatched)
            rejected += excluded
            remove_topic(members, excluded)
            # The LLM may give two clusters the same title; keep both instead of
            # letting the later one overwrite the earlier one.
            cluster_key = title
            if cluster_key in named_clusters:
                cluster_key = f"{title} (cluster {label})"
            named_clusters[cluster_key] = {"score": details["score"],
                                           "list": members}
            print(answer, "\n")
            print("\n".join(members))
        else:
            # Label -1 is handled as outliers: every topic becomes its own entry.
            print("Outlaier Topics:\n", "\n".join(members))
            named_clusters.update(get_outliers(members))
        print(20*'-')
    named_clusters.update(get_outliers(rejected))
    print(10*'-*-')
    clusters_topic[algo_name] = named_clusters

Cluster Algorithm: gauss_OpenAI
Cluster: 0
{'Quantum Computing and Quantum Chemistry Techniques': {'score': 8, 'excluded': []}} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Density Functional Theory (DFT) : A quantum mechanical method used to investigate the electronic structure of many-body systems, particularly atoms, molecules, and the condensed phases.
Self-Consistent Field (SCF) : A computational method used in quantum chemistry to find the approximate solution of the Schrödinger equation for many-electron systems.
Quantum Circuit Optimization : The process of improving the efficiency and perform

{'Advanced Machine Learning Techniques': {'score': 9, 'excluded': []}} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Active Learning Algorithm (ALA) : A machine learning approach that iteratively selects the most informative data points to improve model performance, particularly in the context of materials design.
Reinforcement Learning : A type of machine learning where an agent learns to make decisions by receiving rewards or penalties based on i

{'Neural Network Applications in Data Analysis and Modeling': {'score': 9, 'excluded': []}} 

Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful for predicting properties of materials based on their atomic structure.
Convolutional Neural Networks : A class of deep learning algorithms used for analyzing visual imagery, particularly effective in extracting features from data such as X-ray diffraction patterns.
Neural Networks : A computational approach that mimics the way human brains operate, used here to generate potential energy surfaces for modeling reactions.
Graph Neural Networks (GNNs) : A type of neural network designed to work directly with graph-structured data, useful for modeling complex relationsh

{'Advanced Computational Techniques in Materials Science and Machine Learning': {'score': 9, 'excluded': []}} 

Synthetic Image Generation : The process of creating synthetic images that mimic real experimental conditions for training machine learning models.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful for predicting properties of materials based on their atomic structure.
Convolutional Neural Networks : A class of deep learning algorithms used for analyzing visual imagery, particularly effective in extracting features from data such as X-ray diffraction patterns.
Genetic Algorithm (GA) : An optimization technique inspired by the process of natural selection, used to generate novel structures in the c

{'Catalysis and Energy Conversion': {'score': 8, 'excluded': []}} 

CO2 Reduction : The process of converting carbon dioxide into useful products, such as hydrocarbons, often using catalysts to facilitate the reaction.
Nanoparticles in Catalysis : The study of nanoparticles and their role in enhancing catalytic processes, often leading to increased reaction rates and selectivity.
Catalysis by Clusters : Research focused on the catalytic properties of atomic clusters, which can exhibit unique behaviors compared to bulk materials due to their size and composition.
Photoinduced Reactions : Chemical reactions that are initiated or accelerated by the absorption of light, particularly relevant in the study of surface dynamics.
Catalysis : The process of increasing the rate of a chemical reaction by adding a substance known as a catalyst, which is not consumed in the reaction.
Fuel Cells : Devices that convert chemical energy from fuels into electricity through electrochemical reactions, ofte

{'Computational and Theoretical Methods in Materials Science and Physics': {'score': 9, 'excluded': []}} 

Statistical Mechanics and Thermodynamics : A branch of theoretical physics that uses statistical methods to explain the behavior of systems with a large number of particles, particularly at finite temperatures, and includes the study of thermodynamic potentials and phase behavior.
Surface Dynamics : Curves that describe how molecules adhere to surfaces at constant temperature, relevant for materials like metal-organic frameworks (MOFs).
Automated Crystal Structure Solution : A hybrid approach combining computational methods and experimental techniques to identify crystal structures efficiently.
Molecular Dynamics : A computer simulation method for analyzing the physical movements of atoms and molecules, allowing for the study of the time-dependent behavior of a molecular system.
Vibrational Spectroscopy : A technique used to study the vibrational modes of molecules, providing insi

{'Advanced Materials and Quantum Technologies': {'score': 8, 'excluded': []}} 

Quantum Materials : Materials that exhibit quantum mechanical properties, which are of interest for their potential applications in advanced technologies such as quantum computing and superconductivity.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Crystallography : The study of crystals and their structures, including the analysis of crystal formation and properties.
Materials Informatics : An interdisciplinary field that combines materials science and data science to discover, design, and screen new materials using computational methods.
Organic Electronics : A field of study focused on the use of organic materials in electronic devices, including organic semiconductors and organic photovoltaics.
Oxide Electron

{'X-ray and Electron Diffraction Techniques in Material Analysis': {'score': 9, 'excluded': []}} 

Crystal Structure Determination : The process of identifying the arrangement of atoms within a crystal, often using techniques such as X-ray diffraction.
Noise Reduction : The process of removing unwanted disturbances from signals, in this case, electron diffraction patterns, to enhance the quality of the data.
X-ray Diffraction : A technique used to study the structure of materials by observing the scattering of X-rays, providing information about the arrangement of atoms in a crystal.
Strain Profile Analysis : The study of how materials deform under stress, which can be inferred from X-ray diffraction data to understand material properties.
--------------------
-*--*--*--*--*--*--*--*--*--*-
Cluster Algorithm: kmeans_HuggingFace
Cluster: 0
{'Data-Driven Approaches in Materials Science': {'score': 9, 'excluded': []}} 

Semantic Interoperability : The ability of different systems to excha

{'Advanced Computational Techniques in Materials Science and Engineering': {'score': 8, 'excluded': []}} 

Machine Learning in Chemistry : The application of machine learning techniques to predict material properties and enhance computational chemistry methods, including interatomic interaction models.
Neuroscience Applications : The application of advanced computational techniques to understand brain activity and improve diagnostics and neurorehabilitation methods.
Automated Crystal Structure Solution : A hybrid approach combining computational methods and experimental techniques to identify crystal structures efficiently.
Molecular Dynamics : A computer simulation method for analyzing the physical movements of atoms and molecules, allowing for the study of the time-dependent behavior of a molecular system.
Micromagnetic Simulations : Computational techniques used to model the magnetic behavior of materials at the nanoscale.
Gaussian Approximation Potentials : A type of machine learni

{'Advanced Materials and Electrochemical Applications': {'score': 8, 'excluded': []}} 

Surface Dynamics : Curves that describe how molecules adhere to surfaces at constant temperature, relevant for materials like metal-organic frameworks (MOFs).
CO2 Reduction : The process of converting carbon dioxide into useful products, such as hydrocarbons, often using catalysts to facilitate the reaction.
Electrochemical Sensors : Devices that use electrochemical reactions to detect and measure the concentration of specific ions or molecules in a solution.
Metal-Organic Frameworks (MOFs) : Porous materials composed of metal ions coordinated to organic ligands, used in sensing applications for their ability to selectively interact with different ions.
Solid Electrolyte Interphase (SEI) : A layer that forms on the surface of electrodes in batteries, crucial for understanding battery performance and stability.
Sodium-ion batteries (NIBs) : A type of rechargeable battery that uses sodium ions as the 

{'Catalysis and Nanostructures': {'score': 9, 'excluded': []}} 

Nanoparticles in Catalysis : The study of nanoparticles and their role in enhancing catalytic processes, often leading to increased reaction rates and selectivity.
Catalysis by Clusters : Research focused on the catalytic properties of atomic clusters, which can exhibit unique behaviors compared to bulk materials due to their size and composition.
Catalysis : The process of increasing the rate of a chemical reaction by adding a substance known as a catalyst, which is not consumed in the reaction.
--------------------
Cluster: 1
{'Quantum Computing and Optimization Techniques': {'score': 9, 'excluded': []}} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mec

{'Advanced Machine Learning Techniques': {'score': 9, 'excluded': []}} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Graph Neural Networks : A type of neural network designed to process data structured as graphs, useful f

{'Image Processing and Analysis Techniques': {'score': 9, 'excluded': []}} 

Synthetic Image Generation : The process of creating synthetic images that mimic real experimental conditions for training machine learning models.
Image Denoising : Techniques used to remove noise from images, improving their clarity and fidelity for better analysis.
Image Quality Assessment : The evaluation of the quality of images, often in terms of how closely they resemble the original or ideal images, using various metrics and methodologies.
Image Analysis : The process of examining and interpreting images to extract meaningful information, often using computational techniques.
--------------------
-*--*--*--*--*--*--*--*--*--*-
Cluster Algorithm: hdbscan_HuggingFace + gauss_HF
Cluster: -1
Outlaier Topics:
 Anomaly Detection in EEG : The identification of unusual patterns or outliers in EEG data, which can indicate potential issues or abnormalities in brain activity.
Optical Anisotropy : The property of 

{'Advanced Computational Techniques in Materials Science and Chemistry': {'score': 6, 'excluded': ['Neuroscience Applications : The application of advanced computational techniques to understand brain activity and improve diagnostics and neurorehabilitation methods.', 'Vibrational Spectroscopy : A technique used to study the vibrational modes of molecules, providing insights into molecular structure and interactions, particularly in the context of hydrogen bonding in water.', 'Semantic Interoperability : The ability of different systems to exchange and make use of information in a meaningful way, particularly in the context of materials data.']}} 

Statistical Mechanics and Thermodynamics : A branch of theoretical physics that uses statistical methods to explain the behavior of systems with a large number of particles, particularly at finite temperatures, and includes the study of thermodynamic potentials and phase behavior.
Machine Learning in Chemistry : The application of machine le

{'Advanced Materials and Energy Storage Technologies': {'score': 7, 'excluded': ['CO2 Reduction : The process of converting carbon dioxide into useful products, such as hydrocarbons, often using catalysts to facilitate the reaction.', 'Fuel Cells : Devices that convert chemical energy from fuels into electricity through electrochemical reactions, often using hydrogen as a fuel source.']}} 

Tunnel Magnetoresistive (TMR) Sensors : Devices that measure the change in electric resistance in magnetic materials, used for sensing magnetic fields.
Electrochemical Sensors : Devices that use electrochemical reactions to detect and measure the concentration of specific ions or molecules in a solution.
Metal-Organic Frameworks (MOFs) : Porous materials composed of metal ions coordinated to organic ligands, used in sensing applications for their ability to selectively interact with different ions.
Semiconductors : Materials that have electrical conductivity between that of a conductor and an insula

{'Advanced Machine Learning Techniques': {'score': 9, 'excluded': []}} 

Deep Learning : A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.
Generative Adversarial Networks (GANs) : A class of machine learning frameworks where two neural networks, a generator and a discriminator, compete against each other to generate new data instances that resemble the training data.
Machine Learning : A subset of artificial intelligence that involves the use of algorithms and statistical models to enable computers to perform tasks without explicit instructions.
Explainable Artificial Intelligence (XAI) : A subfield of artificial intelligence focused on making the results of machine learning models understandable to humans, often through techniques that explain how models make predictions.
Active Learning Algorithm (ALA) : A machine learning approach that iteratively selects the most informat

{'Advanced Quantum Computing and Simulation Techniques': {'score': 8, 'excluded': []}} 

Quantum Computing Techniques : A technique in quantum mechanics that allows for the adiabatic evolution of quantum states, enhancing the performance of quantum algorithms by reducing errors.
Quantum Computing : A field of study focused on the development and application of quantum computers, which leverage quantum mechanics to perform computations more efficiently than classical computers, including variational quantum algorithms.
Molecular Dynamics : A computer simulation method for analyzing the physical movements of atoms and molecules, allowing for the study of the time-dependent behavior of a molecular system.
Quantum State Tomography : A process used to reconstruct the quantum state of a system based on measurement data, crucial for understanding quantum systems.
Density Functional Theory (DFT) : A quantum mechanical method used to investigate the electronic structure of many-body systems, pa

In [124]:
# Topic names per kmeans_HuggingFace cluster, ordered from the largest
# cluster to the smallest.
# Each entry is cut at the first ":" to drop the description part; the
# remainder is stripped because the cut otherwise leaves a trailing space
# on every name (e.g. 'Deep Learning '), which breaks name-by-name
# comparison with other cluster listings.
_km_HF = {
    title: [entry.split(":")[0].strip() for entry in info["list"]]
    for title, info in sorted(
        clusters_topic['kmeans_HuggingFace'].items(),
        key=lambda item: len(item[1]['list']),
        reverse=True,
    )
}
print('Number of clusters:', len(_km_HF))
print_dict(_km_HF)

Number of clusters: 18
Advanced Machine Learning Techniques: 
  ['Deep Learning ', 'Generative Adversarial Networks (GANs) ', 'Machine Learning ', 'Explainable Artificial Intelligence (XAI) ', 'Graph Neural Networks ', 'Convolutional Neural Networks ', 'Active Learning Algorithm (ALA) ', 'Reinforcement Learning ', 'Graph Neural Networks (GNNs) ', 'Active Learning (AL) ', 'Symbolic Regression ', 'Physically-Informed Machine Learning ', 'Artificial Neural Networks ', 'Active Learning ', 'Bayesian Optimization ', 'Multi-modal learning ', 'Transferability in Machine Learning ']
Advanced Materials and Catalysis: 
  ['Hydrogen Bond Network ', 'Noise Reduction ', 'Fusion Structural Materials ', '2D Materials ', 'Graphene ', 'Band Gap Directness ', 'Symmetry in Materials ', 'Strain Profile Analysis ', 'High-Entropy Alloys ', 'Nanoparticles in Catalysis ', 'Catalysis by Clusters ', 'Diffusion Barriers ', 'Catalysis ', '3D Printing ', 'Point Defects ', 'Additive Manufacturing ']
Advanced Computa

In [40]:
# Persist clusters_topic as JSON.
# Plain "w" is sufficient: the handle is only written to, so the read access
# requested by "w+" was never used. The encoding is pinned so the result does
# not depend on the platform's default locale.
with open("AI4AM_topics_clusters2.json", "w", encoding="utf-8") as f:
    json.dump(clusters_topic, f)

#### Estimate the quality of clustering

In [60]:
# For every clustering algorithm, print the mean of its per-cluster 'score'
# values together with the number of clusters it holds.
for algo_name, algo_clusters in clusters_topic.items():
    cluster_scores = [info['score'] for info in algo_clusters.values()]
    print(algo_name, 'mean score:', np.mean(cluster_scores), 'N clusters:', len(algo_clusters))

gauss_OpenAI mean score: 8.7 N clusters: 20
kmeans_OpenAI mean score: 8.384615384615385 N clusters: 13
gauss_HuggingFace mean score: 8.947368421052632 N clusters: 19
kmeans_HuggingFace mean score: 8.61111111111111 N clusters: 18
hdbscan_OpenAI mean score: 9.814285714285715 N clusters: 70
hdbscan_HuggingFace mean score: 9.545454545454545 N clusters: 11
hdbscan_HuggingFace + gauss_HF mean score: 9.153846153846153 N clusters: 26
hdbscan_HuggingFace + kmeans_HF mean score: 9.16 N clusters: 25


In [72]:
# Pass each algorithm's mean cluster score and cluster count to
# extended_clustering_quality_metric (third positional argument 12, beta=1)
# and print the value it returns.
for algo_name, algo_clusters in clusters_topic.items():
    mean_score = np.mean([info['score'] for info in algo_clusters.values()])
    quality = extended_clustering_quality_metric(mean_score, len(algo_clusters), 12, beta=1)
    print(algo_name, ':', quality)

gauss_OpenAI : 0.04833333333333333
kmeans_OpenAI : 0.32248520710059175
gauss_HuggingFace : 0.05886426592797784
kmeans_HuggingFace : 0.068342151675485
hdbscan_OpenAI : 0.0023763403666551366
hdbscan_HuggingFace : 0.43388429752066116
hdbscan_HuggingFace + gauss_HF : 0.023471400394477315
hdbscan_HuggingFace + kmeans_HF : 0.02617142857142857


### Compare with direct LLM clustering methods that do not use embeddings.

In [97]:
import networkx as nx

# Read the topic graph from its GEXF file.
file_path = "graph_AI4AM_topics_v1.gexf"
graph = nx.read_gexf(file_path)

# Flatten the graph into plain containers:
#   "nodes" maps each node id to its attribute dict,
#   "edges" is a list with one dict per edge holding its endpoints plus the
#   edge's own attributes.
graph_dict = {
    "nodes": {
        node_id: node_attrs
        for node_id, node_attrs in graph.nodes(data=True)
    },
    "edges": [
        {"source": src, "target": dst, **edge_attrs}
        for src, dst, edge_attrs in graph.edges(data=True)
    ],
}


In [99]:
# Build clusters straight from the graph: every topic node that has outgoing
# edges to other topic nodes becomes a cluster whose members are those targets.
#
# NOTE(review): this cell used to read `topics_dict`, which no visible cell
# defines, so it raised NameError on a fresh kernel. `graph_dict` (built from
# the GEXF file) has the "nodes"/"edges" layout used here -- confirm it is the
# intended input.

# Node ids that are topics, i.e. whose description is not the "pdf doc" marker.
topics_nodes = [
    node_id
    for node_id, node_attrs in graph_dict["nodes"].items()
    if node_attrs["description"] != "pdf doc"
]
# Set copy for constant-time membership tests while scanning the edges.
_topic_ids = set(topics_nodes)

# Single pass over the edges. A source topic is registered on first sight
# (keeping first-appearance order) even when none of its targets is a topic;
# targets are appended in edge order.
topics_clusters = {}
for edge in graph_dict["edges"]:
    if edge["source"] not in _topic_ids:
        continue
    members = topics_clusters.setdefault(edge["source"], [])
    if edge["target"] in _topic_ids:
        members.append(edge["target"])

# Drop sources left without any topic target, then order clusters from the
# largest to the smallest (the sort is stable, so ties keep their order).
llm_clusters = dict(
    sorted(
        ((source, members) for source, members in topics_clusters.items() if members),
        key=lambda item: len(item[1]),
        reverse=True,
    )
)

In [122]:
# Print how many clusters llm_clusters holds, then hand the mapping to
# print_dict for display.
n_llm_clusters = len(llm_clusters)
print('Number of clusters:', n_llm_clusters)
print_dict(llm_clusters)

Number of clusters: 24
Machine Learning and Data Science: 
  ['Machine Learning in Chemistry', 'Machine Learning', 'Generative Adversarial Networks (GANs)', 'Deep Learning', 'Active Learning (AL)', 'Active Learning Algorithm (ALA)', 'Transferability in Machine Learning', 'Data Analysis', 'Data Acquisition and Processing', 'Data Curation Strategies', 'Synthetic Image Generation', 'Explainable Artificial Intelligence (XAI)', 'Adaptive Algorithms', 'Graph Neural Networks', 'Convolutional Neural Networks', 'Neural Networks', 'Graph Neural Networks (GNNs)', 'Machine Learning Interatomic Potentials (MLIPs)', 'Active Learning', 'Bayesian Optimization', 'Physically-Informed Machine Learning', 'Multi-modal learning', 'Large Language Models', 'Artificial Neural Networks', 'Genetic Algorithm (GA)', 'Reinforcement Learning', 'Generative Models']
Computational Methods and Simulations: 
  ['Molecular Dynamics', 'Density Functional Theory (DFT)', 'First Principles Calculations', 'Kinetic Monte Carlo 

- I consider LLM clustering to be better than k-means or any other clustering method applied to embeddings of topics.