In [1]:
import json

In [2]:
def print_dict(d, indent=0):
    """
    Pretty-print a (possibly nested) dictionary to stdout.

    Each entry is written as ``key: value`` on its own line. When the value
    is itself a dictionary, the line ends after ``key: `` and the nested
    entries follow, indented by two extra spaces per nesting level.

    Parameters
    ----------
    d : dict
        Dictionary to print.
    indent : int, optional
        Current nesting depth; every level adds two leading spaces.
    """
    prefix = '  ' * indent
    for name, item in d.items():
        label = prefix + str(name) + ": "
        if not isinstance(item, dict):
            # Leaf value: label and value share one line.
            print(label + str(item))
            continue
        # Nested dictionary: close the label line, then descend one level.
        print(label)
        print_dict(item, indent + 1)

In [3]:
def get_decription(topic, topic_dict, llm):
    """
    Return a description for ``topic``, querying ``llm`` only on a cache miss.

    Parameters
    ----------
    topic : str
        Name of the topic to describe.
    topic_dict : dict
        Already known descriptions keyed by topic name. It is only read
        here; storing a freshly generated description is left to the caller.
    llm : object
        Anything exposing ``invoke(payload)``. It is called with
        ``{"topic": topic}`` and is expected to return a mapping from topic
        name to description (a list of such mappings is also accepted).

    Returns
    -------
    The cached description when ``topic_dict`` has a non-``None`` entry for
    ``topic``; otherwise the description extracted from the model reply.

    Raises
    ------
    KeyError
        If the reply holds no entry that can be attributed to ``topic``.
    """
    des = topic_dict.get(topic, None)
    if des is not None:
        return des

    reply = llm.invoke({ "topic": topic })

    # The reply may arrive as a list of one-entry objects instead of a
    # single object; flatten it so the lookups below work either way.
    if isinstance(reply, list):
        merged = {}
        for item in reply:
            if isinstance(item, dict):
                merged.update(item)
        reply = merged

    try:
        return reply[topic]
    except KeyError:
        pass

    if isinstance(reply, dict):
        # Models do not always echo the topic name verbatim: tolerate
        # differences in case and surrounding whitespace.
        wanted = str(topic).strip().casefold()
        for key, value in reply.items():
            if str(key).strip().casefold() == wanted:
                return value
        # Exactly one entry was requested, so a lone entry under a re-worded
        # key is still the description that was asked for.
        if len(reply) == 1:
            return next(iter(reply.values()))

    raise KeyError(topic)

In [24]:
# Load the conference records. The cells below slice and iterate `meta_data`
# as a list of dicts and read the keys 'source', 'title' and 'topics'.
# The encoding is pinned because the records contain non-ASCII text and the
# default used by open() depends on the platform locale.
with open('./data/data_rag/conference/conference.json', 'r', encoding='utf-8') as f:
    meta_data = json.load(f)

In [5]:
# Load the topic table. The cells below read it as a dict keyed by topic name
# whose values carry a 'description' and a 'metadata' list of
# {'source': ..., 'topic': ...} entries.
# The encoding is pinned so the file decodes identically on every platform.
with open('AI4AM_topics2.json', 'r', encoding='utf-8') as f:
    topics = json.load(f)

In [25]:
# Attach to every titled document the topics whose metadata cites that
# document's source file, together with the topic's description.
topic_dict = {k: v['description'] for k, v in topics.items()}

# Build the source -> {topic: description} index once, so each document needs
# a single dictionary lookup instead of a rescan of every topic's metadata.
# Insertion order follows the order of `topics`, which keeps the key order of
# the resulting doc['topics'] the same as a per-document scan would give.
# NOTE(review): assumes 'source' values are hashable (they are path strings
# in the data shown in this notebook).
topics_by_source = {}
for t, v in topics.items():
    for d in v['metadata']:
        topics_by_source.setdefault(d['source'], {})[t] = topic_dict.get(t)

for doc in meta_data:
    if doc.get('title', None):
        matched = topics_by_source.get(doc['source'])
        if matched:
            # Merge rather than overwrite: topics already on the document stay.
            doc['topics'] = {**doc.get('topics', {}), **matched}
                        

In [26]:
# Inspect the last record of `meta_data`. Indexing the list directly avoids
# depending on a loop variable left behind by whichever cell ran last.
meta_data[-1]

{'source': './data/data_rag/conference/89_AI4AM2024_Botti.pdf',
 'page': 0,
 'title': 'The future is here: accelerating computational materials science with machine learning',
 'authors': {'Silvana Botti': ['Research Center Future Energy Materials and Systems, Faculty of Physics and Astronomy and ICAMS, Ruhr University Bochum, Germany']},
 'topics': {'Machine Learning in Materials Science': 'A subset of machine learning that uses neural networks with many layers to analyze and interpret complex data, particularly effective for image processing tasks.',
  'Computational Materials Science': 'The study of materials using computational methods to predict their properties and behaviors.',
  'Density Functional Theory (DFT)': 'A quantum mechanical method used to investigate the electronic structure of many-body systems, applied here to confirm the thermodynamic stability of proposed crystal structures.',
  'Band Gap Engineering': 'The study of how to manipulate the electronic band structure 

In [27]:
# Write `meta_data` to disk as JSON indented by four spaces.
with open('./data/data_rag/conference/conference_authors_topics2.json', 'w') as out_file:
    json.dump(meta_data, fp=out_file, indent=4)

In [6]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

# Expose the API key through the environment.
os.environ.update({'OPENAI_API_KEY': OPENAI_API_KEY})
from langchain_openai import ChatOpenAI

# Chat model shared by the chains below. An earlier setup used
# model="gpt-3.5-turbo" with temperature=0.
llm_settings = {"model": "gpt-4o-mini", "temperature": 0.0, "top_p": 0.2}
llm = ChatOpenAI(**llm_settings)

                top_p was transferred to model_kwargs.
                Please confirm that top_p is what you intended.


In [7]:
from prompt_utills import *

In [8]:
# Prompt asking the model for a short description of one topic.
# The reply is parsed as JSON and then indexed by the topic name, so the
# prompt requests a single JSON object and shows a valid JSON example.
# The doubled braces are literal braces.
# NOTE(review): assumes PromptTemplate is LangChain's class with the default
# f-string template format (it arrives via a star import) - confirm.
topic_description = PromptTemplate.from_template(
            """
            You are an expert in analyzing scientific papers.
            You have received an topic name that is learned in a some scientific paper:
            ###
                {topic}
            ------------- \n
            Your task is to get a short description of the topic.
            
            
            The output format is a single JSON object in valid JSON format.
            Where the only key is the topic name exactly as given above and value is a description of the topic.
            
            Example: 
            {{"Phase Problem in Crystallography": "A challenge in crystallography that involves determining the phase information of scattered waves, which is essential for reconstructing the electron density of a crystal."}}
            
            """
        )

In [9]:
# Pipeline: the prompt template feeds the chat model, whose reply is handed to
# JsonOutputParser. It is invoked with {"topic": <name>} by get_decription.
topic_llm = topic_description | llm | JsonOutputParser()

In [17]:
# Attach to every titled document the names listed under 'topic' in the
# metadata entries that cite its source file. Descriptions come from
# `topic_dict`; names missing there are generated once through `topic_llm`
# and stored in `topic_dict` for reuse.
topic_dict = {k: v['description'] for k, v in topics.items()}

# Build the source -> [topic names] index once instead of rescanning every
# topic's metadata for every document. Names are collected in scan order
# (topics first, then their metadata entries), so both the order of LLM calls
# and the key order of doc['topics'] match a per-document scan.
# NOTE(review): assumes 'source' values are hashable (they are path strings
# in the data shown in this notebook).
subtopics_by_source = {}
for t, v in topics.items():
    for d in v['metadata']:
        subtopics_by_source.setdefault(d['source'], []).extend(d['topic'])

for doc in meta_data:
    if doc.get('title', None):
        names = subtopics_by_source.get(doc['source'])
        if names is None:
            # No metadata entry cites this document; leave it untouched.
            continue
        for k in names:
            if k not in topic_dict:
                # One model call per unseen name; later documents reuse it.
                topic_dict[k] = get_decription(k, topic_dict, topic_llm)
        # Merge rather than overwrite: topics already on the document stay.
        doc['topics'] = {**doc.get('topics', {}), **{k: topic_dict.get(k) for k in names}}
                        

In [11]:
# Preview the first ten records, each followed by a separator line.
separator = "--- " * 20
for d in meta_data[:10]:
    print_dict(d)
    print(separator)

source: ./data/data_rag/conference/45_AI4AM2024_Hakim_AMARA_8.pdf
page: 0
title: Unlocking 3D Nanoparticle Shapes from 2D HRTEM images: A Deep Learning Breakthrough
authors: 
  R. Moreau: ['LEM (ONERA -CNRS), Châtillon, France']
  H. Amara: ['LEM (ONERA -CNRS), Châtillon, France', 'MPQ, Paris, France']
  M. Moreaud: ['IFP Energies Nouvelles, Solaize, France']
  J. Nelayah: ['MPQ, Paris, France']
  A. Moncomble: ['MPQ, Paris, France']
  D. Alloyeau: ['MPQ, Paris, France']
  C. Ricolleau: ['MPQ, Paris, France']
  R. Gatti: ['LEM (ONERA -CNRS), Châtillon, France']
topics: 
  Deep Learning: A subset of machine learning that uses neural networks with many layers to model complex patterns in data, applied here to predict properties of 2D materials.
  High Resolution Transmission Electron Microscopy (HRTEM): A sophisticated imaging technique that utilizes a transmission electron microscope to achieve atomic-scale resolution, allowing for the detailed observation of the internal structure of m

In [18]:
# Spot-check the second-to-last record of `meta_data`.
meta_data[-2]

{'source': './data/data_rag/conference/9_AI4AM2024_Dale_Stephen_75.pdf',
 'page': 0,
 'title': 'Transferable diversity – a data-driven representation of chemical space.',
 'authors': {'Stephen G. Dale': ['Institute for Functional Intelligent Materials, National University of Singapore, Block S9, Level 9, 4 Science Drive 2, Singapore 117544'],
  'Tim Gould': ['Queensland Micro- and Nanotechnology Centre, Griffith University, Nathan, Qld 4111, Australia'],
  'Bun Chan': ['Graduate School of Engineering, Nagasaki University, Bunkyo 1-14, Nagasaki 852-8521, Japan'],
  'Stefan Vuckovic': ['Department of Chemistry, University of Fribourg, Chem. du Musée 9, 1700 Fribourg, Switzerland']},
 'topics': {'Transferable Diversity': 'A concept introduced in the text that refers to the ability of training data to maintain its applicability across different chemical contexts, enhancing model generalization.',
  'Machine Learning in Materials Science': 'A subset of machine learning that uses neural netw

In [19]:
# Write `meta_data` to disk as JSON indented by four spaces.
with open('./data/data_rag/conference/conference_authors_topics.json', 'w') as out_file:
    json.dump(meta_data, fp=out_file, indent=4)