### Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines


In [6]:
#!pip install openai==0.28.1
#!pip install openai --upgrade
#!pip install ragas
#!pip install unstructured
#!pip install langchain[all]
#!pip install --upgrade langchain

#!pip install playwright
#!pip install -U selenium unstructured
#!pip install --upgrade langchain langchain-community langchainhub langchain-openai langchain-chroma bs4

In [1]:
#!pip install pydantic==2.5
#!pip install rapidocr-onnxruntime

In [1]:
#!pip install nltk

In [2]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY 
from llm_utils import load_pdf_documets
#os.environ["LANGCHAIN_TRACING_V2"] = "true"

#openai.api_key = os.environ['OPENAI_API_KEY']

[nltk_data] Downloading package punkt_tab to /home/oleg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
#from langchain_community.document_loaders import SeleniumURLLoader,  DirectoryLoader, PyPDFLoader
#from langchain.text_splitter import CharacterTextSplitter,  RecursiveCharacterTextSplitter

In [3]:
from langchain_openai import ChatOpenAI
#llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, top_p=0.2)  

                top_p was transferred to model_kwargs.
                Please confirm that top_p is what you intended.


In [4]:
from prompt_utills import *
from langchain_core.output_parsers import JsonOutputParser

In [5]:
with open("AI4AM_topics3.json", "r+") as f:
    dict_topics = json.load(f)

### Clustering topics 

In [6]:
# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser()

clustering_topics_prompt.partial_variables={"format_instructions": parser.get_format_instructions()}

chain = clustering_topics_prompt | llm | parser 
# Cluster's dictionary
cl_dict = chain.invoke({"topics": {k: v["description"] for k,v in dict_topics.items()},
                        "N": len(dict_topics.keys())//8 +1,
                       "L": len(dict_topics)}
                      )

cl_dict.keys()

dict_keys(['Statistical Mechanics and Thermodynamics', 'Machine Learning and Data Science', 'Computational Methods and Simulations', 'Quantum Computing and Quantum Materials', 'Materials Characterization Techniques', 'Nanotechnology and Nanomaterials', 'Electrochemical and Energy Materials', 'Magnetic Materials and Properties', 'Chemical Reactions and Catalysis', 'Surface Science and Dynamics', 'Neuroscience and Brain Activity', 'Materials Informatics and Design', 'Advanced Manufacturing Techniques', 'Optoelectronics and Photonics'])

In [8]:
cl_topics = []
for cl in cl_dict.keys():
    cl_topics += [list(n.keys())[0] for n in cl_dict[cl]['nodes']] # All topics in all clusters

    
diff_topics = set(dict_topics).difference(set(cl_topics)) # Topics that are not any cluster
#len(diff_topics)
max_it = 4

while len(diff_topics) and (max_it >0):
    print(f"Number of missing topics: {len(diff_topics)}, All topics: {len(cl_topics)}, Clusters: {len(cl_dict)}")
    # Clustering missing topics
    
    clustering_topics_add.partial_variables={"format_instructions": parser.get_format_instructions()}

    chain = clustering_topics_add | llm | parser 
    # New cluster's dictionary
    new_dict = chain.invoke({"topics": {k: v['description'] for k,v in dict_topics.items() if k in diff_topics},
                        "cluster": {k: v['description'] for k,v in cl_dict.items()},
                       }
                      )
    # Adding to the original cluster's dictionary
    for cl,v in new_dict.items():
        if cl not in cl_dict:
            cl_dict[cl] = new_dict[cl]
        else:
            cl_dict[cl]['nodes'] += new_dict[cl]['nodes']
            
    cl_topics = []
    for cl in cl_dict.keys():
        cl_topics += [list(n.keys())[0] for n in cl_dict[cl]['nodes']] # All topics in all clusters
        
    diff_topics = set(dict_topics).difference(set(cl_topics)) # Topics that are not any cluster
    print(f"Number of missing topics: {len(diff_topics)}, Clusters: {len(cl_dict)}")
    max_it -= 1

Number of missing topics: 48, All topics: 74, Clusters: 14
Number of missing topics: 22, Clusters: 16
Number of missing topics: 22, All topics: 100, Clusters: 16
Number of missing topics: 5, Clusters: 22
Number of missing topics: 5, All topics: 117, Clusters: 22
Number of missing topics: 0, Clusters: 24


In [9]:
cl_dict.keys()

dict_keys(['Statistical Mechanics and Thermodynamics', 'Machine Learning and Data Science', 'Computational Methods and Simulations', 'Quantum Computing and Quantum Materials', 'Materials Characterization Techniques', 'Nanotechnology and Nanomaterials', 'Electrochemical and Energy Materials', 'Magnetic Materials and Properties', 'Chemical Reactions and Catalysis', 'Surface Science and Dynamics', 'Neuroscience and Brain Activity', 'Materials Informatics and Design', 'Advanced Manufacturing Techniques', 'Optoelectronics and Photonics', 'High-Entropy Alloys', 'Point Defects', 'Semiconductors and Band Gap Properties', 'Diffusion and Ionic Conductivity', 'Oxide Materials and Electronics', 'Topological Properties', 'Materials Science', 'Metallurgy', 'Solubility and Hydrogen Bonding', 'Data Interoperability in Materials Science'])

In [31]:
import networkx as nx

# Create a directed graph
G = nx.DiGraph()

# Add the clusters nodes and edges
for node in cl_dict.keys():
    G.add_node(node, description =cl_dict[node]['description'])
    if len(cl_dict[node]['nodes']):
        for d in cl_dict[node]['nodes']:
            n = list(d.keys())[0]
            G.add_node(n, description =d[n])
            G.add_edge(node, n)
    


In [32]:
# Addition documents nodes
for t in dict_topics.keys():
    for d in dict_topics[t]['metadata']:
        n = d['source'].split('/')[-1].split('.')[0] # Name of the document
        G.add_node(n,  description ="pdf doc")
        G.add_edge(n, t)
        G.add_edge(t, n)
    


In [33]:
# Save the graph as GEXF including edge attributes
f_name = "graph_AI4AM_topics_v1.gexf"
nx.write_gexf(G, f_name)

print(f"Graph saved as {f_name}")

Graph saved as graph_AI4AM_topics_v1.gexf


In [68]:
def search_document(cluster, name="pdf doc"):
    
    # Find 1-step neighbors
    one_step_successors = set(G.neighbors(cluster))

    # Find 2-step neighbors
    two_step_successors = set()
    for neighbor in one_step_successors:
        two_step_successors.update(set(G.neighbors(neighbor)))
   
    # Collect the 2-step successors along with the filtered descriptions
    two_step_successors = [d for d in two_step_successors if  G.nodes[d]['description'] == name] # Only pdf docs

    return two_step_successors

In [69]:
search_document('Materials Science')

['90_AI4AM2024_Schleder_Gabriel_42',
 '77_AI4AM2024_Soljacic',
 '82_AI4AM2024_ghosh_aishwaryo_4',
 '74_AI4AM2024_Buitrago_Diaz_Juan_Camilo_79',
 '8_AI4AM2024_Pena_Corredor_Antonio_19']

In [70]:
search_document('Machine Learning and Data Science')

['72_AI4AM2024_Engelgardt_Dana_84',
 '80_AI4AM2024_Csanyi',
 '51_AI4AM2024_Malica_Cristiano_30',
 '20_AI4AM2024_Žugec_Ivan_55',
 '70_AI4AM2024_Persson_Gabriel_28',
 '11_AI4AM2024_Kruglov_Ivan_27',
 '2_AI4AM2024_palermo_vincenzo_63',
 '86_AI4AM2024_Garrido_Aldea_Jaime_85',
 '82_AI4AM2024_ghosh_aishwaryo_4',
 '56_AI4AM2024_Toher_Cormac_17',
 '87_AI4AM2024_Abio_Albert_88',
 '83_AI4AM2024_Delgado_Galindo_Pedro_Julian_87',
 '30_AI4AM2024_Barnard_Amanda_49',
 '5_AI4AM2024_Trinquet_Victor_77',
 '32_AI4AM2024_Joshi_Kavita_12',
 '42_AI4AM2024_Grzelczak_Marek_34',
 '40_AI4AM2024_Colnaghi_Timoteo_65',
 '89_AI4AM2024_Botti',
 '84_AI4AM2024_Carrasquilla',
 '6_AI4AM2024_Lazarev_Mikhail_80',
 '17_AI4AM2024_Riu_Vicente_Jordi_67',
 '67_AI4AM2024_Alcon_Isaac_3',
 '8_AI4AM2024_Pena_Corredor_Antonio_19',
 '41_AI4AM2024_Heras-Domingo_Javier_22',
 '90_AI4AM2024_Schleder_Gabriel_42',
 '48_AI4AM2024_Pozdnyakov_Sergey_15',
 '60_AI4AM2024_Marco_Moors_50',
 '64_AI4AM2024_Vozza_Mario_59',
 '45_AI4AM2024_Hakim_AMA