In [None]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

# Publish the key imported from the local `utils` module through the
# OPENAI_API_KEY environment variable for the rest of this kernel session.
# NOTE(review): presumably consumed by the OpenAI client libraries — confirm;
# also verify that `utils` does not hardcode the secret in version control.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY 

In [2]:
from contextual_rag import ContextualRAG


In [3]:
# Example usage: build the pipeline object imported from `contextual_rag`.
rag = ContextualRAG()
    
# Process PDF files: `upload_files` is pointed at the PDF folder and hands back
# two values, kept here as the text chunks and their per-chunk metadata.
# NOTE(review): the query outputs further down list identical hits under two
# different file names, which suggests the same paper is stored twice in this
# folder — verify and de-duplicate the input directory.
chunks, chunk_metadata = rag.upload_files("./data/data_rag/pdf")
 

Using device: cuda
Loading embedding model: BAAI/bge-base-en-v1.5
Loading PDF files from ./data/data_rag/pdf
Loaded 30 documents
Created 193 text chunks from 3 documents


In [4]:
   
# Extract keywords from each chunk.
# The result is indexable and each entry is a list of keyword strings
# (presumably one entry per chunk, in chunk order — confirm in contextual_rag).
keywords = rag.get_keywords(chunks)
    
# Generate chunk embeddings and token IDs; both results are passed on to
# `rag.get_keyword_emb` in a later cell.
chunk_embeddings, token_ids_list = rag.get_chunk_emb(chunks)
 

Extracting keywords: 100%|██████████████████| 193/193 [00:00<00:00, 3826.00it/s]


Extracted 127 unique keywords across 193 chunks


Generating chunk embeddings: 100%|████████████| 193/193 [00:09<00:00, 19.45it/s]

Generated token embeddings for 193 chunks





In [7]:
# Spot-check: show the keyword list stored at index 100.
keywords[100]

['eos',
 'crystals',
 'density',
 'low',
 'datasets',
 'prediction',
 'sparse',
 'properties',
 'information',
 'homo lumo',
 'nature',
 'space',
 'combined',
 'full',
 'data']

In [6]:
# Generate keyword embeddings with context and document ID.
# Combines the per-chunk keywords, chunk embeddings and token IDs produced
# above with the raw chunks and their metadata; the result is later handed to
# `rag.make_db`.
keyword_context_doc_embeddings = rag.get_keyword_emb(
    keywords, chunk_embeddings, token_ids_list, chunks, chunk_metadata
    )

Generating keyword embeddings: 100%|██████████| 193/193 [00:02<00:00, 94.76it/s]

Generated embeddings for 4633 keyword-context-document pairs
Average 37.52 contexts per keyword





In [9]:
# Collect statistics for every unique keyword known to the pipeline.
# The returned object is a dict keyed by keyword string (its `.keys()` is
# displayed in a later cell).
keyword_stats = rag.get_all_unique_keywords()
    

    
# Print a summary: number of distinct keywords.
print(f"Total unique keywords: {len(keyword_stats)}")


Total unique keywords: 124

Top 10 keywords by document coverage:
'properties': 3 documents, 75 total occurrences
'con ﬁguration': 3 documents, 34 total occurrences
'ﬁguration': 3 documents, 34 total occurrences
'defects': 3 documents, 198 total occurrences
'con': 3 documents, 62 total occurrences
'machine': 3 documents, 61 total occurrences
'machine learning': 3 documents, 61 total occurrences
'materials': 3 documents, 173 total occurrences
'learning': 3 documents, 90 total occurrences
'space': 3 documents, 35 total occurrences
Total unique keywords: 124


In [10]:
# Display every unique keyword (the dict keys of `keyword_stats`).
keyword_stats.keys()

dict_keys(['properties', 'con ﬁguration', 'ﬁguration', 'defects', 'con', 'machine', 'machine learning', 'materials', 'learning', 'space', 'sparse representation', 'lattice', 'crystals', 'electronic', 'sparse', 'defect', 'https', 'network', 'neural network', 'npj', 'prediction', 'methods', 'atomic', 'state', 'quantum', 'atom', 'property', 'point', 'crystal', 'point defects', 'density', 'calculations', 'material', 'networks', 'neural networks', 'graph neural', 'megnet', 'dft', 'used', 'graph', 'datasets', 'database', 'high', 'wse', 'use', 'neural', 'nature', 'performance', 'based', 'partnership', 'published', 'information', 'edge', 'features', 'atoms', 'representation', 'structure', 'wave function', 'function', 'electrons', 'wave', 'states', 'fig', 'energy', 'band', 'sites', 'lumo', 'con ﬁgurations', 'gap', 'homo', 'ﬁgurations', 'formation', 'structures', 'pristine', 'single', 'data', 'dataset', 'interaction', 'number', 'vacancies', 'using', 'eos', 'vacancy', 'nodes', 'site', 'full', 'mo

In [11]:
    
# Create vector database from the keyword/context/document embeddings plus the
# chunks and their metadata. Per the logged output this builds both a keyword
# vector database and a chunk vector database.
rag.make_db(keyword_context_doc_embeddings, chunks, chunk_metadata)

Created keyword vector database with 4633 entries


Generating full chunk embeddings: 100%|███████| 193/193 [00:09<00:00, 20.39it/s]

Generated embeddings for 193 chunks
Created chunk vector database with 193 entries





In [12]:
    
# Save the system to disk.
# NOTE(review): the target starts with ".data/" (a hidden directory relative to
# the working directory), whereas every other path in this notebook starts with
# "./data/". This looks like a typo for "./data/context_rag_system" — confirm
# against whatever code loads the saved system before changing it.
rag.save(".data/context_rag_system")

Saved RAG system to .data/context_rag_system


In [15]:
# Send one sample question through the hybrid retriever and print each hit.
query = "What is machine learning?"
results = rag.process_query(
    query,
    top_k=3,
    keyword_weight=0.7,  # keyword matches carry most of the weight
    chunk_weight=0.3,    # whole-chunk similarity contributes the remainder
)

print("\nHybrid Query Results:")
# Number the hits from 1 so the printed rank needs no arithmetic.
for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank} (Score: {hit['score']:.4f}):")
    print(f"Document ID: {hit['doc_id']}")
    print(f"Page: {hit['page_num']}")
    print(f"Keywords: {', '.join(hit['keywords'])}")
    print(f"Relevant terms: {', '.join(hit['relevant_terms'])}")
    # Only the first 200 characters of the chunk are shown as a preview.
    print(f"Text: {hit['chunk_text'][:200]}...")


Query terms: ['machine', 'machine learning', 'learning']

Hybrid Query Results:

Result 1 (Score: 0.3020):
Document ID: s41699-023-00369-1.pdf
Page: 1
Keywords: defect, learning, formation energy, machine, materials, partnership, datasets, vacancies, interaction, defects, fig, space, machine learning, edge
Relevant terms: machine, machine learning, learning
Text: edge about the amplitude of the interaction between the defects
for each defect type. The formation energy for V3 defects
composed of one Mo vacancy and two S vacancies span the
Fig. 1 Machine learnin...

Result 2 (Score: 0.2905):
Document ID: s41699-023-00369-1.pdf
Page: 0
Keywords: available, pristine, defect, learning, data, machine, materials, database, dft, material, datasets, properties, machine learning, wse, model
Relevant terms: machine, machine learning, learning
Text: materials are notoriously dif ﬁcult to model. Thus, it is very tempting to apply machine learning methods to such systems.
Unfortunately, there is onl

In [16]:
def query_searching(query, top_k=3, keyword_weight=0.7, chunk_weight=0.3, preview_chars=200):
    """Run one hybrid retrieval query and print a readable summary of the hits.

    Uses the notebook-level ``rag`` object created in an earlier cell.

    Args:
        query: Question text forwarded to ``rag.process_query``.
        top_k: Forwarded as ``top_k`` to ``rag.process_query`` (default 3).
        keyword_weight: Forwarded as ``keyword_weight`` (default 0.7).
        chunk_weight: Forwarded as ``chunk_weight`` (default 0.3).
        preview_chars: How many leading characters of each hit's
            ``chunk_text`` to print (default 200).

    Returns:
        None. The summary is written to stdout.
    """
    results = rag.process_query(
        query,
        top_k=top_k,
        keyword_weight=keyword_weight,
        chunk_weight=chunk_weight,
    )

    print(f"\nHybrid Query Results: \n ***  {query}")
    for i, result in enumerate(results):
        print(f"\nResult {i+1} (Score: {result['score']:.4f}):")
        print(f"Document ID: {result['doc_id']}")
        print(f"Page: {result['page_num']}")
        print(f"Keywords: {', '.join(result['keywords'])}")
        print(f"Relevant terms: {', '.join(result['relevant_terms'])}")
        # The ellipsis is always appended, matching the original output format.
        print(f"Text: {result['chunk_text'][:preview_chars]}...")

In [17]:
import pandas as pd

# Load the ground-truth QA table and pull its two columns out as plain lists.
df_ground_true = pd.read_csv("./data/data_rag/QA_ai4mat_2articles.csv")

questions = df_ground_true["question"].tolist()
answers = df_ground_true["answer"].tolist()

In [18]:
# Run every ground-truth question through the retriever and print its hits.
for question in questions:
    query_searching(question)

Query terms: ['materials', 'dataset']

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 (Score: 0.2959):
Document ID: s41524-023-01062-z.pdf
Page: 8
Keywords: atom, phys, learning, ﬁguration, machine, materials, rev, mater, density, features, con ﬁguration, information, machine learning, structure
Relevant terms: materials
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python library for materials analysis. Comput. Mat...

Result 2 (Score: 0.2959):
Document ID: Sparse representation for machine learning the properties of defects in 2D materials.pdf
Page: 8
Keywords: atom, phys, learning, ﬁguration, machine, materials, rev, mater, density, features, con ﬁguration, information, machine learning, structure
Relevant terms: materials
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (p

Query terms: ['limitations', 'srgnn']

Hybrid Query Results: 
 ***  * What are the limitations of SRGNN?

Result 1 (Score: 0.2926):
Document ID: s41524-023-01062-z.pdf
Page: 4
Keywords: sparse representation, model, megnet, ﬁgurations, sparse, representation, features, combined, low, supplementary, quantum, con ﬁgurations, full, wse, prediction
Relevant terms: 
Text: prediction quality. The prediction quality for MoS 2and WSe 2is
improved by a factor of 1.3 –4.8, but this is outweighted by a factor
of 1.06 –1.15 increase in MAE for the other materials. Coinciden-
...

Result 2 (Score: 0.2926):
Document ID: Sparse representation for machine learning the properties of defects in 2D materials.pdf
Page: 4
Keywords: sparse representation, model, megnet, ﬁgurations, sparse, representation, features, combined, low, supplementary, quantum, con ﬁgurations, full, wse, prediction
Relevant terms: 
Text: prediction quality. The prediction quality for MoS 2and WSe 2is
improved by a factor of 1.3 –4.

Query terms: ['using srgnn', 'predicted', 'srgnn', 'using', 'properties', 'predicted using']

Hybrid Query Results: 
 ***  * Which properties can be predicted using SRGNN?

Result 1 (Score: 0.2953):
Document ID: s41524-023-01062-z.pdf
Page: 0
Keywords: graph neural, atomic, networks, materials, neural networks, based, nature, partnership, published, properties, graph, neural
Relevant terms: properties
Text: based descriptors. Graph neural networks have several valuable
properties that make them uniquely suitable for modeling atomic
systems: invariance to permutations, rotations, and translation;
1Institu...

Result 2 (Score: 0.2953):
Document ID: Sparse representation for machine learning the properties of defects in 2D materials.pdf
Page: 0
Keywords: graph neural, atomic, networks, materials, neural networks, based, nature, partnership, published, properties, graph, neural
Relevant terms: properties
Text: based descriptors. Graph neural networks have several valuable
properties that m