In [None]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

# Export the API key imported from `utils` into this process's environment.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [1]:
from contextual_rag import ContextualRAG


In [2]:
# Instantiate the pipeline, then ingest the PDF files found under ./data.
rag = ContextualRAG()
chunks, chunk_metadata = rag.upload_files("./data")
 

Using device: cuda
Loading embedding model: BAAI/bge-base-en-v1.5
Loading PDF files from ./data
Loaded 20 documents
Created 127 text chunks from 2 documents


In [3]:
   
# Keyword extraction over the chunks.
keywords = rag.get_keywords(chunks)

# Chunk embeddings, returned together with the matching token-id lists.
chunk_embeddings, token_ids_list = rag.get_chunk_emb(chunks)
 

Extracting keywords: 100%|██████████████████| 127/127 [00:00<00:00, 5533.44it/s]


Extracted 100 unique keywords across 127 chunks


Generating chunk embeddings: 100%|████████████| 127/127 [00:06<00:00, 18.55it/s]

Generated token embeddings for 127 chunks





In [4]:
# Spot-check the extracted keywords at index 100
# (assumes `keywords` holds one keyword list per chunk — TODO confirm).
keywords[100]

['wave',
 'atoms',
 'electrons',
 'vacancies',
 'lattice',
 'atomic',
 'states',
 'site',
 'vacancy',
 'wave function',
 'function',
 'sites',
 'homo',
 'mos',
 'structure']

In [5]:
# Build keyword embeddings that carry their context and document ID.
keyword_context_doc_embeddings = rag.get_keyword_emb(
    keywords,
    chunk_embeddings,
    token_ids_list,
    chunks,
    chunk_metadata,
)

Generating keyword embeddings: 100%|██████████| 127/127 [00:01<00:00, 87.33it/s]

Generated embeddings for 3067 keyword-context-document pairs
Average 31.42 contexts per keyword





In [6]:
# Gather the unique-keyword statistics, then report how many entries came back.
keyword_stats = rag.get_all_unique_keywords()
print(f"Total unique keywords: {len(keyword_stats)}")


Total unique keywords: 98

Top 10 keywords by document coverage:
'con ﬁguration': 2 documents, 21 total occurrences
'ﬁguration': 2 documents, 21 total occurrences
'properties': 2 documents, 55 total occurrences
'defects': 2 documents, 144 total occurrences
'con': 2 documents, 37 total occurrences
'machine': 2 documents, 42 total occurrences
'machine learning': 2 documents, 42 total occurrences
'materials': 2 documents, 125 total occurrences
'learning': 2 documents, 60 total occurrences
'open': 2 documents, 17 total occurrences
Total unique keywords: 98


In [7]:
    
# Create the vector database from the keyword embeddings, the chunks and their metadata.
rag.make_db(keyword_context_doc_embeddings, chunks, chunk_metadata)

Created keyword vector database with 3067 entries


Generating full chunk embeddings: 100%|███████| 127/127 [00:06<00:00, 20.28it/s]

Generated embeddings for 127 chunks
Created chunk vector database with 127 entries





In [8]:
    
# Save the system to disk.
# NOTE(review): the target is `.data/...` (a dot-prefixed directory) while the PDFs
# were read from `./data` — confirm this location is intended and not a typo.
rag.save(".data/context_rag_system")

Saved RAG system to .data/context_rag_system


In [9]:
# Send one sample question through the retriever and print every returned hit.
query = "What is machine learning?"
results = rag.process_query(
    query,
    top_k=3,
    keyword_weight=0.7,  # emphasize keyword matches
    chunk_weight=0.3,    # while still accounting for whole-chunk similarity
)

print("\nHybrid Query Results:")
for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank} (Score: {hit['score']:.4f}):")
    print(f"Document ID: {hit['doc_id']}")
    print(f"Page: {hit['page_num']}")
    print(f"Keywords: {', '.join(hit['keywords'])}")
    print(f"Relevant terms: {', '.join(hit['relevant_terms'])}")
    # Only the first 200 characters of the chunk are shown.
    print(f"Text: {hit['chunk_text'][:200]}...")


Query terms: ['machine', 'learning', 'machine learning']

Hybrid Query Results:

Result 1 (Score: 0.2905):
Document ID: s41699-023-00369-1.pdf
Page: 0
Keywords: model, datasets, pristine, database, available, data, defect, learning, dft, material, materials, machine learning, wse, machine, properties
Relevant terms: machine, learning, machine learning
Text: materials are notoriously dif ﬁcult to model. Thus, it is very tempting to apply machine learning methods to such systems.
Unfortunately, there is only a handful of machine learning-friendly material ...

Result 2 (Score: 0.2902):
Document ID: s41699-023-00369-1.pdf
Page: 0
Keywords: defects, model, high, huang, datasets, property, learning, structure, materials, methods, machine learning, machine, based, open, properties
Relevant terms: machine, learning, machine learning
Text: ARTICLE OPEN
Unveiling the complex structure-property correlation of defects
in 2D materials based on high throughput datasets
Pengru Huang1, Ruslan Lukin2,

In [13]:
def query_searching(query, top_k=3, keyword_weight=0.8, chunk_weight=0.2):
    """Run `query` through `rag.process_query` and print the returned results.

    Uses the notebook-level `rag` object, so the cell creating it must have
    been executed first.

    Args:
        query: Question text forwarded to `rag.process_query`.
        top_k: Forwarded as `top_k`; defaults to 3.
        keyword_weight: Forwarded as `keyword_weight`; the default of 0.8
            emphasizes keyword matches.
        chunk_weight: Forwarded as `chunk_weight`; the default of 0.2 keeps
            whole-chunk similarity as a secondary signal.

    Returns:
        None. For each result the score, document ID, page, keywords, relevant
        terms and the first 200 characters of the chunk text are printed.
    """
    results = rag.process_query(
        query,
        top_k=top_k,
        keyword_weight=keyword_weight,
        chunk_weight=chunk_weight,
    )

    print(f"\nHybrid Query Results: \n ***  {query}")
    for rank, result in enumerate(results, start=1):
        print(f"\nResult {rank} (Score: {result['score']:.4f}):")
        print(f"Document ID: {result['doc_id']}")
        print(f"Page: {result['page_num']}")
        print(f"Keywords: {', '.join(result['keywords'])}")
        print(f"Relevant terms: {', '.join(result['relevant_terms'])}")
        print(f"Text: {result['chunk_text'][:200]}...")

In [11]:
import pandas as pd

# Load the question/answer CSV.
df_ground_true = pd.read_csv("../data/data_rag/QA_ai4mat_2articles.csv")

# Pull the two columns out as plain Python lists.
questions, answers = (
    df_ground_true[column].values.tolist() for column in ("question", "answer")
)

In [14]:
# Print the retrieval results for each loaded question in turn.
for question in questions:
    query_searching(question)

Query terms: ['materials', 'dataset']

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 (Score: 0.1973):
Document ID: s41524-023-01062-z.pdf
Page: 8
Keywords: density, ﬁguration, information, con ﬁguration, used, learning, materials, machine, machine learning, phys, mater, structure, open, rev
Relevant terms: materials
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python library for materials analysis. Comput. Mat...

Result 2 (Score: 0.1971):
Document ID: s41699-023-00369-1.pdf
Page: 1
Keywords: defects, huang, interaction, datasets, published, defect, formation energy, space, learning, vacancies, materials, machine learning, machine, partnership
Relevant terms: materials, dataset
Text: edge about the amplitude of the interaction between the defects
for each defect type. The formation energy for V3 defects
composed of one Mo vacancy and two S vacanc

Query terms: ['limitations', 'srgnn']

Hybrid Query Results: 
 ***  * What are the limitations of SRGNN?

Result 1 (Score: 0.2084):
Document ID: s41524-023-01062-z.pdf
Page: 0
Keywords: neural, nature, networks, singapore, published, university, graph, materials, graph neural, neural networks, properties, atomic, based, partnership
Relevant terms: 
Text: based descriptors. Graph neural networks have several valuable
properties that make them uniquely suitable for modeling atomic
systems: invariance to permutations, rotations, and translation;
1Institu...

Result 2 (Score: 0.1951):
Document ID: s41524-023-01062-z.pdf
Page: 4
Keywords: supplementary, model, prediction, quantum, full, sparse representation, con ﬁgurations, representation, ﬁgurations, dataset, megnet, wse, sparse, low, combined
Relevant terms: 
Text: prediction quality. The prediction quality for MoS 2and WSe 2is
improved by a factor of 1.3 –4.8, but this is outweighted by a factor
of 1.06 –1.15 increase in MAE for the oth

Query terms: ['srgnn', 'material', 'quality', 'property']

Hybrid Query Results: 
 ***  * What was the quality of SRGNN for each property and material?

Result 1 (Score: 0.2035):
Document ID: s41524-023-01062-z.pdf
Page: 4
Keywords: supplementary, defects, structures, npj, available, published, space, computational, fig, materials, partnership
Relevant terms: material
Text: on the defects to conserve space. Drawings of example structures with 8 × 8 supercells are available in Supplementary Fig. 1.N. Kazeev et al.
5
Published in partnership with the Shanghai Institute of ...

Result 2 (Score: 0.1953):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: ﬁguration, con ﬁguration, function, band, defect, energy, fig, large, wave function, vacancies, wave, sites, con, levels, states
Relevant terms: 
Text: vacancies is smaller than that of the third con ﬁguration, the two
vacancies occupy lattice sites where there is a knot of their wavefunctions. The wave functions of the HOMO retain that

In [13]:
def query_searching(query, mode: str = 'keyword'):
    """Run `query` through `rag.process_query` and print the returned results.

    Uses the notebook-level `rag` object, so the cell creating it must have
    been executed first.

    Args:
        query: Question text forwarded to `rag.process_query`.
        mode: Forwarded as the `mode` keyword argument; defaults to 'keyword'.
            NOTE(review): assumes `process_query` accepts a `mode` keyword —
            confirm against `contextual_rag.ContextualRAG`.

    Returns:
        None. For each result the score, document ID, page, keywords, relevant
        terms and the first 200 characters of the chunk text are printed.
    """
    # The original call contained `mode: str = 'keyword'`, which is parameter
    # annotation syntax and a SyntaxError inside a call; it is now a real
    # parameter of this function, forwarded as a plain keyword argument.
    results = rag.process_query(
        query,
        top_k=3,
        mode=mode,
        keyword_weight=0.8,  # emphasize keyword matches
        chunk_weight=0.2,    # but also consider whole-chunk similarity
    )

    print(f"\nHybrid Query Results: \n ***  {query}")
    for rank, result in enumerate(results, start=1):
        print(f"\nResult {rank} (Score: {result['score']:.4f}):")
        print(f"Document ID: {result['doc_id']}")
        print(f"Page: {result['page_num']}")
        print(f"Keywords: {', '.join(result['keywords'])}")
        print(f"Relevant terms: {', '.join(result['relevant_terms'])}")
        print(f"Text: {result['chunk_text'][:200]}...")

In [14]:
# Print the retrieval results for each loaded question in turn.
for question in questions:
    query_searching(question)

Query terms: ['materials', 'dataset']

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 (Score: 0.1973):
Document ID: s41524-023-01062-z.pdf
Page: 8
Keywords: density, ﬁguration, information, con ﬁguration, used, learning, materials, machine, machine learning, phys, mater, structure, open, rev
Relevant terms: materials
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python library for materials analysis. Comput. Mat...

Result 2 (Score: 0.1971):
Document ID: s41699-023-00369-1.pdf
Page: 1
Keywords: defects, huang, interaction, datasets, published, defect, formation energy, space, learning, vacancies, materials, machine learning, machine, partnership
Relevant terms: materials, dataset
Text: edge about the amplitude of the interaction between the defects
for each defect type. The formation energy for V3 defects
composed of one Mo vacancy and two S vacanc

Query terms: ['limitations', 'srgnn']

Hybrid Query Results: 
 ***  * What are the limitations of SRGNN?

Result 1 (Score: 0.2084):
Document ID: s41524-023-01062-z.pdf
Page: 0
Keywords: neural, nature, networks, singapore, published, university, graph, materials, graph neural, neural networks, properties, atomic, based, partnership
Relevant terms: 
Text: based descriptors. Graph neural networks have several valuable
properties that make them uniquely suitable for modeling atomic
systems: invariance to permutations, rotations, and translation;
1Institu...

Result 2 (Score: 0.1951):
Document ID: s41524-023-01062-z.pdf
Page: 4
Keywords: supplementary, model, prediction, quantum, full, sparse representation, con ﬁgurations, representation, ﬁgurations, dataset, megnet, wse, sparse, low, combined
Relevant terms: 
Text: prediction quality. The prediction quality for MoS 2and WSe 2is
improved by a factor of 1.3 –4.8, but this is outweighted by a factor
of 1.06 –1.15 increase in MAE for the oth

Query terms: ['srgnn', 'material', 'quality', 'property']

Hybrid Query Results: 
 ***  * What was the quality of SRGNN for each property and material?

Result 1 (Score: 0.2035):
Document ID: s41524-023-01062-z.pdf
Page: 4
Keywords: supplementary, defects, structures, npj, available, published, space, computational, fig, materials, partnership
Relevant terms: material
Text: on the defects to conserve space. Drawings of example structures with 8 × 8 supercells are available in Supplementary Fig. 1.N. Kazeev et al.
5
Published in partnership with the Shanghai Institute of ...

Result 2 (Score: 0.1953):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: ﬁguration, con ﬁguration, function, band, defect, energy, fig, large, wave function, vacancies, wave, sites, con, levels, states
Relevant terms: 
Text: vacancies is smaller than that of the third con ﬁguration, the two
vacancies occupy lattice sites where there is a knot of their wavefunctions. The wave functions of the HOMO retain that