In [None]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

# Publish the key (imported from the project-local `utils` module) as an
# environment variable. NOTE(review): nothing in the visible cells reads it
# directly; presumably the OpenAI/langchain client picks it up — confirm.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY 

In [1]:
from contextual_rag import ContextualRAG


In [2]:
# Example usage: instantiate the project's ContextualRAG pipeline.
rag = ContextualRAG()
    
# Process PDF files found under ./data. The call returns the text chunks and
# their metadata, which are handed to rag.make_db() afterwards.
chunks, chunk_metadata = rag.upload_files("./data")
 

Using device: cuda
Loading embedding model: BAAI/bge-base-en-v1.5
Loading PDF files from ./data
Loaded 20 documents
Created 127 text chunks from 2 documents


In [3]:
   
# Make Vector DB: build the vector databases from the chunks and metadata
# returned by rag.upload_files().
rag.make_db(chunks, chunk_metadata)
    
# Generate chunk embeddings and token IDs
# (disabled: standalone embedding call kept for reference only)
#chunk_embeddings, token_ids_list = rag.get_chunk_emb(chunks)
 

Creating vector databases...
Extracting keywords from chunks...


Extracting keywords: 100%|██████████████████| 127/127 [00:00<00:00, 9167.80it/s]


Extracted 100 unique keywords across 127 chunks
Generating token-level embeddings...


Generating chunk embeddings: 100%|████████████| 127/127 [00:08<00:00, 15.21it/s]


Generated token embeddings for 127 chunks
Generated token IDs for 127 chunks
Token embedding shape: torch.Size([213, 768]) and Type: <class 'torch.Tensor'>
Generating full chunk embeddings...
Generated 127 chunk embeddings using mean pooling
Generating keyword embeddings...


Generating keyword embeddings: 100%|██████████| 127/127 [00:05<00:00, 24.01it/s]


Generated embeddings for 3034 keyword-context-document pairs
Average 30.56 contexts per keyword
Generating keyword-based chunk embeddings...


Generating chunk embeddings: 100%|████████████| 127/127 [00:03<00:00, 34.18it/s]


Generated embeddings for 127 chunks
Found keywords in 127 chunks
Average 24.06 keywords per chunk with keywords
Creating keyword embedding database...
Creating chunk embedding database...
Creating keyword-based chunk embedding database...
Creating vector database for efficient similarity search...
Vector databases created with 3034 keyword embeddings, 127 chunk embeddings, and 127 keyword-chunk embeddings.


In [4]:
# Fetch the keyword statistics from the RAG system.
# NOTE(review): the recorded output also shows a top-10 table and the total
# printed twice, so get_all_unique_keywords() presumably prints its own
# summary in addition to the print below — confirm.
keyword_stats = rag.get_all_unique_keywords()
    

    
# Print a summary: number of entries in the returned keyword collection.
print(f"Total unique keywords: {len(keyword_stats)}")


Total unique keywords: 100

Top 10 keywords by document coverage:
'ﬁguration': 2 documents, 21 total occurrences
'properties': 2 documents, 54 total occurrences
'defects': 2 documents, 144 total occurrences
'machine': 2 documents, 45 total occurrences
'machine learning': 2 documents, 43 total occurrences
'materials': 2 documents, 125 total occurrences
'learning': 2 documents, 60 total occurrences
'huang': 2 documents, 16 total occurrences
'open': 2 documents, 17 total occurrences
'space': 2 documents, 26 total occurrences
Total unique keywords: 100


In [5]:
    
# Save the system (embeddings and vectorizer) to disk.
# NOTE(review): the target is ".data/..." (a hidden directory relative to the
# working directory), whereas the PDFs are loaded from "./data" — confirm this
# path is intentional and not a typo before relying on it elsewhere.
rag.save(".data/context_rag_system_v2")

Saving model to .data/context_rag_system_v2...
Model saved successfully to .data/context_rag_system_v2
- Saved 3034 keyword embeddings
- Saved 127 chunk embeddings
- Saved 127 keyword-chunk embeddings
- Saved vectorizer: True
- Embedding model name: Not specified


In [6]:
# Run one hybrid query. The three weights below sum to 1.0.
# NOTE(review): in the recorded output, score == original_score * 0.4 for the
# keyword_chunk hits, so the weight appears to scale each hit's raw score — confirm.
query = "What is machine learning?"
results = rag.process_query(
        query, 
        top_k=5, 
        keyword_weight=0.3,  # weight for keyword-level matches
        chunk_weight=0.3,     # weight for whole-chunk similarity
        keyword_chunk_weight=0.4,  # weight for keyword-based chunk matches (largest of the three)
    )

Retrieving: 1 keyword results, 1 chunk results, 3 keyword-chunk results
Searching keyword database with 3 query terms...
Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results


In [7]:
    
# Print a short report for every hit. 'score', 'original_score' and 'text' are
# indexed directly (a missing key raises KeyError); the other fields fall back
# to an empty string when the key is absent.
print("\nHybrid Query Results:")
for i, result in enumerate(results):
    # 1-based rank followed by both scores, formatted to four decimals.
    print(f"\nResult {i+1} Score: {result['score']:.4f}: Original_score: {result['original_score']:.4f}")
    print(f"Document ID: {result.get('doc_id', '')}")
    print(f"Type of searching: {result.get('result_type', '')}")
    # A key that is present but None prints as "None", not as the '' default.
    print(f"Keyword: {result.get('keyword','')}")
    # Preview only the first 200 characters of the chunk text.
    print(f"Text: {result['text'][:200]}...")


Hybrid Query Results:

Result 1 Score: 0.2725: Original_score: 0.6812
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: development of machine learning methods together have led toan exploding growth of in-silico material exploration in the areasof 2D materials, catalysts, batteries, photovoltaics, etc.
14–18.
Still, g...

Result 2 Score: 0.2724: Original_score: 0.6810
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: engage/chemrxiv/article-details/63b7181c1f24031e9a1789e0 .
28. Ward, L. et al. Matminer: An open source toolkit for materials data mining.
Comput. Mater. Sci. 152,6 0 –69 (2018).
29. Breiman, L. Rando...

Result 3 Score: 0.2711: Original_score: 0.6778
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python l

In [8]:
# Display the raw result dicts (id, score, original_score, result_type, text, ...)
# for inspection; this echoes the full chunk text of every hit.
results

[{'id': 'kw_chunk_72',
  'score': 0.27246263027191164,
  'original_score': 0.681156575679779,
  'result_type': 'keyword_chunk',
  'text': 'development of machine learning methods together have led toan exploding growth of in-silico material exploration in the areasof 2D materials, catalysts, batteries, photovoltaics, etc.\n14–18.\nStill, great dif ﬁculties arise when trying to apply machine\nlearning to predict properties of defects, which may be due to the\nlack of defect datasets and the challenge in the prediction ofquantum states. Also, there is a great deal of uncertainty when themachine learning of “black box ”nature encounters the nonlinear\nquantum properties in defects. Due to these reasons, there have\nbeen few studies of machine learning applied to defects in solids.The reported studies mainly focus on the prediction of the key\n1Institute for Functional Intelligent Materials, National University of Singapore, 117544 Singapore, Singapore.2Innopolis University, Universitetska

In [9]:
def query_searching(
    query,
    *,
    top_k=5,
    keyword_weight=0.3,
    chunk_weight=0.3,
    keyword_chunk_weight=0.4,
    retriever=None,
):
    """Run one hybrid retrieval query, print a readable report, and return the hits.

    The defaults reproduce the values that used to be hard-coded here, so
    ``query_searching(q)`` prints exactly what it printed before.

    Args:
        query: Query text forwarded to ``process_query``.
        top_k: Forwarded as ``top_k`` (number of results requested).
        keyword_weight: Forwarded weight for keyword-level matches.
        chunk_weight: Forwarded weight for whole-chunk similarity.
        keyword_chunk_weight: Forwarded weight for keyword-based chunk matches.
        retriever: Object exposing ``process_query``. When ``None`` the
            notebook-level ``rag`` instance is used.

    Returns:
        The results returned by ``process_query`` (previously discarded), so
        callers can evaluate them, e.g. against ground-truth answers.

    Raises:
        KeyError: If a result lacks 'score', 'original_score' or 'text'.
    """
    # Fall back to the notebook-global pipeline only when none is injected;
    # injecting one keeps the function usable (and testable) on its own.
    engine = rag if retriever is None else retriever

    results = engine.process_query(
        query,
        top_k=top_k,
        keyword_weight=keyword_weight,
        chunk_weight=chunk_weight,
        keyword_chunk_weight=keyword_chunk_weight,
    )

    print(f"\nHybrid Query Results: \n ***  {query}")
    for i, result in enumerate(results):
        # 1-based rank followed by both scores, formatted to four decimals.
        print(f"\nResult {i+1} Score: {result['score']:.4f}: Original_score: {result['original_score']:.4f}")
        print(f"Document ID: {result.get('doc_id', '')}")
        print(f"Type of searching: {result.get('result_type', '')}")
        print(f"Keyword: {result.get('keyword','')}")
        # Preview only the first 200 characters of the chunk text.
        print(f"Text: {result['text'][:200]}... \n\n")

    return results
        

In [10]:
import pandas as pd

# Load the ground-truth question/answer table.
df_ground_true = pd.read_csv("../data/data_rag/QA_ai4mat_2articles.csv")

# Pull both columns out as plain Python lists in one pass
# (question column first, then answer column).
questions, answers = (
    df_ground_true[column].values.tolist() for column in ('question', 'answer')
)

In [11]:
# Run every ground-truth question through query_searching(), which prints the
# retrieved hits for each one.
for q in questions:
    # (disabled debug header)
    #print("****",q)
    query_searching(q)
    # (disabled debug separator)
    #print(8*".-.", "\n")

Retrieving: 1 keyword results, 1 chunk results, 3 keyword-chunk results
Searching keyword database with 2 query terms...
Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 Score: 0.3179: Original_score: 0.7949
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: within hours, such computations are not scalable. The propertiesof each new defect has to be calculated from scratch. The state-of-the-art research paradigm integrated of high throughput simula-
tions... 



Result 2 Score: 0.3169: Original_score: 0.7924
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python library for materials analysis. Comput. Mat... 



Result 3 Score: 0.3161: Original_score

Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * How to browse the dataset?

Result 1 Score: 0.2611: Original_score: 0.6528
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: DATA AVAILABILITY
The datasets analyzed during this study are available at https://
research.constructor.tech/p/2d-defects-prediction .
CODE AVAILABILITY
Code used to calculate the results of this stu... 



Result 2 Score: 0.2563: Original_score: 0.6408
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: defect properties, the structure of the datasets should be carefully
designed. Our datasets are established in two groups, including
one structured and the other dispersive (Fig. 1b). The structured
d... 



Result 3 Score: 0.2500: Original_score: 0.6249
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: small subspace of the whole defe

Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * How to cite the dataset?

Result 1 Score: 0.2664: Original_score: 0.6660
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: combined test dataset. Error indicates the standard deviation of the results obtained from 12 experiments with the same datasets and model parameter s, but
different random initialization. Bold font i... 



Result 2 Score: 0.2632: Original_score: 0.6580
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: indicated otherwise in a credit line to the material. If material is not included in thearticle ’s Creative Commons license and your intended use is not permitted by statutory
regulation or exceeds th... 



Result 3 Score: 0.2601: Original_score: 0.6502
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: graphs. Published at the International Conference on Learning 

Retrieved 5 results

Hybrid Query Results: 
 ***  * How to run the code?

Result 1 Score: 0.2292: Original_score: 0.5731
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: DATA AVAILABILITY
The datasets analyzed during this study are available at https://
research.constructor.tech/p/2d-defects-prediction .
CODE AVAILABILITY
Code used to calculate the results of this stu... 



Result 2 Score: 0.2229: Original_score: 0.5573
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: 38. Kresse, G. & Furthmüller, J. Ef ﬁcient iterative schemes for ab initio total-energy
calculations using a plane-wave basis set. Phys. Rev. B 54, 11169 (1996).
39. Kresse, G. & Furthmüller, J. Ef ﬁc... 



Result 3 Score: 0.2227: Original_score: 0.5568
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: University.
AUTHOR CONTRIBUTIONS
P.H., A.U., and K.S.N. conceived the research; P.H. and R.L. done mos

Searching keyword database with 3 query terms...
Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * Which materials has the SRGNN been trained on?

Result 1 Score: 0.3031: Original_score: 0.7578
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: section, we outline the message-passing neural network proposedby Battaglia et al.
41. Those became rather popular for analyzing
material structure11.
To prepare a training sample, a graph is construc... 



Result 2 Score: 0.3008: Original_score: 0.7520
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: 13. Schütt, K. T., Sauceda, H. E., Kindermans, P.-J., Tkatchenko, A. & Müller, K.-R.
Schnet –a deep learning architecture for molecules and materials. J. Chem. Phys.
148, 241722 (2018).
14. Gasteiger,... 



Result 3 Score: 0.2976: Original_score: 0.7440
Document ID: s41699-023-00369-1.pdf
Type of searchin

Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * How much computational resources does SRGNN require for training?

Result 1 Score: 0.2989: Original_score: 0.7472
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: prediction quality. The prediction quality for MoS 2and WSe 2is
improved by a factor of 1.3 –4.8, but this is outweighted by a factor
of 1.06 –1.15 increase in MAE for the other materials. Coinciden-
... 



Result 2 Score: 0.2944: Original_score: 0.7360
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: section, we outline the message-passing neural network proposedby Battaglia et al.
41. Those became rather popular for analyzing
material structure11.
To prepare a training sample, a graph is construc... 



Result 3 Score: 0.2920: Original_score: 0.7299
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None

Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results

Hybrid Query Results: 
 ***  * How well do SRGNN results correspond to experiment?

Result 1 Score: 0.2932: Original_score: 0.7331
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: h-BN High 244 ± 24 227 ± 6 233 ± 4 208±3 260 ± 14
MoS 2 Low 187 ± 180 46 ± 42 30 ± 2 26.7 ± 0.8 5.7±0.2
WSe 2 Low 236 ± 224 64 ± 46 32 ± 5 18.3 ± 0.6 8.1±0.6
Sparse (MEGNet) is our representation impl... 



Result 2 Score: 0.2890: Original_score: 0.7224
Document ID: s41524-023-01062-z.pdf
Type of searching: keyword_chunk
Keyword: None
Text: based descriptors. Graph neural networks have several valuable
properties that make them uniquely suitable for modeling atomic
systems: invariance to permutations, rotations, and translation;
1Institu... 



Result 3 Score: 0.2888: Original_score: 0.7221
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: Table 1