In [None]:
import os, json
#from langchain.chains import AnalyzeDocumentChain
from langchain_openai import ChatOpenAI

from utils import OPENAI_API_KEY

# Export the key imported from the local `utils` module through the process
# environment under the name OPENAI_API_KEY.
# NOTE(review): presumably this is how the OpenAI client used later picks up
# the credential — confirm, and make sure `utils` does not hard-code the key.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [1]:
from contextual_rag import ContextualRAG


In [2]:
# Example usage
# Create the ContextualRAG object that every later cell uses as `rag`.
rag = ContextualRAG()

# Process PDF files
# upload_files is pointed at the ./data directory and returns two values,
# kept here as `chunks` and `chunk_metadata` for the database-building cell.
chunks, chunk_metadata = rag.upload_files("./data")
 

Using device: cuda
Loading embedding model: BAAI/bge-base-en-v1.5
Loading PDF files from ./data
Loaded 20 documents
Created 127 text chunks from 2 documents


In [3]:
   
# Make Vector DB
# Feeds the chunks and their metadata from the upload step into rag.make_db.
# The recorded log of this cell reports keyword, chunk and keyword-chunk
# embedding databases being created.
rag.make_db(chunks, chunk_metadata)

# Generate chunk embeddings and token IDs
# (disabled call, kept for reference only — it is not executed)
#chunk_embeddings, token_ids_list = rag.get_chunk_emb(chunks)
 

Creating vector databases...
Extracting keywords from chunks...


Extracting keywords: 100%|██████████████████| 127/127 [00:00<00:00, 6396.83it/s]


Extracted 100 unique keywords across 127 chunks
Generating token-level embeddings...


Generating chunk embeddings: 100%|████████████| 127/127 [00:06<00:00, 19.16it/s]


Generated token embeddings for 127 chunks
Generating full chunk embeddings...


Generating full chunk embeddings: 100%|███████| 127/127 [00:06<00:00, 20.37it/s]


Generated embeddings for 127 chunks
Generating keyword embeddings...


Generating keyword embeddings: 100%|██████████| 127/127 [00:01<00:00, 79.98it/s]


Generated embeddings for 3044 keyword-context-document pairs
Average 30.56 contexts per keyword
Generating keyword-based chunk embeddings...


Generating chunk embeddings: 100%|████████████| 127/127 [00:02<00:00, 61.22it/s]

Generated embeddings for 127 chunks
Found keywords in 127 chunks
Average 24.06 keywords per chunk with keywords
Creating keyword embedding database...
Creating chunk embedding database...
Creating keyword-based chunk embedding database...
Creating vector database for efficient similarity search...
Vector databases created with 3044 keyword embeddings, 127 chunk embeddings, and 127 keyword-chunk embeddings.





In [4]:
# Preview only the first few keyword lists instead of echoing the whole
# nested list: the full repr is very long and buries the rest of the notebook.
# NOTE(review): each inner list presumably belongs to one chunk — confirm.
rag.keywords[:5]

[['ﬁguration',
  'properties',
  'defects',
  'machine',
  'machine learning',
  'materials',
  'learning',
  'sparse representation',
  'huang',
  'open',
  'space',
  'crystals',
  'electronic',
  'lattice',
  'sparse'],
 ['ﬁguration',
  'materials',
  'properties',
  'defect',
  'https',
  'npj',
  'prediction',
  'methods',
  'neural',
  'atomic',
  'computational',
  'state',
  'quantum',
  'material',
  'energy'],
 ['atomic',
  'quantum',
  'defects',
  'properties',
  'ﬁguration',
  'space',
  'materials',
  'crystals',
  'property',
  'point',
  'methods',
  'crystal',
  'structure',
  'material'],
 ['ﬁguration',
  'defects',
  'space',
  'methods',
  'quantum',
  'learning',
  'point defects',
  'crystals',
  'property',
  'large',
  'prediction',
  'point',
  'state',
  'density',
  'machine'],
 ['learning',
  'machine',
  'machine learning',
  'methods',
  'calculations',
  'materials',
  'material',
  'networks',
  'neural networks',
  'functional',
  'graph neural',
  'meg

In [5]:
# Fetch the collection of unique keywords from the RAG object.
# NOTE(review): the recorded output of this cell already shows a
# "Total unique keywords" line and a top-10 list before the print below,
# so this call appears to print its own summary — confirm whether the extra
# print is still wanted.
keyword_stats = rag.get_all_unique_keywords()



# Print a summary
print(f"Total unique keywords: {len(keyword_stats)}")


Total unique keywords: 100

Top 10 keywords by document coverage:
'ﬁguration': 2 documents, 21 total occurrences
'properties': 2 documents, 54 total occurrences
'defects': 2 documents, 144 total occurrences
'machine': 2 documents, 45 total occurrences
'machine learning': 2 documents, 43 total occurrences
'materials': 2 documents, 125 total occurrences
'learning': 2 documents, 60 total occurrences
'huang': 2 documents, 16 total occurrences
'open': 2 documents, 17 total occurrences
'space': 2 documents, 26 total occurrences
Total unique keywords: 100


In [6]:
    
# Save the system
# NOTE(review): the target starts with ".data/" (a hidden directory), whereas
# the PDFs were loaded from "./data" — confirm this is intentional and not a
# typo for "./data/context_rag_system_v2".
rag.save(".data/context_rag_system_v2")

Saving model to .data/context_rag_system_v2...
Model saved successfully to .data/context_rag_system_v2
- Saved 3044 keyword embeddings
- Saved 127 chunk embeddings
- Saved 127 keyword-chunk embeddings
- Saved vectorizer: True
- Embedding model name: Not specified


In [7]:
# Run one query with explicit weights for the three result types that the
# recorded log reports (keyword, chunk, keyword-chunk).
query = "What is machine learning?"
results = rag.process_query(
        query,
        top_k=5,
        keyword_weight=0.3,  # weight given to keyword results
        chunk_weight=0.3,     # weight given to whole-chunk results
        keyword_chunk_weight=0.4,  # weight given to keyword-chunk results (largest); the three sum to 1.0
    )

Retrieving: 2 keyword results, 2 chunk results, 1 keyword-chunk results
Searching keyword database with 3 query terms...
Searching chunk database...
Searching keyword-chunk database...
Retrieved 5 results


In [9]:
    
# Report every hit of the query above: rank, weighted and original score,
# source document, which search produced it, the matched keyword (if any)
# and the first 200 characters of the text.
print("\nHybrid Query Results:")
for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank} Score: {hit['score']:.4f}: Original_score: {hit['original_score']:.4f}")
    print(f"Document ID: {hit.get('doc_id', '')}")
    print(f"Type of searching: {hit.get('result_type', '')}")
    print(f"Keyword: {hit.get('keyword','')}")
    print(f"Text: {hit['text'][:200]}...")


Hybrid Query Results:

Result 1 Score: 0.2905: Original_score: 0.9683
Document ID: s41699-023-00369-1.pdf
Type of searching: chunk
Keyword: None
Text: materials are notoriously dif ﬁcult to model. Thus, it is very tempting to apply machine learning methods to such systems.
Unfortunately, there is only a handful of machine learning-friendly material ...

Result 2 Score: 0.2902: Original_score: 0.9674
Document ID: s41699-023-00369-1.pdf
Type of searching: chunk
Keyword: None
Text: ARTICLE OPEN
Unveiling the complex structure-property correlation of defects
in 2D materials based on high throughput datasets
Pengru Huang1, Ruslan Lukin2, Maxim Faleev2, Nikita Kazeev1,3, Abdalaziz ...

Result 3 Score: 0.0107: Original_score: 0.0356
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: learning
Text: the next generation of ( opto - ) electronic devices and other high technology applications. one of the most exciting characteristics of 2d crystals is the ability to tune thei

In [10]:
def query_searching(query, top_k=3, keyword_weight=0.8, chunk_weight=0.2,
                    keyword_chunk_weight=None):
    """Run a hybrid retrieval for ``query`` and print a short report of the hits.

    Uses the notebook-level ``rag`` object; it must exist before this function
    is called.

    Args:
        query: Question text forwarded to ``rag.process_query``.
        top_k: Number of results to request. Defaults to 3.
        keyword_weight: Weight passed for keyword results. Defaults to 0.8.
        chunk_weight: Weight passed for whole-chunk results. Defaults to 0.2.
        keyword_chunk_weight: Optional weight for keyword-chunk results. It is
            only forwarded when not ``None``, so the default call is exactly
            the call this function has always made.

    Returns:
        None. The report is written to stdout.

    Note:
        In the recorded run of this notebook the 0.8 / 0.2 call logged
        normalized weights of roughly 0.57 / 0.14 / 0.29, so an implicit
        keyword-chunk weight appears to be added and the three rescaled.
        Pass ``keyword_chunk_weight`` explicitly to control the split.
    """
    search_kwargs = {
        'top_k': top_k,
        'keyword_weight': keyword_weight,
        'chunk_weight': chunk_weight,
    }
    # Leave the library default in place unless the caller chose a value.
    if keyword_chunk_weight is not None:
        search_kwargs['keyword_chunk_weight'] = keyword_chunk_weight

    results = rag.process_query(query, **search_kwargs)

    print(f"\nHybrid Query Results: \n ***  {query}")
    for rank, result in enumerate(results, start=1):
        print(f"\nResult {rank} Score: {result['score']:.4f}: Original_score: {result['original_score']:.4f}")
        print(f"Document ID: {result.get('doc_id', '')}")
        print(f"Type of searching: {result.get('result_type', '')}")
        print(f"Keyword: {result.get('keyword','')}")
        # Only the first 200 characters of the text are shown.
        print(f"Text: {result['text'][:200]}...")

In [11]:
import pandas as pd

# Load the question/answer table from CSV (path is relative to the notebook's
# working directory).
df_ground_true = pd.read_csv("../data/data_rag/QA_ai4mat_2articles.csv")

# Plain Python lists taken from the 'question' and 'answer' columns.
questions = df_ground_true['question'].values.tolist()
answers = df_ground_true['answer'].values.tolist()

In [12]:
# Print the retrieval report for every question in the evaluation set.
for question in questions:
    query_searching(question)

Normalized weights: keyword=0.5714285714285715, chunk=0.14285714285714288, keyword_chunk=0.28571428571428575
Retrieving: 1 keyword results, 1 chunk results, 1 keyword-chunk results
Searching keyword database with 2 query terms...
Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 Score: 0.1409: Original_score: 0.9864
Document ID: s41524-023-01062-z.pdf
Type of searching: chunk
Keyword: None
Text: processing in the mineral world. Mineral. Mag. 77, 275 –326 (2013).
44. Ong, S. P. et al. Python Materials Genomics (pymatgen): A robust, open-source
python library for materials analysis. Comput. Mat...

Result 2 Score: 0.0243: Original_score: 0.0425
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: materials
Text: a separate feature, which we call eos parity. fig. 2 the transition from full to sparse representation for an example mos 2supercell. a a full mos 2s

Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * How to browse the dataset?

Result 1 Score: 0.1377: Original_score: 0.9636
Document ID: s41699-023-00369-1.pdf
Type of searching: chunk
Keyword: None
Text: edge about the amplitude of the interaction between the defects
for each defect type. The formation energy for V3 defects
composed of one Mo vacancy and two S vacancies span the
Fig. 1 Machine learnin...

Result 2 Score: 0.0224: Original_score: 0.0393
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: dataset
Text: a separate feature, which we call eos parity. fig. 2 the transition from full to sparse representation for an example mos 2supercell. a a full mos 2structure with one mo →w, two s→se...

Result 3 Score: -0.0000: Original_score: -0.0001
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: resonance of the electron wave and the honeycomb lattice, the
wav

Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * What was the procedure for SRGNN quality evaluation?

Result 1 Score: 0.1381: Original_score: 0.9667
Document ID: s41524-023-01062-z.pdf
Type of searching: chunk
Keyword: None
Text: represented by a tuple ðvs
k;vr
k;ekÞ, where the superscripts s,r
denote the sender and the receiver nodes respectively. The
global state vector u2Rdurepresents the global state of the
system. In the ...

Result 2 Score: 0.0250: Original_score: 0.0437
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: quality
Text: a separate feature, which we call eos parity. fig. 2 the transition from full to sparse representation for an example mos 2supercell. a a full mos 2structure with one mo →w, two s→se...

Result 3 Score: -0.0006: Original_score: -0.0022
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: resonance of the electron wave and the 

Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * Can SRGNN be developed on a CPU, not a GPU?

Result 1 Score: 0.1388: Original_score: 0.9717
Document ID: s41524-023-01062-z.pdf
Type of searching: chunk
Keyword: None
Text: Government of the Russian Federation (Agreement No. 70-2021-00143 dd. 01.11.2021,
IGK 000000D730321P5Q0002). P.H. acknowledges the the supports of the National Key
Research and Development Program (20...

Result 2 Score: 0.0215: Original_score: 0.0376
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: cpu
Text: the next generation of ( opto - ) electronic devices and other high technology applications. one of the most exciting characteristics of 2d crystals is the ability to tune their properties via control...

Result 3 Score: 0.0009: Original_score: 0.0032
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: resonance of the electron wave and t

Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * How many structures are needed for SRGNN training?

Result 1 Score: 0.1399: Original_score: 0.9792
Document ID: s41524-023-01062-z.pdf
Type of searching: chunk
Keyword: None
Text: on the defects to conserve space. Drawings of example structures with 8 × 8 supercells are available in Supplementary Fig. 1.N. Kazeev et al.
5
Published in partnership with the Shanghai Institute of ...

Result 2 Score: 0.0299: Original_score: 0.0523
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: many
Text: a separate feature, which we call eos parity. fig. 2 the transition from full to sparse representation for an example mos 2supercell. a a full mos 2structure with one mo →w, two s→se...

Result 3 Score: -0.0003: Original_score: -0.0010
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: resonance of the electron wave and the honey

Searching chunk database...
Searching keyword-chunk database...
Retrieved 3 results

Hybrid Query Results: 
 ***  * Can pre-trained SRGNN be used out-of-the-box?

Result 1 Score: 0.1398: Original_score: 0.9784
Document ID: s41524-023-01062-z.pdf
Type of searching: chunk
Keyword: None
Text: e0
k¼ϕevs
i;vr
i;ek;u/C0/C1
: (5)
Node update rule aggregates the information from all the edges
Evi¼fe0
kje0
k2neighbors ðviÞgconnected to the node vi, the
node itself viand the global state u. We ca...

Result 2 Score: 0.0284: Original_score: 0.0497
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword
Keyword: used
Text: a separate feature, which we call eos parity. fig. 2 the transition from full to sparse representation for an example mos 2supercell. a a full mos 2structure with one mo →w, two s→se...

Result 3 Score: 0.0006: Original_score: 0.0020
Document ID: s41699-023-00369-1.pdf
Type of searching: keyword_chunk
Keyword: None
Text: resonance of the electron wave and the honeycomb la

In [16]:
def query_searching(query):
    """Run ``rag.process_query`` in ``'keywords_only'`` mode and print the hits.

    Defining this rebinds the name ``query_searching``, so any call made after
    this cell runs uses this version rather than the earlier hybrid one.
    Uses the notebook-level ``rag`` object. Returns None; the report goes to
    stdout.

    NOTE(review): the header line still reads "Hybrid Query Results" although
    the mode is 'keywords_only', and the two weights are still passed —
    whether they have any effect in this mode cannot be told from here.
    """
    search_options = {
        'top_k': 3,
        'mode': 'keywords_only',
        'keyword_weight': 0.8,
        'chunk_weight': 0.2,
    }
    hits = rag.process_query(query, **search_options)

    print(f"\nHybrid Query Results: \n ***  {query}")
    for rank, hit in enumerate(hits, start=1):
        print(f"\nResult {rank} (Score: {hit['score']:.4f}):")
        print(f"Document ID: {hit['doc_id']}")
        print(f"Page: {hit['page_num']}")
        print(f"Keywords: {', '.join(hit['keywords'])}")
        print(f"Relevant terms: {', '.join(hit['relevant_terms'])}")
        # Only the first 200 characters of the chunk are shown.
        print(f"Text: {hit['chunk_text'][:200]}...")

In [17]:
# Print the retrieval report for every evaluation question, using whichever
# `query_searching` definition was executed most recently.
for question in questions:
    query_searching(question)

Query terms: ['materials', 'dataset']

Hybrid Query Results: 
 ***  * Which materials are in the dataset?

Result 1 (Score: 0.1112):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: electrons, function, homo, atoms, site, lattice, mos, vacancies, wave, vacancy, wave function, sites, atomic, states
Relevant terms: 
Text: MoS
2structure, the atomic charge of S gained from Mo atoms was
calculated to be ~0.6 electrons. It is calculated to be less than 0.5
electrons for S atoms nearest to the Mo vacancy due to thebreaking...

Result 2 (Score: 0.1031):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: interaction, function, electron, pristine, electrons, atoms, lattice, energy, states, fig, wave function, wave, sites, atomic, vacancy
Relevant terms: 
Text: resonance of the electron wave and the honeycomb lattice, the
wave function of Mo vacancy has nodes at the Mo sites, where itchanges sign
36. This means that this wave function demonstrates
oscillator...

Result 3 (Score: 0.0236):


Hybrid Query Results: 
 ***  * What are the limitations of SRGNN?

Result 1 (Score: 0.1503):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: interaction, function, electron, pristine, electrons, atoms, lattice, energy, states, fig, wave function, wave, sites, atomic, vacancy
Relevant terms: 
Text: resonance of the electron wave and the honeycomb lattice, the
wave function of Mo vacancy has nodes at the Mo sites, where itchanges sign
36. This means that this wave function demonstrates
oscillator...

Result 2 (Score: 0.1152):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: electrons, function, homo, atoms, site, lattice, mos, vacancies, wave, vacancy, wave function, sites, atomic, states
Relevant terms: 
Text: MoS
2structure, the atomic charge of S gained from Mo atoms was
calculated to be ~0.6 electrons. It is calculated to be less than 0.5
electrons for S atoms nearest to the Mo vacancy due to thebreaking...

Result 3 (Score: 0.0287):
Document ID: s41699-023-00369-1.pdf
Pa

Query terms: ['srgnn', 'material', 'quality', 'property']

Hybrid Query Results: 
 ***  * What was the quality of SRGNN for each property and material?

Result 1 (Score: 0.1253):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: interaction, function, electron, pristine, electrons, atoms, lattice, energy, states, fig, wave function, wave, sites, atomic, vacancy
Relevant terms: 
Text: resonance of the electron wave and the honeycomb lattice, the
wave function of Mo vacancy has nodes at the Mo sites, where itchanges sign
36. This means that this wave function demonstrates
oscillator...

Result 2 (Score: 0.0933):
Document ID: s41699-023-00369-1.pdf
Page: 6
Keywords: electrons, function, homo, atoms, site, lattice, mos, vacancies, wave, vacancy, wave function, sites, atomic, states
Relevant terms: 
Text: MoS
2structure, the atomic charge of S gained from Mo atoms was
calculated to be ~0.6 electrons. It is calculated to be less than 0.5
electrons for S atoms nearest to the Mo vacancy du