### Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines


In [2]:
#!pip install openai==0.28.1
#!pip install openai --upgrade
#!pip install ragas
#!pip install unstructured
#!pip install langchain[all]
#!pip install --upgrade langchain

#!pip install playwright
#!pip install -U selenium unstructured
#!pip install --upgrade langchain langchain-community langchainhub langchain-openai langchain-chroma bs4

In [5]:
#!pip install pydantic==2.5
#!pip install langchain-experimental==0.0.57
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=a8bf0301e1f28e7ecbbf9c999ba615bf5f506cd408f9aaefbcb74426293776d8
  Stored in directory: /home/oleg/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [1]:
import os, json
#import openai
#from langchain.chat_models import ChatOpenAI, ChatGooglePalm
from utils import OPENAI_API_KEY

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY 
#os.environ["LANGCHAIN_TRACING_V2"] = "true"

#openai.api_key = os.environ['OPENAI_API_KEY']

In [24]:
import bs4
from langchain import hub
from langchain_community.document_loaders import SeleniumURLLoader, TextLoader
from langchain.document_loaders import WikipediaLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders.merge import MergedDataLoader


from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


In [3]:
llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0.)
emb_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [66]:

def get_questioins(path):
    loader = TextLoader(path)
    docs = loader.load()
    texts = docs[0].page_content.split('\n')
    questions = []
    for q in  texts:
        if "?" in q:
            questions.append(q)
    return questions

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def load_url_documets(list_urls, wiki=None):
    
    # Load, chunk and index the contents of the blog.
    
    loader_url =SeleniumURLLoader( list_urls) # URL Loader
    # Wikipedia Loader
    if  wiki is not None:
        if type(wiki) == str:
            loader_wiki = WikipediaLoader(query=query)
            loader = MergedDataLoader([loader_url, loader_wiki])
        elif type(wiki) == list:
            loader_wiki = []
            for q in wiki: 
                loader_wiki.append(WikipediaLoader(query=q))
            loader = MergedDataLoader([loader_url] + loader_wiki) # Merged Loader
    else:
        loader = loader_url
    
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    #text_splitter = SemanticChunker(emb_model) 
    splits = text_splitter.split_documents(docs)
    
    # Retrieve and generate using the relevant snippets of the blog.
    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())   
    retriever = vectorstore.as_retriever()
    return retriever, docs

In [67]:
qa_dict = {}

url_list = ["https://www.nature.com/articles/s41524-023-01062-z",
            "https://www.nature.com/articles/s41699-023-00369-1",
            "https://github.com/HSE-LAMBDA/ai4material_design/tree/main/docs/CONSTRUCTOR-MOCK.md"
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/CONSTRUCTOR.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/DATA.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/ENVIRONMENT.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/GENERATING-CONSTRUCTOR.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/GENERATING-MOCK.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/PILOT.md",
            "https://github.com/HSE-LAMBDA/ai4material_design/blob/main/docs/SPARSE-PAPER.md"
          #  "https://www.nature.com/articles/s41377-024-01407-3",
          #  "https://www.nature.com/articles/s41565-023-01407-1",
          #  "https://www.nature.com/articles/s41699-023-00369-1",
           ]
                               
retriever, documents = load_url_documets(url_list, ["Density Functional Theory",  "Graph NN"])

In [68]:
len(documents)

58

In [69]:
### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

### Answer question ###
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [70]:
### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [71]:
from langchain.document_loaders import TextLoader
ques = get_questioins("data/data_rag/Sparse representation - questions.txt")

In [72]:
qa_dict = {}

for q in ques:
    answer = conversational_rag_chain.invoke(
              {"input": q},
               config={"configurable": {"session_id": "abc123"}
              },  # constructs a key "abc123" in `store`.
             )["answer"]
    qa_dict[q] = answer

In [73]:
l_answer, l_question = [], []

for k,v in qa_dict.items():
    print(k)
    print()
    print(v)
    print("**********************************")
    print()
    l_answer.append(v)
    l_question.append(k)
        

* Which materials are in the dataset?

The dataset includes materials with and without defects, specifically focusing on TMDCs (transition metal dichalcogenides) and six represented 2D materials. It is based on high throughput DFT (Density Functional Theory) calculations.
**********************************

* How many structures are there in the dataset?

The dataset contains a total of 14,866 configurations, comprising 11,866 defect configurations in TMDCs and 3,000 configurations in six represented 2D materials.
**********************************

* How to obtain the dataset?

I don't have the specific details on how to obtain the dataset from the provided context. Typically, datasets like these might be accessible through academic publications, associated data repositories, or directly contacting the authors of the studies mentioned.
**********************************

* What is the dataset license?

The provided context does not specify the license under which the dataset is distri

In [74]:
name = "9docs&chatHistory&wiki_gpt4"
with open(f'data/data_rag/qa_dict_{name}.json', 'w') as fp:
    json.dump(qa_dict, fp)

In [75]:
for m in conversational_rag_chain.get_session_history("abc123").messages:
    
    if m.type == 'human':
        print('Human: ' + m.content)
    elif  m.type == 'ai':
        print('AI: ' + m.content)
        print()

Human: * Which materials are in the dataset?
AI: The dataset includes materials with and without defects, specifically focusing on TMDCs (transition metal dichalcogenides) and six represented 2D materials. It is based on high throughput DFT (Density Functional Theory) calculations.

Human: * How many structures are there in the dataset?
AI: The dataset contains a total of 14,866 configurations, comprising 11,866 defect configurations in TMDCs and 3,000 configurations in six represented 2D materials.

Human: * How to obtain the dataset?
AI: I don't have the specific details on how to obtain the dataset from the provided context. Typically, datasets like these might be accessible through academic publications, associated data repositories, or directly contacting the authors of the studies mentioned.

Human: * What is the dataset license?
AI: The provided context does not specify the license under which the dataset is distributed. Dataset licenses can vary, often aiming to define how the 

In [76]:
import pandas as pd


l_answer, l_question = [], []

for m in conversational_rag_chain.get_session_history("abc123").messages:
    
    if m.type == 'human':
        l_question.append(m.content)
    elif  m.type == 'ai':
        l_answer.append(m.content)

df = pd.DataFrame()
df['question'] = l_question
#df['Content'] = doc_url[0].metadata['description']
df['answer'] = l_answer
df['number of documets'] = len(documents)
#for i in range(len(documents)):
    #df[f'Content_{i}'] = documents[i].metadata['description']
    #df[f'Titles_{i}'] = documents[i].metadata['title']
    #df[f'Urls_{i}'] = documents[i].metadata['source']

df.to_csv(f"data/data_rag/{name}.csv", index=False)

In [41]:
answer = conversational_rag_chain.invoke(
              {"input": "Is there any relaion between defects and HUM0-LUMO gap?"},
               config={"configurable": {"session_id": "abc123"}
              },  # constructs a key "abc123" in `store`.
             )["answer"]
answer

"Yes, there is a significant relationship between defects and the HOMO-LUMO (Highest Occupied Molecular Orbital - Lowest Unoccupied Molecular Orbital) gap in materials. Defects in a material can introduce localized states within the band gap, which can alter the energy levels of the HOMO and LUMO, thereby affecting the size of the HOMO-LUMO gap. This alteration can influence the electronic, optical, and chemical properties of the material. For instance, the presence of defects can reduce the band gap, making a material more conductive or altering its absorption properties, which is particularly relevant in applications like photovoltaics and semiconductors. The specific impact depends on the nature of the defects and their interaction with the material's electronic structure."

In [42]:
answer = conversational_rag_chain.invoke(
              {"input": "Which defects have more effect on HUM0-LUMO gap?"},
               config={"configurable": {"session_id": "abc123"}
              },  # constructs a key "abc123" in `store`.
             )["answer"]
answer

"The effect of defects on the HOMO-LUMO gap largely depends on the type and concentration of the defects within the material. Generally, deep-level defects, such as vacancy defects or substitutional defects, have a more pronounced effect on the HOMO-LUMO gap compared to shallow-level defects. \n\n- **Vacancy defects**, where atoms are missing from the lattice, can introduce deep states within the band gap that significantly alter the electronic structure and, consequently, the HOMO-LUMO gap.\n- **Substitutional defects**, where one type of atom is replaced by another, can also introduce localized states that affect the band gap depending on the electronic nature of the substituting atom.\n\nThe impact of these defects on the HOMO-LUMO gap is critical because it can change the material's optical and electronic properties, influencing its suitability for applications like semiconductors, photovoltaics, and sensors. The extent of the effect also depends on the defect's energy level relati

In [44]:
answer = conversational_rag_chain.invoke(
              {"input": "Which atoms as defects  are more effective to increase HUM0-LUMO gap?"},
               config={"configurable": {"session_id": "abc123"}
              },  # constructs a key "abc123" in `store`.
             )["answer"]
print(answer)

Atoms that are effective in increasing the HOMO-LUMO gap as defects typically have characteristics that significantly alter the electronic structure of the host material. These include:

1. **Atoms with a High Electronegativity**: Atoms that are more electronegative than the host material atoms can pull the electron density towards themselves, potentially increasing the energy of the lowest unoccupied molecular orbital (LUMO) and thus widening the HOMO-LUMO gap.

2. **Atoms with a Different Number of Valence Electrons**: Atoms that introduce a different number of valence electrons compared to the atoms they replace in the host material can create localized energy states within the band structure. If these states are closer to the conduction band, they can effectively increase the HOMO-LUMO gap.

3. **Atoms that Induce Lattice Strain**: Atoms significantly larger or smaller than the host atoms can induce lattice strain, which can modify the band structure and potentially increase the ba

In [45]:
answer = conversational_rag_chain.invoke(
              {"input": "Which material's crystal  are more perspective to get a materials with increasing HUM0-LUMO gap?"},
               config={"configurable": {"session_id": "abc123"}
              },  # constructs a key "abc123" in `store`.
             )["answer"]
print(answer)

Materials with a crystal structure that allows for significant modification of their electronic properties through defect engineering are particularly perspective for increasing the HOMO-LUMO gap. These include:

1. **Semiconductors with Wide Band Gaps**: Materials like zinc oxide (ZnO) and titanium dioxide (TiO2) have wide band gaps that can be further manipulated through doping or defect introduction, making them suitable for applications requiring high-energy photon absorption or emission.

2. **Two-Dimensional (2D) Materials**: Graphene, transition metal dichalcogenides (TMDCs) like MoS2 and WSe2, and hexagonal boron nitride (h-BN) are highly sensitive to defects due to their two-dimensional nature. Defects or dopants in these materials can significantly alter their electronic and optical properties, including the HOMO-LUMO gap.

3. **Perovskites**: Hybrid organic-inorganic perovskites have tunable band gaps and are promising for photovoltaic applications. Defect engineering in the