In [1]:
import os
import shutil
import tempfile

import requests
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAI, OpenAIEmbeddings

import mlflow

assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable."

In [2]:
def fetch_federal_document(url, div_class):  # noqa: D417
    """
    Scrapes the transcript of the Act Establishing Yellowstone National Park from the given URL.

    Args:
    url (str): URL of the webpage to scrape.

    Returns:
    str: The transcript text of the Act.
    """
    # Sending a request to the URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parsing the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Finding the transcript section by its HTML structure
        transcript_section = soup.find("div", class_=div_class)
        if transcript_section:
            transcript_text = transcript_section.get_text(separator="\n", strip=True)
            return transcript_text
        else:
            return "Transcript section not found."
    else:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"
    

def fetch_and_save_documents(url_list, doc_path):
    """
    Fetches documents from given URLs and saves them to a specified file path.

    Args:# Clean up our temporary directory that we created with our FAISS instance
shutil.rmtree(temporary_directory)
        url_list (list): List of URLs to fetch documents from.
        doc_path (str): Path to the file where documents will be saved.
    """
    for url in url_list:
        document = fetch_federal_document(url, "col-sm-9")
        with open(doc_path, "a") as file:
            file.write(document)


def create_faiss_database(document_path, database_save_directory, chunk_size=500, chunk_overlap=10):
    """
    Creates and saves a FAISS database using documents from the specified file.

    Args:

retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=vector_db.as_retriever())


# Log the retrievalQA chain
def load_retriever(persist_directory):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.load_local(
        document_path (str): Path to the file containing documents.
        database_save_directory (str): Directory where the FAISS database will be saved.
        chunk_size (int, optional): Size of each document chunk. Default is 500.
        chunk_overlap (int, optional): Overlap between consecutive chunks. Default is 10.

    Returns:
        FAISS database instance.
    """
    # Load documents from the specified file
    document_loader = TextLoader(document_path)
    raw_documents = document_loader.load()

    # Split documents into smaller chunks with specified size and overlap
    document_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    document_chunks = document_splitter.split_documents(raw_documents)

    # Generate embeddings for each document chunk
    embedding_generator = OpenAIEmbeddings()
    faiss_database = FAISS.from_documents(document_chunks, embedding_generator)

    # Save the FAISS database to the specified directory
    faiss_database.save_local(database_save_directory)

    return faiss_database

In [11]:
temporary_directory = tempfile.mkdtemp()

# doc_path = os.path.join(temporary_directory, "docs.txt")
doc_path = "local_text/paper.txt"
persist_dir = os.path.join(temporary_directory, "faiss_index")

print(doc_path)
url_listings = [
    "https://www.archives.gov/milestone-documents/act-establishing-yellowstone-national-park#transcript",
    "https://www.archives.gov/milestone-documents/sherman-anti-trust-act#transcript",
]

# fetch_and_save_documents(url_listings, doc_path)

vector_db = create_faiss_database(doc_path, persist_dir)

Created a chunk of size 1701, which is longer than the specified 500
Created a chunk of size 865, which is longer than the specified 500
Created a chunk of size 1327, which is longer than the specified 500
Created a chunk of size 1495, which is longer than the specified 500
Created a chunk of size 2024, which is longer than the specified 500
Created a chunk of size 971, which is longer than the specified 500
Created a chunk of size 680, which is longer than the specified 500
Created a chunk of size 855, which is longer than the specified 500
Created a chunk of size 737, which is longer than the specified 500
Created a chunk of size 1842, which is longer than the specified 500
Created a chunk of size 1267, which is longer than the specified 500
Created a chunk of size 517, which is longer than the specified 500
Created a chunk of size 590, which is longer than the specified 500
Created a chunk of size 665, which is longer than the specified 500
Created a chunk of size 1169, which is lon

local_text/paper.txt


In [12]:
mlflow.set_experiment("Legal RAG")

retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=vector_db.as_retriever())


# Log the retrievalQA chain
def load_retriever(persist_directory):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.load_local(
        persist_directory,
        embeddings,
        allow_dangerous_deserialization=True,  # This is required to load the index from MLflow
    )
    return vectorstore.as_retriever()


with mlflow.start_run() as run:
    model_info = mlflow.langchain.log_model(
        retrievalQA,
        name="retrieval_qa",
        loader_fn=load_retriever,
        persist_dir=persist_dir,
    )

def print_formatted_response(response_list, max_line_length=80):
    """
    Formats and prints responses with a maximum line length for better readability.

    Args:
    response_list (list): A list of strings representing responses.
    max_line_length (int): Maximum number of characters in a line. Defaults to 80.
    """
    for response in response_list:
        words = response.split()
        line = ""
        for word in words:
            if len(line) + len(word) + 1 <= max_line_length:
                line += word + " "
            else:
                print(line)
                line = word + " "
        print(line)

In [13]:
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

2025/07/15 11:24:48 INFO mlflow.tracking.fluent: Active model is set to the logged model with ID: m-503495f85042481c825464093cd20245
2025/07/15 11:24:48 INFO mlflow.tracking.fluent: Use `mlflow.set_active_model` to set the active model to a different one if needed.


In [15]:
answer1 = loaded_model.predict([{"query": "What does the document say?"}])

print_formatted_response(answer1)

I do not know. 


In [16]:
answer1 = loaded_model.predict([{"query": "Explain GCPATR"}])

print_formatted_response(answer1)

GCPATr is a transformer architecture that incorporates both graph convolution 
and position awareness. It uses a multi-head self-attention mechanism and 
incorporates graph convolution operations in critical places in its 
architecture. Its overall processing pipeline and the position aware 
self-attention block are depicted in Figure \ref{Overall_GCPATr_Pipeline}. 


In [17]:
answer1 = loaded_model.predict([{"query": "What is the reward function?"}])

print_formatted_response(answer1)

The reward function is defined as the decrease in the moving average of mean 
square error, penalized by the communication costs incurred during the round. 


In [19]:
answer1 = loaded_model.predict([{"query": "How does communication cost affect the results?"}])

print_formatted_response(answer1)

The communication cost affects the results by penalizing the amount of 
communication resources used in achieving the model training. This can lead to 
improved system efficiency by lowering the validation error and using minimal 
communication resources. 


In [21]:
answer1 = loaded_model.predict([{"query": "How does reinforcement learning affect the results?"}])

print_formatted_response(answer1)

The use of reinforcement learning in the proposed framework has shown to 
improve the final RUL prediction error and reduce communication costs, as 
evidenced by the results presented in the tables and figures. It also enables 
better exploration and convergence speed, as seen in the smoother convergence 
plots. However, further research is needed to optimize the value function and 
explore the performance of different reinforcement learning algorithms in noisy 
communication channels. 


In [10]:
print(retrievalQA.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [11]:
print(retrievalQA.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [None]:
# Clean up our temporary directory that we created with our FAISS instance
shutil.rmtree(temporary_directory)

In [13]:
temporary_directory

'/tmp/tmpoywmkr6k'