In [32]:
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
import os
import shutil
from google.colab import userdata
from langchain.prompts import ChatPromptTemplate
import argparse
from langchain_openai import ChatOpenAI

# Read the OpenAI key from Colab's secret store once, then export it as an
# environment variable; later cells build OpenAIEmbeddings()/ChatOpenAI()
# without passing a key explicitly.
openai_api_key = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key

In [23]:
# Directory holding the source files used for retrieval-augmented generation.
DATA_PATH = "/content/sample_data/rag_data"


def load_documents(data_path=None, glob="*.pdf"):
    """Load every file matching ``glob`` in a directory as LangChain documents.

    Args:
        data_path: Directory to read from. ``None`` (the default) means the
            module-level ``DATA_PATH``, looked up at call time so that
            re-assigning ``DATA_PATH`` in a later cell still takes effect.
        glob: Filename pattern handed to ``DirectoryLoader``. Defaults to
            ``"*.pdf"``.

    Returns:
        Whatever ``DirectoryLoader(...).load()`` returns for the matched files.
    """
    if data_path is None:
        data_path = DATA_PATH
    return DirectoryLoader(data_path, glob=glob).load()

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter that breaks each document into overlapping chunks; the chunks are
# what gets embedded and stored in the vector database further down.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # target maximum chunk length, measured by length_function
    chunk_overlap=500,     # overlap carried over between consecutive chunks
    length_function=len,   # measure length in characters
    add_start_index=True,  # record each chunk's start offset in its metadata
)

# Load every source document and split it into chunks.
documents = load_documents()
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced from how many documents.
print(f"Number of chunks: {len(chunks)} and Number of Documents: {len(documents)}")

Number of chunks: 126 and Number of Document


In [27]:
# Embed the chunks and write them to a Chroma database on disk.
CHROMA_PATH = "/content/chroma_db"

# Delete any database directory left over from an earlier run so the store
# is rebuilt from scratch.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

db = Chroma.from_documents(
    persist_directory=CHROMA_PATH,
    embedding=OpenAIEmbeddings(),
    documents=chunks,
)

db.persist()
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")

Saved 126 chunks to /content/chroma_db


In [28]:
# Prompt template used for every query sent to the model. It has two
# placeholders that get_response() fills in:
#   {context}  - the text of the retrieved chunks
#   {question} - the user's question

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [35]:
def get_response(query_text, k=3, min_relevance=0.7):
    """Answer ``query_text`` from the persisted Chroma store and print the result.

    The ``k`` most similar chunks are retrieved, joined into one context
    block, substituted into ``PROMPT_TEMPLATE`` and sent to the chat model.
    The filled-in prompt and the model's answer (with the chunks' sources)
    are printed.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve. Defaults to 3.
        min_relevance: Relevance score the first result must reach for the
            query to be answered. Defaults to 0.7.

    Returns:
        None. When nothing is retrieved, or the first result scores below
        ``min_relevance``, a notice is printed and the model is not called.
    """
    # Open the database that was persisted at CHROMA_PATH.
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Retrieve (document, score) pairs. Only the first pair's score is checked
    # against the threshold (assumes results come back best-first - TODO confirm).
    results = db.similarity_search_with_relevance_scores(query_text, k=k)
    if not results or results[0][1] < min_relevance:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    # invoke() replaces the deprecated predict(); it returns a message object
    # whose .content holds the reply text.
    model = ChatOpenAI()
    response_text = model.invoke(prompt).content

    sources = [doc.metadata.get("source") for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [36]:
# Try the pipeline end to end on a sample question.
get_response(query_text="What is constituted as cheating on assignemnts?")

Human: 
Answer the question based only on the following context:

Violations are listed in alphabetical order; the order of violations is not indicative of the seriousness of each violation. Repeated violations, multiple violations, or the severity of the misconduct may heighten the university’s response, which could include suspension or expulsion from the university and/or cancellation of the Residence Hall and Dining License Agreement.

Where permitted by applicable law and university policy, the university reserves the right to notify parents when a student has been referred to OSCCR.

ACADEMIC INTEGRITY

As deﬁned in the Academic Integrity Policy (https://catalog.northeastern.edu/handbook/policies-regulations/academic-integrity/).

AIDING AND ABETTING

Knowingly assisting with or cooperating in an act or action that violates the Code. A student may be held responsible as though the student were a direct participant in the violation, even if information indicates the student was no