Libraries

In [1]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores.chroma import Chroma
from sklearn.metrics.pairwise import cosine_similarity
# from langchain_community.vectorstores import DocArrayInMemorySearch
# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_pinecone import PineconeVectorStore
import os
import shutil
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from pinecone import Pinecone
from pinecone.exceptions import PineconeException
import time
from langchain_openai import OpenAIEmbeddings

Paths to the source documents

In [2]:
# Paths to the input document folders (relative path strings).
# generate_data() loads each folder with the glob pattern noted beside it.
DATA_PATH1 = "fallout_content/PDFs"  # loaded with "*.pdf"
DATA_PATH2 = "fallout_content/Nukapedia"  # loaded with "*.txt"
DATA_PATH3 = "fallout_content/YouTube_oxhorn"  # loaded with "*.txt"
DATA_PATH4 = "fallout_content/YouTube_spanish"  # loaded with "*.txt"

In [3]:
# Configuration: credentials, embedding model, chat model, parser and prompt.
from dotenv import load_dotenv

# Load the .env file into the process environment FIRST, so that every client
# constructed below can find its API key.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Created only after load_dotenv(): no key is passed explicitly here, so the
# embeddings client has to pick OPENAI_API_KEY up from the environment.
embeddings = OpenAIEmbeddings()

# prompt, model, parser
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
parser = StrOutputParser()

# {context} receives the retrieved documents and {question} the user's query.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

Loading and splitting the documents

In [4]:
# Read every file under `path` whose name matches the glob `extension`.
def load_documents(path, extension):
    """Return the documents DirectoryLoader finds in `path` for glob `extension`.

    Args:
        path: Directory to load from.
        extension: Glob pattern handed to DirectoryLoader (e.g. "*.txt").

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory/pattern.
    """
    return DirectoryLoader(path, glob=extension).load()

In [5]:
def split_text(
    documents: list[Document],
    chunk_size: int = 300,
    chunk_overlap: int = 100,
) -> list[Document]:
    """Split documents into overlapping chunks and report the counts.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            300, the value that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 100, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record where each chunk starts in its source text.
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    return chunks

In [6]:
# def vectors_pinecone(embeddings, chunks):
#     index_name = "fallout"
#     pinecone = PineconeVectorStore.from_documents(
#     chunks, embeddings, index_name=index_name
# )

In [7]:
def generate_data():
    """Load all configured source folders and return the corpus as chunks.

    Folders are read in a fixed order (DATA_PATH2, DATA_PATH1, DATA_PATH3,
    DATA_PATH4), concatenated into one list, and passed to split_text().
    """
    sources = (
        (DATA_PATH2, "*.txt"),
        (DATA_PATH1, "*.pdf"),
        (DATA_PATH3, "*.txt"),
        (DATA_PATH4, "*.txt"),
    )

    documents = []
    for folder, pattern in sources:
        documents.extend(load_documents(folder, pattern))

    return split_text(documents)

In [8]:
# Load and chunk the whole corpus. The recorded run of this cell reported
# 462 documents split into 71350 chunks, so every later step works on
# tens of thousands of chunks.
chunks = generate_data()

Split 462 documents into 71350 chunks.


In [11]:
# Embed every chunk and store it in the Pinecone index.
# NOTE(review): each run of this cell sends all chunks through the embedding
# model and writes them to the index again — confirm that a re-run does not
# duplicate vectors before executing it a second time.
try:
    # NOTE(review): this client object is not passed to PineconeVectorStore
    # below; from_documents appears to open its own connection using
    # PINECONE_API_KEY from the environment — confirm whether it is still needed.
    pinecone = Pinecone(api_key=PINECONE_API_KEY, timeout=60)  # Adjust timeout as needed
    index_name = "fallout2"  # Define your Pinecone index name
    index = PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)

    print(f"Successfully indexed documents in {index_name}")
except PineconeException as e:
    print(f"An error occurred with Pinecone: {e}")
    index = None  # Do not leave a stale index from an earlier run in scope
    # Re-raise: the following cells need a real index, and stopping here is
    # clearer than an AttributeError on None further down.
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    index = None  # Do not leave a stale index from an earlier run in scope
    raise



Successfully indexed documents in fallout2


In [12]:
# Question-answering pipeline: the retriever supplies {context}, the raw input
# is passed through as {question}, then prompt -> chat model -> output parser.
chain = RunnableParallel(
    context=index.as_retriever(),
    question=RunnablePassthrough(),
) | prompt | model | parser

# Smoke test; the bare expression makes the answer this cell's output.
chain.invoke("Who is Harold?")

'Harold is a tree-like being who has undergone a spectacular mutation and goes by many names such as The Lord, Him, The One Who Grows, Gives, and Guides, and The Talking Tree.'

In [13]:
# Second ad-hoc query; the bare expression makes the answer this cell's output.
# NOTE(review): the recorded answer talks about "Mother Curie III" and the
# Prophet of Atom, which may not be the character the question is about —
# verify what the retriever returns for this query.
chain.invoke("How to make Curie human?")

'To make Curie human, you need to continue to talk to Mother Curie III until you have the option to ask her “What makes you think you’re right?” Curie will then recall a dream about the Prophet of Atom. You can pretend to be the Prophet of Atom, and if you are less than critically irradiated, she will not believe you. To convince Curie that you are the Prophet of Atom, you need to be critically irradiated.'

In [None]:
# Thin wrapper around the notebook-level `chain`, used by the question loop.
def ask_question(question):
    """Run `question` through `chain` and return whatever the chain produces."""
    answer = chain.invoke(question)
    return answer

# Main loop: keep answering questions until the user asks to stop.
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " are still
        # recognised as the exit command.
        user_question = input("Please enter your question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # A closed or interrupted prompt ends the session without a traceback.
        print("Exiting the question loop. Goodbye!")
        break

    # Check if the user wants to exit the loop
    if user_question.lower() == 'exit':
        print("Exiting the question loop. Goodbye!")
        break

    # A blank line carries no question: prompt again instead of calling the chain.
    if not user_question:
        continue

    # Invoke the chain with the user's question and print the result
    response = ask_question(user_question)
    print(response)

Please enter your question (or type 'exit' to quit):  who is Maximus?


Maximus is a rookie soldier who is a low-ranking member of the Brotherhood of Steel.


Please enter your question (or type 'exit' to quit):  who is the brotherhood of steel?


The Brotherhood of Steel is a quasi-religious technocratic military order founded by members of the United States Armed Forces and the government-sponsored scientific community.
