In [None]:
# Import required libraries for document processing and AI interactions

import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Name of the OpenAI chat model that generates the answers
MODEL = "gpt-4o-mini"

# Directory under which the Chroma vector store is persisted
db_name = "personal_vector_db"

In [None]:
# Read the .env file into the process environment; override=True lets values
# from .env replace variables that are already exported in the shell.
load_dotenv(override=True)

# Fail fast with an actionable message when the key is missing or empty.
# os.getenv reads os.environ, so once this check passes the key is already in
# the environment and does not need to be assigned back (assigning a missing
# key would raise "TypeError: str expected, not NoneType").
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in your shell before running this notebook."
    )

In [None]:
# Collect the immediate sub-folders of 'personal-knowledge-base/'. The folder
# name is later used as the document type of everything loaded from it.
# Plain files sitting directly in the knowledge base (e.g. a README) are
# skipped, because DirectoryLoader raises when handed a path that is not a
# directory. Sorting keeps the load order stable between runs.
folders = sorted(
    path
    for path in glob.glob("personal-knowledge-base/*")
    if os.path.isdir(path)
)

def add_metadata(doc, doc_type):
    """Tag ``doc`` with its document type and hand the same object back.

    The value is stored under the ``"doc_type"`` key of ``doc.metadata``.
    The document is modified in place, not copied.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# Keyword arguments handed to every TextLoader: read the files as UTF-8
text_loader_kwargs = dict(encoding='utf-8')
# Alternative to try if reading with a fixed encoding does not work:
# text_loader_kwargs={'autodetect_encoding': True}

In [None]:

# Load every Markdown file below each folder and label it with that folder's name
documents = []
for folder in folders:
    # Recursive search for *.md files, each read through TextLoader
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    # The last path component of the folder serves as the document type
    doc_type = os.path.basename(folder)
    documents += (add_metadata(doc, doc_type) for doc in folder_docs)


In [None]:
# Chunking configuration: target chunk size of 1000 characters, with
# 200 characters of overlap between consecutive chunks
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

# Short report: how many chunks were produced and which document types were loaded
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")

In [None]:
# Embedding model used to vectorise the chunks
embeddings = OpenAIEmbeddings()

# Rebuild from scratch: when a store from an earlier run is present on disk,
# drop its collection before creating the new one
if os.path.exists(db_name):
    Chroma(embedding_function=embeddings, persist_directory=db_name).delete_collection()

# Embed all chunks and persist the resulting store under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Retriever view over the vector store, used to look up relevant chunks
retriever = vectorstore.as_retriever()

# Chat model; temperature 0.7 allows some variability in the responses
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Buffer holding the chat history under the 'chat_history' key so the chain
# has conversational context.
# NOTE: this memory class is expected to be deprecated soon.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Tie the language model, the retriever and the memory together
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:

# Smoke test to confirm the chain works end to end. The knowledge base was
# given a list of today's tasks, so this question is an easy one to check.
query = "what tasks do I have to do today?"
result = conversation_chain.invoke(dict(question=query))
print(result["answer"])



In [None]:
# Clear the conversation history: create an empty memory buffer and rebuild
# the chain around it so the chat UI does not inherit the test exchange
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Gradio's chat interface needs a handler of the form chat(question, history)
def chat(question, history):
    """Answer ``question`` through the module-level ``conversation_chain``.

    ``history`` is accepted only to match the handler signature Gradio
    expects; it is not read here.

    Returns the ``"answer"`` entry of the chain's result.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Build the chat UI around chat() and open it in a new browser window
(
    gr.ChatInterface(fn=chat, type="messages")
    .launch(inbrowser=True)
)