Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add URL to Brain #26

Merged
merged 3 commits into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 58 additions & 28 deletions files.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,75 @@
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
import os
from loaders.audio import process_audio
from loaders.txt import process_txt
from loaders.csv import process_csv
from loaders.markdown import process_markdown
from loaders.html import process_html
from utils import compute_sha1_from_content
from loaders.pdf import process_pdf
from loaders.html import get_html, create_html_file, delete_tempfile
import requests
import re
import unicodedata
import tempfile

def file_uploader(supabase, openai_key, vector_store):
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
}
# Dispatch table: file extension (with leading dot) -> loader function.
# filter_file() looks the uploaded file's extension up here and calls the
# matching loader as loader(vector_store, file); the keys are also handed to
# st.file_uploader as the accepted upload types.
file_processors = {
".txt": process_txt,
".csv": process_csv,
".md": process_markdown,
".markdown": process_markdown,
".m4a": process_audio,
".mp3": process_audio,
".webm": process_audio,
".mp4": process_audio,
".mpga": process_audio,
".wav": process_audio,
".mpeg": process_audio,
".pdf": process_pdf,
".html": process_html,
}

files = st.file_uploader("Upload a file", accept_multiple_files=True, type=list(file_processors.keys()))
def file_uploader(supabase, openai_key, vector_store):
    """Render the multi-file upload widget and ingest the chosen files.

    Every uploaded file is handed to ``filter_file`` once the
    "Add to Database" button is pressed. ``filter_file`` already performs the
    duplicate check, the empty-file check and the per-extension dispatch, so
    those checks are not repeated inline here (doing both would handle each
    file twice).

    Args:
        supabase: Client forwarded to ``filter_file`` for the duplicate lookup.
        openai_key: Unused here; kept so the call signature stays unchanged.
        vector_store: Store forwarded to ``filter_file``.
    """
    # The widget must be created before the button so the layout is unchanged.
    files = st.file_uploader(
        "**Upload a file**",
        accept_multiple_files=True,
        type=list(file_processors.keys()),
    )
    if not st.button("Add to Database"):
        return
    if files is None:
        return
    for file in files:
        filter_file(file, supabase, vector_store)

def file_already_exists(supabase, file):
    """Tell whether a document with the same content hash is already stored.

    Args:
        supabase: Client exposing ``table(...).select(...).eq(...).execute()``.
        file: Uploaded file object; its bytes are read with ``getvalue()``.

    Returns:
        ``True`` when the ``documents`` table has at least one row whose
        ``metadata->>file_sha1`` equals the SHA-1 of the file content,
        ``False`` otherwise.
    """
    file_sha1 = compute_sha1_from_content(file.getvalue())
    response = (
        supabase.table("documents")
        .select("id")
        .eq("metadata->>file_sha1", file_sha1)
        .execute()
    )
    # bool() gives the same answer as len(...) > 0 for a list and also copes
    # with a missing (None) payload instead of raising TypeError.
    return bool(response.data)

def filter_file(file, supabase, vector_store):
    """Validate one uploaded file and pass it to the matching loader.

    A status line is written to the Streamlit page for every outcome.

    Args:
        file: Uploaded file object (``name``, ``size`` and ``getvalue()`` are
            used).
        supabase: Client used by ``file_already_exists`` for the duplicate
            lookup.
        vector_store: Store forwarded to the loader function.

    Returns:
        ``True`` when a loader was called for the file; ``False`` when the
        file is a duplicate, is empty, or has an unsupported extension.
    """
    if file_already_exists(supabase, file):
        st.write(f"😎 {file.name} is already in the database.")
        return False

    if file.size < 1:
        st.write(f"💨 {file.name} is empty.")
        return False

    # The dispatch table uses lower-case keys, so normalise the extension;
    # otherwise "REPORT.PDF" would be rejected as an invalid type.
    file_extension = os.path.splitext(file.name)[-1].lower()
    processor = file_processors.get(file_extension)
    if processor is None:
        st.write(f"❌ {file.name} is not a valid file type.")
        return False

    processor(vector_store, file)
    st.write(f"✅ {file.name} ")
    return True

def url_uploader(supabase, openai_key, vector_store):
    """Render the URL input and ingest the page behind the entered URL.

    When the button is pressed the page is downloaded with ``get_html``,
    wrapped into an uploaded-file object by ``create_html_file`` and run
    through ``filter_file``. The temporary file is handed to
    ``delete_tempfile`` afterwards, even if ``filter_file`` raises.

    Args:
        supabase: Client forwarded to ``filter_file``.
        openai_key: Unused here; kept so the call signature stays unchanged.
        vector_store: Store forwarded to ``filter_file``.
    """
    url = st.text_area("## Add an url",placeholder="https://www.quivr.app")
    button = st.button("Add the URL to the database")
    if not button:
        return

    # Text areas easily pick up stray spaces/newlines; a blank value must not
    # reach the HTTP client, which raises on it instead of returning.
    url = (url or "").strip()
    if not url:
        st.write("❌ Please enter a URL.")
        return

    html = get_html(url)
    if not html:
        st.write(f"❌ Failed to access to {url} .")
        return

    st.write(f"Getting content ... {url} ")
    file, temp_file_path = create_html_file(url, html)
    ret = False
    try:
        ret = filter_file(file, supabase, vector_store)
    finally:
        # Always clean up, so a failing loader does not leak the temp file.
        delete_tempfile(temp_file_path, url, ret)

47 changes: 47 additions & 0 deletions loaders/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from .common import process_file
from langchain.document_loaders import UnstructuredHTMLLoader
import requests
import re
import unicodedata
import tempfile
import os
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile

def process_html(vector_store, file):
    """Process an uploaded HTML file through the shared ``process_file`` helper.

    The helper receives the vector store, the file, the
    ``UnstructuredHTMLLoader`` class and the ``".html"`` suffix; its result is
    returned unchanged.
    """
    html_suffix = ".html"
    outcome = process_file(vector_store, file, UnstructuredHTMLLoader, html_suffix)
    return outcome


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None``. Request failures (malformed URL, connection error, timeout)
        are also reported as ``None`` so the caller's failure branch handles
        them instead of the app crashing.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None
    return response.text

def create_html_file(url, content):
    """Write downloaded HTML to a temp file and wrap it as an uploaded file.

    Args:
        url: Source address; its slug (see ``slugify``) plus ``.html`` becomes
            the file name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: an ``UploadedFile`` built
        from the bytes that were written, and the path of the temporary file
        in the system temp directory.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # characters found in arbitrary web pages.
    with open(temp_file_path, "w", encoding="utf-8") as temp_file:
        temp_file.write(content)

    # Read back through a context manager so the handle is closed right away.
    with open(temp_file_path, "rb") as temp_file:
        data = temp_file.read()

    record = UploadedFileRec(id=None, name=file_name, type="text/html", data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path

def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary HTML file and report the outcome on the page.

    Args:
        temp_file_path: Path of the file to delete.
        url: Address shown in the status message.
        ret: Result of the preceding processing step; status messages are
            written only when it is truthy.

    If removal raises ``OSError`` the error is printed and, when ``ret`` is
    truthy, an error line is written instead of the success line.
    """
    try:
        os.remove(temp_file_path)
        if not ret:
            return
        st.write(f"✅ Content saved... {url} ")
    except OSError as err:
        print(f"Error while deleting the temporary file: {str(err)}")
        if not ret:
            return
        st.write(f"❌ Error while saving content... {url} ")

def slugify(text):
    """Turn *text* into a lower-case, hyphen-separated ASCII slug.

    Characters are NFKD-normalised and anything that cannot be encoded as
    ASCII is dropped; remaining characters other than word characters,
    whitespace and hyphens are removed; the result is trimmed, lower-cased and
    every run of hyphens/whitespace is collapsed into a single ``-``.
    """
    ascii_only = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("utf-8")
    )
    cleaned = re.sub(r"[^\w\s-]", "", ascii_only)
    cleaned = cleaned.strip().lower()
    return re.sub(r"[-\s]+", "-", cleaned)
11 changes: 9 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import tempfile

import streamlit as st
from files import file_uploader
from files import file_uploader, url_uploader
from question import chat_with_doc
from brain import brain
from langchain.embeddings.openai import OpenAIEmbeddings
Expand Down Expand Up @@ -63,7 +63,14 @@
"Select Chunk Size", 100, 1000, st.session_state['chunk_size'], 50)
st.session_state['chunk_overlap'] = st.sidebar.slider(
"Select Chunk Overlap", 0, 100, st.session_state['chunk_overlap'], 10)
file_uploader(supabase, openai_api_key, vector_store)

# Create two columns for the file uploader and URL uploader
col1, col2 = st.columns(2)

with col1:
file_uploader(supabase, openai_api_key, vector_store)
with col2:
url_uploader(supabase, openai_api_key, vector_store)
elif user_choice == 'Chat with your Brain':
# Display model and temperature selection only when asking questions
st.sidebar.title("Configuration")
Expand Down