Skip to content

Commit

Permalink
fix: sync creation (#2637)
Browse files Browse the repository at this point in the history
This pull request includes updates to the `docker-compose.dev.yml` and
`Dockerfile.dev` files, as well as several backend Python modules. The
changes aim to improve performance and fix bugs. The updates include:

- Removing unnecessary workers configuration in the
`docker-compose.dev.yml` file.

- Updating the base image in the `Dockerfile.dev` to use a slim version.

- Adjusting the schedule of the `process_sync_active` task from every 5
minutes to every minute.

- Modifying the time interval for retrieving active syncs.

- Changing the loader class for processing PowerPoint files from
`UnstructuredPowerPointLoader` to `UnstructuredFileLoader`.

- Refactoring the file existence check logic to compare base filenames
(ignoring extensions).

- Adding debug logs for file existence check and file removal.

- Adjusting the file synchronization logic.

These changes are intended to enhance the performance and stability of
the application.
  • Loading branch information
StanGirard committed Jun 6, 2024
1 parent 6ea4a45 commit 31d3cce
Show file tree
Hide file tree
Showing 9 changed files with 914 additions and 853 deletions.
1,552 changes: 803 additions & 749 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Using a slim version for a smaller base image
FROM python:3.11.6-slim-bullseye@sha256:0c1fbb294096d842ad795ee232d783cab436c90b034210fe894f2bb2f2be7626
FROM python:3.11.6-slim-bullseye

ARG DEV_MODE
ENV DEV_MODE=$DEV_MODE
Expand Down
2 changes: 1 addition & 1 deletion backend/celery_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,6 @@ def process_integration_brain_sync():
},
"process_sync_active": {
"task": "process_sync_active",
"schedule": crontab(minute="*/5", hour="*"),
"schedule": crontab(minute="*/1", hour="*"),
},
}
46 changes: 24 additions & 22 deletions backend/modules/sync/utils/googleutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,20 @@ async def _upload_files(
logger.info("Google Drive credentials refreshed")
# Updating the credentials in the database

try:
service = build("drive", "v3", credentials=creds)
downloaded_files = []
for file in files:
service = build("drive", "v3", credentials=creds)
downloaded_files = []
for file in files:
logger.info("🔥🔥🔥🔥: %s", file)
try:
file_id = file["id"]
file_name = file["name"]
mime_type = file["mime_type"]
modified_time = file["last_modified"]
# Convert Google Docs files to appropriate formats before downloading
if mime_type == "application/vnd.google-apps.document":
logger.debug(
"Converting Google Docs file with file_id: %s to DOCX.", file_id
"Converting Google Docs file with file_id: %s to DOCX.",
file_id,
)
request = service.files().export_media(
fileId=file_id,
Expand Down Expand Up @@ -119,7 +121,7 @@ async def _upload_files(

# Check if the file already exists in the storage
if check_file_exists(brain_id, file_name):
logger.info("🔥 File already exists in the storage: %s", file_name)
logger.debug("🔥 File already exists in the storage: %s", file_name)

self.storage.remove_file(brain_id + "/" + file_name)
BrainsVectors().delete_file_from_brain(brain_id, file_name)
Expand All @@ -129,7 +131,7 @@ async def _upload_files(
filename=file_name,
)

await upload_file(to_upload_file, brain_id, current_user)
await upload_file(to_upload_file, brain_id, current_user) # type: ignore

# Check if the file already exists in the database
existing_files = self.sync_files_repo.get_sync_files(sync_active_id)
Expand All @@ -156,13 +158,13 @@ async def _upload_files(
)
)

downloaded_files.append(file_name)
return {"downloaded_files": downloaded_files}
except HttpError as error:
logger.error(
"An error occurred while downloading Google Drive files: %s", error
)
return {"error": f"An error occurred: {error}"}
downloaded_files.append(file_name)
except HttpError as error:
logger.error(
"An error occurred while downloading Google Drive files: %s",
error,
)
return {"downloaded_files": downloaded_files}

async def sync(self, sync_active_id: int, user_id: str):
"""
Expand Down Expand Up @@ -244,29 +246,29 @@ async def sync(self, sync_active_id: int, user_id: str):

# Filter files that have been modified since the last sync
last_synced_time = datetime.fromisoformat(last_synced) if last_synced else None

files_to_download = [
file
for file in files.get("files", [])
if not file["is_folder"]
and (
not last_synced_time
or datetime.fromisoformat(file["last_modified"]) > last_synced_time
(
not last_synced_time
or datetime.fromisoformat(file["last_modified"]) > last_synced_time
)
or not check_file_exists(sync_active["brain_id"], file["name"])
)
]

logger.error(files_to_download)

downloaded_files = await self._upload_files(
sync_user["credentials"],
files_to_download,
user_id,
sync_active["brain_id"],
sync_active_id,
)
if "error" in downloaded_files:
logger.error(
"Failed to download files from Google Drive for sync_active_id: %s",
sync_active_id,
)
return None

# Update the last_synced timestamp
self.sync_active_service.update_sync_active(
Expand Down
15 changes: 9 additions & 6 deletions backend/modules/sync/utils/sharepointutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async def _upload_files(

# Check if the file already exists in the storage
if check_file_exists(brain_id, file_name):
logger.info("🔥 File already exists in the storage: %s", file_name)
logger.debug("🔥 File already exists in the storage: %s", file_name)

self.storage.remove_file(brain_id + "/" + file_name)
BrainsVectors().delete_file_from_brain(brain_id, file_name)
Expand Down Expand Up @@ -250,11 +250,14 @@ async def sync(self, sync_active_id: int, user_id: str):
for file in files.get("files", [])
if not file["is_folder"]
and (
not last_synced_time
or datetime.strptime(
file["last_modified"], "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=timezone.utc)
> last_synced_time
(
not last_synced_time
or datetime.strptime(
file["last_modified"], "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=timezone.utc)
> last_synced_time
)
or not check_file_exists(sync_active["brain_id"], file["name"])
)
]

Expand Down
10 changes: 7 additions & 3 deletions backend/modules/upload/service/upload_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,12 @@ def check_file_exists(brain_id: str, file_identifier: str) -> bool:
response = supabase_client.storage.from_("quivr").list(brain_id)

# Check if the file_identifier is in the response
file_exists = any(file["name"] == file_identifier for file in response)

file_exists = any(
file["name"].split(".")[0] == file_identifier.split(".")[0]
for file in response
)
logger.info(f"File identifier: {file_identifier}")
logger.info(f"File exists: {file_exists}")
if file_exists:
logger.info(f"File {file_identifier} exists.")
return True
Expand All @@ -59,7 +63,7 @@ def check_file_exists(brain_id: str, file_identifier: str) -> bool:
return False
except Exception as e:
logger.error(f"An error occurred while checking the file: {e}")
raise e
return True


def upload_file_storage(file, file_identifier: str, upsert: str = "false"):
Expand Down
4 changes: 2 additions & 2 deletions backend/packages/files/parsers/powerpoint.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from models import File

from .common import process_file
Expand All @@ -9,7 +9,7 @@ def process_powerpoint(
):
return process_file(
file=file,
loader_class=UnstructuredPowerPointLoader,
loader_class=UnstructuredFileLoader,
brain_id=brain_id,
original_file_name=original_file_name,
integration=integration,
Expand Down
Loading

0 comments on commit 31d3cce

Please sign in to comment.