Skip to content

Commit

Permalink
fix: sync creation (#2637)
Browse files Browse the repository at this point in the history
This pull request includes updates to the `docker-compose.dev.yml` and
`Dockerfile.dev` files, as well as several backend Python modules. The
changes aim to improve performance and fix bugs. The updates include:

- Removing unnecessary workers configuration in the
`docker-compose.dev.yml` file.

- Updating the base image in the `Dockerfile.dev` to use a slim version.

- Adjusting the schedule of the `process_sync_active` task from every 5
minutes to every minute.

- Modifying the time interval for retrieving active syncs.

- Changing the loader class for processing PowerPoint files from
`UnstructuredPowerPointLoader` to `UnstructuredFileLoader`.

- Refactoring the file existence check logic to compare base filenames
(ignoring extensions).

- Adding debug logs for file existence check and file removal.

- Adjusting the file synchronization logic.

These changes are intended to enhance the performance and stability of
the application.
  • Loading branch information
StanGirard committed Jun 6, 2024
1 parent 6ea4a45 commit 31d3cce
Show file tree
Hide file tree
Showing 9 changed files with 914 additions and 853 deletions.
1,552 changes: 803 additions & 749 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion backend/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Using a slim version for a smaller base image
FROM python:3.11.6-slim-bullseye@sha256:0c1fbb294096d842ad795ee232d783cab436c90b034210fe894f2bb2f2be7626
FROM python:3.11.6-slim-bullseye

ARG DEV_MODE
ENV DEV_MODE=$DEV_MODE
Expand Down
2 changes: 1 addition & 1 deletion backend/celery_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,6 @@ def process_integration_brain_sync():
},
"process_sync_active": {
"task": "process_sync_active",
"schedule": crontab(minute="*/5", hour="*"),
"schedule": crontab(minute="*/1", hour="*"),
},
}
46 changes: 24 additions & 22 deletions backend/modules/sync/utils/googleutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,20 @@ async def _upload_files(
logger.info("Google Drive credentials refreshed")
# Updating the credentials in the database

try:
service = build("drive", "v3", credentials=creds)
downloaded_files = []
for file in files:
service = build("drive", "v3", credentials=creds)
downloaded_files = []
for file in files:
logger.info("🔥🔥🔥🔥: %s", file)
try:
file_id = file["id"]
file_name = file["name"]
mime_type = file["mime_type"]
modified_time = file["last_modified"]
# Convert Google Docs files to appropriate formats before downloading
if mime_type == "application/vnd.google-apps.document":
logger.debug(
"Converting Google Docs file with file_id: %s to DOCX.", file_id
"Converting Google Docs file with file_id: %s to DOCX.",
file_id,
)
request = service.files().export_media(
fileId=file_id,
Expand Down Expand Up @@ -119,7 +121,7 @@ async def _upload_files(

# Check if the file already exists in the storage
if check_file_exists(brain_id, file_name):
logger.info("🔥 File already exists in the storage: %s", file_name)
logger.debug("🔥 File already exists in the storage: %s", file_name)

self.storage.remove_file(brain_id + "/" + file_name)
BrainsVectors().delete_file_from_brain(brain_id, file_name)
Expand All @@ -129,7 +131,7 @@ async def _upload_files(
filename=file_name,
)

await upload_file(to_upload_file, brain_id, current_user)
await upload_file(to_upload_file, brain_id, current_user) # type: ignore

# Check if the file already exists in the database
existing_files = self.sync_files_repo.get_sync_files(sync_active_id)
Expand All @@ -156,13 +158,13 @@ async def _upload_files(
)
)

downloaded_files.append(file_name)
return {"downloaded_files": downloaded_files}
except HttpError as error:
logger.error(
"An error occurred while downloading Google Drive files: %s", error
)
return {"error": f"An error occurred: {error}"}
downloaded_files.append(file_name)
except HttpError as error:
logger.error(
"An error occurred while downloading Google Drive files: %s",
error,
)
return {"downloaded_files": downloaded_files}

async def sync(self, sync_active_id: int, user_id: str):
"""
Expand Down Expand Up @@ -244,29 +246,29 @@ async def sync(self, sync_active_id: int, user_id: str):

# Filter files that have been modified since the last sync
last_synced_time = datetime.fromisoformat(last_synced) if last_synced else None

files_to_download = [
file
for file in files.get("files", [])
if not file["is_folder"]
and (
not last_synced_time
or datetime.fromisoformat(file["last_modified"]) > last_synced_time
(
not last_synced_time
or datetime.fromisoformat(file["last_modified"]) > last_synced_time
)
or not check_file_exists(sync_active["brain_id"], file["name"])
)
]

logger.error(files_to_download)

downloaded_files = await self._upload_files(
sync_user["credentials"],
files_to_download,
user_id,
sync_active["brain_id"],
sync_active_id,
)
if "error" in downloaded_files:
logger.error(
"Failed to download files from Google Drive for sync_active_id: %s",
sync_active_id,
)
return None

# Update the last_synced timestamp
self.sync_active_service.update_sync_active(
Expand Down
15 changes: 9 additions & 6 deletions backend/modules/sync/utils/sharepointutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async def _upload_files(

# Check if the file already exists in the storage
if check_file_exists(brain_id, file_name):
logger.info("🔥 File already exists in the storage: %s", file_name)
logger.debug("🔥 File already exists in the storage: %s", file_name)

self.storage.remove_file(brain_id + "/" + file_name)
BrainsVectors().delete_file_from_brain(brain_id, file_name)
Expand Down Expand Up @@ -250,11 +250,14 @@ async def sync(self, sync_active_id: int, user_id: str):
for file in files.get("files", [])
if not file["is_folder"]
and (
not last_synced_time
or datetime.strptime(
file["last_modified"], "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=timezone.utc)
> last_synced_time
(
not last_synced_time
or datetime.strptime(
file["last_modified"], "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=timezone.utc)
> last_synced_time
)
or not check_file_exists(sync_active["brain_id"], file["name"])
)
]

Expand Down
10 changes: 7 additions & 3 deletions backend/modules/upload/service/upload_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,12 @@ def check_file_exists(brain_id: str, file_identifier: str) -> bool:
response = supabase_client.storage.from_("quivr").list(brain_id)

# Check if the file_identifier is in the response
file_exists = any(file["name"] == file_identifier for file in response)

file_exists = any(
file["name"].split(".")[0] == file_identifier.split(".")[0]
for file in response
)
logger.info(f"File identifier: {file_identifier}")
logger.info(f"File exists: {file_exists}")
if file_exists:
logger.info(f"File {file_identifier} exists.")
return True
Expand All @@ -59,7 +63,7 @@ def check_file_exists(brain_id: str, file_identifier: str) -> bool:
return False
except Exception as e:
logger.error(f"An error occurred while checking the file: {e}")
raise e
return True


def upload_file_storage(file, file_identifier: str, upsert: str = "false"):
Expand Down
4 changes: 2 additions & 2 deletions backend/packages/files/parsers/powerpoint.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from models import File

from .common import process_file
Expand All @@ -9,7 +9,7 @@ def process_powerpoint(
):
return process_file(
file=file,
loader_class=UnstructuredPowerPointLoader,
loader_class=UnstructuredFileLoader,
brain_id=brain_id,
original_file_name=original_file_name,
integration=integration,
Expand Down
Loading

0 comments on commit 31d3cce

Please sign in to comment.