Skip to content

Commit

Permalink
feat: Normalize file names in sync module (#2661)
Browse files Browse the repository at this point in the history
This commit adds a new utility function `remove_special_characters` to
the `normalize.py` module in the `sync/utils` directory. The function
removes special characters from file names by applying Unicode NFD
normalization to the input string and then using a regular expression to
remove every character that is not a word character, whitespace, or a
period.

The function is then used in the `list_files.py` module in the
`sync/utils` directory to normalize the names of files retrieved from
Google Drive and Azure Drive. This ensures that the file names are free
of special characters, improving consistency and compatibility with
other parts of the system.

Co-authored-by: Stan Girard <stan@quivr.app>
  • Loading branch information
StanGirard and StanGirard committed Jun 12, 2024
1 parent a04ceea commit 8e5af2c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
13 changes: 11 additions & 2 deletions backend/modules/sync/utils/list_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from google.auth.transport.requests import Request as GoogleRequest
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from modules.sync.utils.normalize import remove_special_characters
from logger import get_logger
from requests import HTTPError

Expand Down Expand Up @@ -53,6 +54,8 @@ def get_google_drive_files_by_id(credentials: dict, file_ids: List[str]):
)

logger.info("Google Drive files retrieved successfully: %s", len(files))
for file in files:
file["name"] = remove_special_characters(file["name"])
return files
except HTTPError as error:
logger.error("An error occurred while retrieving Google Drive files: %s", error)
Expand Down Expand Up @@ -138,6 +141,9 @@ def get_google_drive_files(
break

logger.info("Google Drive files retrieved successfully: %s", len(files))

for file in files:
file["name"] = remove_special_characters(file["name"])
return files
except HTTPError as error:
logger.error("An error occurred while retrieving Google Drive files: %s", error)
Expand Down Expand Up @@ -225,7 +231,8 @@ def fetch_files(endpoint, headers):
)

files.extend(folder_files)

for file in files:
file["name"] = remove_special_characters(file["name"])
logger.info("Azure Drive files retrieved successfully: %s", len(files))
return files

Expand Down Expand Up @@ -270,6 +277,8 @@ def get_azure_files_by_id(credentials: dict, file_ids: List[str]):
"mime_type": result.get("file", {}).get("mimeType", "folder"),
}
)


for file in files:
file["name"] = remove_special_characters(file["name"])
logger.info("Azure Drive files retrieved successfully: %s", len(files))
return files
15 changes: 15 additions & 0 deletions backend/modules/sync/utils/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import unicodedata
import re
from logger import get_logger

logger = get_logger(__name__)

def remove_special_characters(input: str) -> str:
    """Strip special characters from a file name.

    The string is first decomposed with Unicode NFD normalization, then every
    character that is not a word character (``\\w``), whitespace (``\\s``) or
    a period is removed. NFD splits an accented letter into its base letter
    plus combining marks; the combining marks do not match ``\\w``, so the
    accent is dropped while the base letter is kept (``"é"`` becomes ``"e"``).

    Args:
        input: The name to clean. The parameter name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.

    Returns:
        The cleaned string. If normalization raises (for example because
        ``input`` is not a ``str``), the error is logged and ``input`` is
        returned unchanged.
    """
    try:
        decomposed = unicodedata.normalize("NFD", input)
        # Keep word characters, whitespace and periods (so file extensions
        # survive); everything else, including combining marks, is removed.
        cleaned = re.sub(r"[^\w\s.]", "", decomposed)
        # Lazy %-style arguments: the message is only formatted when the
        # INFO level is actually enabled.
        logger.info("Input: %s, Normalized: %s", input, cleaned)
        return cleaned
    except Exception as e:
        # Deliberate best-effort: a name that cannot be normalized is passed
        # through untouched instead of failing the caller.
        logger.error("Error removing special characters: %s", e)
        return input

0 comments on commit 8e5af2c

Please sign in to comment.