In [26]:
import os

# Define the project root directory.
# The current working directory is taken as the root, so every relative path
# passed to write_file() below is resolved against wherever the notebook
# kernel is currently running.
project_root = os.getcwd()  # Ensure the Jupyter Notebook is running in the project root

# Function to write content to a file
def write_file(relative_path, content):
    """
    Write ``content`` to a file located under the project root.

    The destination is ``os.path.join(project_root, relative_path)``. Missing
    parent directories are created first, so writing into a package folder
    that does not exist yet (e.g. on a fresh checkout) succeeds instead of
    failing. An existing file is overwritten. Errors are reported on stdout
    rather than raised, so a sequence of calls keeps going after one failure.

    Args:
        relative_path (str): Relative path to the file from the project root.
        content (str): Content to write into the file (encoded as UTF-8).

    Returns:
        bool: True if the file was written, False if a failure was reported.
    """
    file_path = os.path.join(project_root, relative_path)
    try:
        parent_dir = os.path.dirname(file_path)
        # dirname is empty only when file_path has no directory component;
        # os.makedirs("") would raise, so skip it in that case.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        # Deliberately broad: this is a best-effort notebook helper that
        # reports the problem and lets the remaining files be processed.
        print(f"❌ Failed to update {relative_path}: {e}")
        return False
    else:
        print(f"✅ Successfully updated: {relative_path}")
        return True

# 1. Update src/detection/config.py
# The module source is held in a plain (non-raw) triple-quoted string, so each
# doubled backslash in the regex patterns below is written to disk as a single
# backslash (e.g. "\\d" here becomes "\d" inside the generated r"..." literal).
# The backslash right after the opening quotes suppresses the leading newline,
# so the generated file starts directly with its header comment.
config_py_content = '''\
# src/detection/config.py

import os

# Project Root Directory
CURRENT_FILE = os.path.abspath(__file__)
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(CURRENT_FILE), "../../"))

# Data Directories
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
SAMPLE_DOCUMENTS_DIR = os.path.join(DATA_DIR, "sample_documents")

# Logs Configuration
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
DETECTION_LOG_FILE = os.path.join(LOG_DIR, "detection_errors.log")
PERFORMANCE_LOG_FILE = os.path.join(LOG_DIR, "performance_logs", "performance_metrics.log")

# Supported File Formats
SUPPORTED_FORMATS = (".pdf", ".docx")

# TOC Detection Regex Patterns
PDF_TOC_REGEX = r"\\d+(\\.\\d+)*\\s+[\\w\\s]+(\\.\\.\\.\\d+)?"
DOCX_TOC_REGEX = r"\\d+(\\.\\d+)*\\s+[\\w\\s]+(\\.\\.\\.\\d+)?"

# Logging Settings
LOGGING_LEVEL = "INFO"
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"

# Git Commit Message
GIT_COMMIT_MESSAGE = "Processed multiple sample files for TOC detection"
'''
write_file('src/detection/config.py', config_py_content)

# 2. Update src/detection/exceptions.py
# The generated module defines TOCDetectionError as the common base class, with
# UnsupportedFileFormatError and FileProcessingError deriving from it, so the
# detection code can catch the whole family with a single except clause.
exceptions_py_content = '''\
# src/detection/exceptions.py

class TOCDetectionError(Exception):
    """Base exception for TOC detection errors."""
    pass

class UnsupportedFileFormatError(TOCDetectionError):
    """Exception raised for unsupported file formats."""
    pass

class FileProcessingError(TOCDetectionError):
    """Exception raised when file processing fails."""
    pass
'''
write_file('src/detection/exceptions.py', exceptions_py_content)

# 3. Update src/detection/utils.py
# The generated setup_logging() is idempotent: its handlers carry fixed names
# and are only attached when a handler with that name is not already on the
# root logger, so calling it again (e.g. re-running a notebook cell) does not
# duplicate every log line.
detection_utils_py_content = '''\
# src/detection/utils.py

import os
import logging
from .config import LOG_DIR, DETECTION_LOG_FILE, LOGGING_LEVEL, LOGGING_FORMAT
from .exceptions import UnsupportedFileFormatError

# Names used to recognise handlers installed by setup_logging().
_FILE_HANDLER_NAME = "toc_detection_file"
_CONSOLE_HANDLER_NAME = "toc_detection_console"

def setup_logging():
    """
    Sets up logging with both file and console handlers.
    Ensures that the log directory exists.

    Safe to call more than once: a handler is only added to the root logger
    if one with the same name has not been installed already.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(os.path.dirname(DETECTION_LOG_FILE), exist_ok=True)

    logger = logging.getLogger()
    logger.setLevel(getattr(logging, LOGGING_LEVEL.upper(), logging.INFO))

    installed = {handler.get_name() for handler in logger.handlers}

    # File Handler
    if _FILE_HANDLER_NAME not in installed:
        file_handler = logging.FileHandler(DETECTION_LOG_FILE)
        file_handler.set_name(_FILE_HANDLER_NAME)
        file_handler.setFormatter(logging.Formatter(LOGGING_FORMAT))
        logger.addHandler(file_handler)

    # Console Handler
    if _CONSOLE_HANDLER_NAME not in installed:
        console_handler = logging.StreamHandler()
        console_handler.set_name(_CONSOLE_HANDLER_NAME)
        console_handler.setFormatter(logging.Formatter(LOGGING_FORMAT))
        logger.addHandler(console_handler)

def validate_file_path(file_path, supported_formats):
    """
    Validates if the file exists and is of a supported format.

    Args:
        file_path (str): Path to the file.
        supported_formats (tuple): Supported file extensions.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnsupportedFileFormatError: If the file format is unsupported.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.lower().endswith(supported_formats):
        raise UnsupportedFileFormatError(f"Unsupported file format: {file_path}")
'''
write_file('src/detection/utils.py', detection_utils_py_content)

# 4. Update src/detection/detect_toc.py
# The generated module collects the full text of each regex match. The TOC
# patterns contain capturing groups, so re.findall would return only tuples of
# group fragments; the module therefore uses re.finditer and match.group(0).
detect_toc_py_content = '''\
# src/detection/detect_toc.py

import os
import re
import logging
from PyPDF2 import PdfReader
from docx import Document
from .config import (
    PROJECT_ROOT,
    SUPPORTED_FORMATS,
    PDF_TOC_REGEX,
    DOCX_TOC_REGEX
)
from .exceptions import (
    TOCDetectionError,
    UnsupportedFileFormatError,
    FileProcessingError
)
from .utils import validate_file_path

def _extract_toc_entries(pattern, text):
    """
    Returns the complete matched text of every occurrence of pattern in text.

    re.findall is deliberately avoided: the TOC patterns contain capturing
    groups, so findall would yield only the group fragments (such as ".2"
    and "...5", or empty strings) instead of the whole TOC entry.

    Args:
        pattern (str): Regular expression describing one TOC entry.
        text (str): Text to search.

    Returns:
        list: Matched TOC entries with surrounding whitespace stripped.
    """
    return [match.group(0).strip() for match in re.finditer(pattern, text)]

def detect_toc(file_path):
    """
    Detects a Table of Contents (TOC) in a document.

    Args:
        file_path (str): Path to the document.

    Returns:
        dict: Detected TOC structure and metadata.

    Raises:
        TOCDetectionError: If the format is unsupported or processing fails
            (any unexpected error is wrapped in FileProcessingError).
    """
    logging.info(f"Starting TOC detection for file: {file_path}")
    try:
        validate_file_path(file_path, SUPPORTED_FORMATS)
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            result = detect_toc_pdf(file_path)
        elif ext == ".docx":
            result = detect_toc_docx(file_path)
        else:
            raise UnsupportedFileFormatError("Only PDF and DOCX formats are supported.")
        logging.info(f"TOC detection completed successfully for {file_path}")
        return result
    except TOCDetectionError as e:
        logging.error(f"TOCDetectionError: {e}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error during TOC detection for {file_path}: {e}")
        raise FileProcessingError(f"Failed to process {file_path}") from e

def detect_toc_pdf(file_path):
    """Detects TOC patterns in a PDF file, page by page."""
    toc_structure = []
    try:
        reader = PdfReader(file_path)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                toc_structure.extend(_extract_toc_entries(PDF_TOC_REGEX, text))
            else:
                logging.warning(f"No text found on page {page_number} of {file_path}")
        return {"format": "PDF", "toc_structure": toc_structure}
    except Exception as e:
        logging.error(f"Error processing PDF file {file_path}: {e}")
        raise FileProcessingError(f"Error processing PDF file {file_path}") from e

def detect_toc_docx(file_path):
    """Detects TOC patterns in a DOCX file, paragraph by paragraph."""
    toc_structure = []
    try:
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            toc_structure.extend(_extract_toc_entries(DOCX_TOC_REGEX, paragraph.text))
        return {"format": "DOCX", "toc_structure": toc_structure}
    except Exception as e:
        logging.error(f"Error processing DOCX file {file_path}: {e}")
        raise FileProcessingError(f"Error processing DOCX file {file_path}") from e
'''
write_file('src/detection/detect_toc.py', detect_toc_py_content)

# 5. Update src/utils/git_utils.py
# The generated module exposes auto_commit_push(), which runs "git add .",
# "git commit -m <message>" and "git push" in sequence via subprocess.run with
# check=True; a failing command is logged and its CalledProcessError re-raised.
git_utils_py_content = '''\
# src/utils/git_utils.py

import subprocess
import logging

def auto_commit_push(commit_message):
    """
    Automatically commits and pushes changes to the Git repository.

    Args:
        commit_message (str): The commit message.

    Raises:
        subprocess.CalledProcessError: If Git commands fail.
    """
    try:
        logging.info("Staging changes for commit...")
        subprocess.run(["git", "add", "."], check=True)
        
        logging.info("Committing changes...")
        subprocess.run(["git", "commit", "-m", commit_message], check=True)
        
        logging.info("Pushing to remote repository...")
        subprocess.run(["git", "push"], check=True)
        
        logging.info("Git commit and push completed successfully.")
    except subprocess.CalledProcessError as e:
        logging.error(f"Git command failed: {e}")
        raise
'''
write_file('src/utils/git_utils.py', git_utils_py_content)

# NOTE: this summary is printed unconditionally. write_file() catches write
# errors and prints them instead of raising, so check the per-file status
# lines above to confirm that every file was actually written.
print("\n🎉 All specified files have been updated successfully.")


✅ Successfully updated: src/detection/config.py
✅ Successfully updated: src/detection/exceptions.py
✅ Successfully updated: src/detection/utils.py
✅ Successfully updated: src/detection/detect_toc.py
✅ Successfully updated: src/utils/git_utils.py

🎉 All specified files have been updated successfully.
