<a href="https://colab.research.google.com/github/RamachandraMurthy/Colabs-Repo/blob/master/Langchain_Directory_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Program by Ramachandra Murthy

03/28/2023

This is a great library for loading the files from a given directory. It can also recurse into subfolders. It can be used as a starting program.

This is the Directory Loader from the LangChain GitHub repository. It is an alternative to my previous version.

In [None]:
!pip install -q langchain
!pip install -q unstructured
!pip install -q unstructured[local-inference]

In [None]:
"""Loading logic for loading documents from a directory."""
import logging
from pathlib import Path
from typing import List, Type, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader

# Loader classes accepted as the type of DirectoryLoader's ``loader_cls`` argument.
FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]
# Module-level logger named after the module (``__name__``); an earlier
# revision of this cell passed ``__file__`` instead.
logger = logging.getLogger(__name__)


def _is_visible(p: Path) -> bool:
    """Return True when no component of ``p`` begins with a dot.

    A path counts as hidden as soon as any of its parts (directory or file
    name) starts with ``"."``.
    """
    return not any(component.startswith(".") for component in p.parts)


class DirectoryLoader(BaseLoader):
    """Load documents from the files found under a directory.

    Candidate files are discovered with a glob pattern; each selected file is
    passed to ``loader_cls`` and the documents it returns are concatenated
    into a single list.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        recursive: bool = False,
    ):
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Directory to search.
            glob: Glob pattern, relative to ``path``, selecting candidates.
            silent_errors: When True, a file that fails to load is logged as
                a warning and skipped; when False the exception propagates.
            load_hidden: When True, also include files whose path relative to
                ``path`` has a component starting with ``"."``.
            loader_cls: Class instantiated with the file path (as ``str``)
                whose ``load()`` result is collected.
            recursive: When True, match with ``Path.rglob`` instead of
                ``Path.glob``.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.silent_errors = silent_errors
        self.recursive = recursive

    def _matching_files(self) -> List[Path]:
        """Return the files selected by the glob and hidden-file settings."""
        root = Path(self.path)
        candidates = root.rglob(self.glob) if self.recursive else root.glob(self.glob)
        return [
            item
            for item in candidates
            # Visibility is judged relative to the root so that a dot in the
            # root directory's own path does not hide everything beneath it.
            if item.is_file()
            and (self.load_hidden or _is_visible(item.relative_to(root)))
        ]

    def load(self) -> List[Document]:
        """Load documents from every selected file.

        Returns:
            The documents produced by ``loader_cls`` for each file, in the
            order the files were matched.

        Raises:
            Exception: Whatever ``loader_cls`` raises for a file, unless
                ``silent_errors`` is True.
        """
        docs: List[Document] = []
        for file_path in self._matching_files():
            try:
                docs.extend(self.loader_cls(str(file_path)).load())
            except Exception as e:
                if not self.silent_errors:
                    # Bare raise keeps the original traceback intact.
                    raise
                # Name the offending file so the skipped input can be found.
                logger.warning("Error loading file %s: %s", file_path, e)
        return docs

    def list_files(self) -> List[str]:
        """List file names without loading documents.

        Returns:
            The paths, as strings, of the files that ``load()`` would read.
        """
        return [str(file_path) for file_path in self._matching_files()]


In [None]:
# Mount Google Drive at /content/drive so the notebook can read files from it.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Settings for the directory scan.
path = "/content/drive/MyDrive/Colab Notebooks/Data/"  # Replace with the path to your directory containing documents
glob_pattern = "**/[!.]*"  # You can change this pattern to match the files you want to load
recursive = True  # Set to True if you want to load files from subdirectories, False otherwise

# Constructing the loader only stores these settings; no files are read
# until list_files() or load() is called.
loader = DirectoryLoader(path, glob=glob_pattern, recursive=recursive)


In [None]:
from pathlib import Path
# defaultdict is no longer used in this cell; the import is kept in case
# other cells rely on it being in the notebook namespace.
from collections import Counter, defaultdict

# List file names without loading documents
file_names = loader.list_files()

# Sort by file type (extension), then by path, so the listing is
# deterministic instead of depending on the order the glob returned files.
sorted_file_names = sorted(file_names, key=lambda name: (Path(name).suffix, name))

# Count files per extension; keys keep the sorted (first-seen) order.
file_type_counts = Counter(Path(name).suffix for name in sorted_file_names)

total_files = len(sorted_file_names)
unique_file_types = len(file_type_counts)

# Print the summary
print(f"Total Files: {total_files}")
print(f"Unique File Types: {unique_file_types}")
print("File Type Counts:")
for file_type, count in file_type_counts.items():
    # Files without a suffix would otherwise print a blank label.
    print(f"{file_type or '(no extension)'}: {count}")

# Print sorted file names one below the other
print("\nFile Names:")
for file_name in sorted_file_names:
    print(file_name)


In [None]:
# Load every matching file into Document objects. The loader was created
# without silent_errors, so a file that fails to load raises here.
docs = loader.load()
