<a href="https://colab.research.google.com/github/RamachandraMurthy/Colabs-Repo/blob/master/Certified_Langchain_Directory_loader_V2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Program by Ramachandra Murthy

03/28/2023

This is a handy library that loads the files from a given directory. It also recurses into sub-folders. It can be used as a starting program.

This is a Directory Loader from the LangChain GitHub repository. It is an alternative to my previous version.

In [None]:
!pip install -q langchain
!pip install -q unstructured
!pip install -q unstructured[local-inference]

# *Restart the runtime*

In [1]:
# Define CSVLoader class
from csv import DictReader
from typing import Dict, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

class CSVLoader(BaseLoader):
    """Load a CSV file into one ``Document`` per data row.

    Each row is rendered as ``"column: value"`` lines joined by newlines.
    The metadata of every document holds ``source`` (the row's value in
    ``source_column``, or the file path when no source column is configured)
    and ``row`` (the zero-based index of the row as enumerated from the reader).

    Args:
        file_path: Path of the CSV file to read.
        source_column: Optional column whose value is used as the ``source``
            metadata of each row.
        csv_args: Keyword arguments forwarded to ``csv.DictReader``. When
            omitted, a comma delimiter and ``"`` quote character are used.
        encoding: Encoding handed to ``open``.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        if csv_args is None:
            self.csv_args = {
                "delimiter": ",",
                "quotechar": '"',
            }
        else:
            self.csv_args = csv_args

    @staticmethod
    def _clean(cell):
        """Return ``cell`` in printable form without assuming it is a string.

        ``DictReader`` produces non-string items for ragged rows: surplus
        fields are collected in a list (under the ``restkey`` key, ``None`` by
        default) and missing fields are filled with ``restval`` (``None`` by
        default). Strings are stripped, lists are stripped item by item and
        comma-joined, anything else is returned untouched.
        """
        if isinstance(cell, str):
            return cell.strip()
        if isinstance(cell, list):
            return ",".join(str(item).strip() for item in cell)
        return cell

    def load(self) -> List[Document]:
        """Read the CSV file and return one ``Document`` per row.

        Raises:
            KeyError: If ``source_column`` is set but absent from a row.
        """
        docs = []

        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            reader = DictReader(csvfile, **self.csv_args)
            for i, row in enumerate(reader):
                # _clean keeps ragged rows from raising AttributeError on
                # None keys/values or list values.
                content = "\n".join(
                    f"{self._clean(k)}: {self._clean(v)}" for k, v in row.items()
                )
                if self.source_column is not None:
                    source = row[self.source_column]
                else:
                    source = self.file_path
                metadata = {"source": source, "row": i}
                docs.append(Document(page_content=content, metadata=metadata))

        return docs


In [2]:
# Import required libraries and define DirectoryLoader class
import logging
from pathlib import Path
from typing import List, Type, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader

# Type alias for the loader classes that DirectoryLoader declares for its
# ``loader_cls`` argument.
FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]

# Module-level logger; DirectoryLoader.load writes per-file failures to it as
# warnings when ``silent_errors`` is enabled.
logger = logging.getLogger(__name__)

def _is_visible(p: Path) -> bool:
    parts = p.parts
    for _p in parts:
        if _p.startswith("."):
            return False
    return True

class DirectoryLoader(BaseLoader):
    """Load the files found under a directory into ``Document`` objects.

    Args:
        path: Directory to search.
        glob: Pattern handed to ``Path.glob`` (or ``Path.rglob`` when
            ``recursive`` is true).
        silent_errors: When true, an exception raised while loading a file is
            logged as a warning and that file is skipped; otherwise the
            exception propagates.
        load_hidden: When true, files are kept even if a component of their
            path relative to ``path`` starts with a dot.
        loader_cls: Loader class instantiated with the file path (as ``str``)
            for every file that is not a CSV.
        recursive: Use ``Path.rglob`` instead of ``Path.glob``.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        recursive: bool = False,
    ):
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.silent_errors = silent_errors
        self.recursive = recursive

    def _iter_files(self):
        """Yield every matched ``Path`` that is a file and passes the hidden filter.

        Single source of truth for ``load`` and ``list_files`` so both always
        agree on which files belong to the directory.
        """
        root = Path(self.path)
        matches = root.rglob(self.glob) if self.recursive else root.glob(self.glob)
        for candidate in matches:
            if not candidate.is_file():
                continue
            # Visibility is judged on the path relative to the root, so a
            # dot-prefixed component in the root itself does not hide files.
            if self.load_hidden or _is_visible(candidate.relative_to(root)):
                yield candidate

    def load(self) -> List[Document]:
        """Load every selected file and return all resulting documents.

        CSV files are always read with ``CSVLoader``; every other file is read
        with ``loader_cls``.

        Raises:
            Exception: Whatever a file's loader raises, unless
                ``silent_errors`` is true.
        """
        docs = []
        for file_path in self._iter_files():
            try:
                # Compare case-insensitively so ``.CSV`` files are also parsed
                # row by row.
                if file_path.suffix.lower() == ".csv":
                    sub_docs = CSVLoader(str(file_path)).load()
                else:
                    sub_docs = self.loader_cls(str(file_path)).load()
                docs.extend(sub_docs)
            except Exception as e:
                if not self.silent_errors:
                    # Bare raise re-raises the active exception unchanged.
                    raise
                logger.warning(e)
        return docs

    def list_files(self) -> List[str]:
        """Return the paths (as strings) of the files that ``load`` would read."""
        return [str(file_path) for file_path in self._iter_files()]


In [3]:
# Mount Google Drive at /content/drive so the data folder under MyDrive can be
# read through ordinary filesystem paths.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Initialize DirectoryLoader
# `path` is the Drive folder to scan. The `[!.]*` part of the glob only matches
# names whose first character is not a dot, and `recursive=True` makes the
# loader search with `rglob`.
path = "/content/drive/MyDrive/Colab Notebooks/Data/"
glob_pattern = "**/[!.]*"
recursive = True

loader = DirectoryLoader(path, glob=glob_pattern, recursive=recursive)

In [5]:
# List and sort file names, and print the summary
from pathlib import Path
from collections import defaultdict

# Collect every file the loader selects, ordered by file extension.
file_names = loader.list_files()
sorted_file_names = sorted(file_names, key=lambda name: Path(name).suffix)

# Tally the files per extension; keys are inserted in the sorted order above.
file_type_counts = defaultdict(int)
for extension in map(lambda name: Path(name).suffix, sorted_file_names):
    file_type_counts[extension] += 1

total_files = len(sorted_file_names)
unique_file_types = len(file_type_counts)

# Build the summary once and print it in a single call.
summary_lines = [
    f"Total Files: {total_files}",
    f"Unique File Types: {unique_file_types}",
    "File Type Counts:",
]
summary_lines += [
    f"{file_type}: {count}" for file_type, count in file_type_counts.items()
]
print("\n".join(summary_lines))

print("\nFile Names:")
for name in sorted_file_names:
    print(name)


Total Files: 9
Unique File Types: 4
File Type Counts:
.csv: 1
.pdf: 4
.pptx: 1
.txt: 3

File Names:
/content/drive/MyDrive/Colab Notebooks/Data/Analytics Team.csv
/content/drive/MyDrive/Colab Notebooks/Data/DXC Time Entry Policy.pdf
/content/drive/MyDrive/Colab Notebooks/Data/Code of Conduct-English.pdf
/content/drive/MyDrive/Colab Notebooks/Data/DXC Information Security Policy.pdf
/content/drive/MyDrive/Colab Notebooks/Data/DXC IT Mobility Policy.pdf
/content/drive/MyDrive/Colab Notebooks/Data/Test.pptx
/content/drive/MyDrive/Colab Notebooks/Data/Get & Give feedback.txt
/content/drive/MyDrive/Colab Notebooks/Data/Annual Leave.txt
/content/drive/MyDrive/Colab Notebooks/Data/DXC Time.txt


In [6]:
# Load every file selected by the DirectoryLoader and keep the resulting
# documents in `data` (a separate list from `file_data`).
file_data = loader.load()
data = list(file_data)



In [7]:
# Chunk your data up into smaller documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size=1000 with no overlap between neighbouring chunks.
# NOTE(review): the size is presumably measured in characters — confirm against the splitter's length function.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# `texts` holds the chunked documents that are embedded and uploaded further below.
texts = text_splitter.split_documents(data)
print(f'Now you have {len(texts)} documents')


Now you have 3112 documents


In [8]:
!pip install -q pinecone-client
!pip install -q openai

from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
import pinecone
import openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 KB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

  from tqdm.autonotebook import tqdm


In [9]:
# Credentials must never be stored in the notebook: anything saved here is
# visible to everyone who can open the file. Keys that were committed in an
# earlier revision should be revoked and regenerated.
import os
from getpass import getpass

# Set your OpenAI API key: taken from the OPENAI_API_KEY environment variable,
# otherwise requested through a hidden prompt.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Set your Pinecone API key and environment the same way.
pinecone.api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone_api_env = os.environ.get("PINECONE_ENVIRONMENT", "us-central1-gcp")  # Replace with your Pinecone environment


In [10]:
# Create embeddings of your documents to get ready for semantic search
# Please note the index name is from pinecone service
# Pinecone.io (https://app.pinecone.io)

embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)
pinecone.init(api_key=pinecone.api_key, environment=pinecone_api_env)
index_name = "dxcpolicy"
# Upload each chunk's metadata together with its text. Sending only
# page_content would discard the metadata the loaders attached (such as the
# source file), leaving the search results with nothing to trace an answer to.
docsearch = Pinecone.from_texts(
    [t.page_content for t in texts],
    embeddings,
    metadatas=[t.metadata for t in texts],
    index_name=index_name,
)

In [11]:
# Load the QA chain with the OpenAI language model
from langchain.chains.question_answering import load_qa_chain

# NOTE(review): temperature=0 is presumably chosen to keep answers as repeatable as possible — confirm.
llm = OpenAI(temperature=0, openai_api_key=openai.api_key)
# `chain` is the object later cells call as chain.run(input_documents=..., question=...).
chain = load_qa_chain(llm, chain_type="stuff")

--------------------------------------

Run the test below before trying it through the Gradio interface.

In [12]:
# Manual check: run a similarity search for a sample question against the
# Pinecone-backed `docsearch` store and keep the matches in `docs`.
query = "How many are listed under Ramachandra Murthy"
docs = docsearch.similarity_search(query, include_metadata=True)

In [13]:
# Run the QA chain on `query` using the retrieved `docs`; the returned answer
# is displayed as this cell's output.
chain.run(input_documents=docs, question=query)

' There are four people listed under Ramachandra Murthy: Ramachandra Murthy, Iris Bordey-Choi, Andre van Beuzekom, and Ericka May R Tolentino.'

--------------------------------------

In [14]:
!pip install -q gradio
!pip install --upgrade pillow


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.1/144.1 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ffmpy (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import gradio as gr

In [16]:
def semantic_search_qa(query):
    """Answer ``query`` using the indexed documents.

    Runs a similarity search on the module-level ``docsearch`` store and hands
    the matches, together with the question, to the module-level ``chain``.

    Args:
        query: The question text.

    Returns:
        The value returned by ``chain.run``, or ``None`` when ``query`` is
        empty or contains only whitespace.
    """
    # Blank input (including whitespace-only text) cannot be answered, so skip
    # the embedding lookup and the LLM call entirely.
    if not query or (isinstance(query, str) and not query.strip()):
        return None

    # Perform a similarity search on the document embeddings
    docs = docsearch.similarity_search(query, include_metadata=True)

    # Run the QA chain with the found documents and the query
    return chain.run(input_documents=docs, question=query)

# Create Gradio interface for the semantic search and question-answering process
def launch_gradio_interface(share=True):
    """Build and launch the Gradio UI for semantic search and question answering.

    Args:
        share: Forwarded to ``launch``. Defaults to ``True``, matching the
            previous hard-coded value; pass ``False`` to avoid requesting a
            public share link.
    """
    # Use the top-level Textbox component; the gr.inputs / gr.outputs
    # namespaces are deprecated aliases that newer Gradio releases no longer ship.
    question_box = gr.Textbox(lines=7, label="Ask a question")
    answer_box = gr.Textbox(label="Answer")

    gr.Interface(
        fn=semantic_search_qa,
        inputs=question_box,
        outputs=answer_box,
        title="DXC Policy Search and QA by Ramachandra Murthy",
        description="Ask any DXC policy question",
        theme="compact",
    ).launch(share=share)

In [17]:
# Call the Gradio interface function.
# This call ends in launch(share=True), so the app is also served on a public
# share URL (see the cell output).
launch_gradio_interface()



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://26476e75105f444b16.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces
