In [1]:
from llama_index import(
    ServiceContext,
    StorageContext,
    SimpleDirectoryReader,
    LangchainEmbedding,
    VectorStoreIndex,
    load_index_from_storage,
    load_graph_from_storage,
    LLMPredictor,
    PromptHelper
    )

# upload model
from llama_index.llms import LangChainLLM
from llama_index.graph_stores import SimpleGraphStore
from llama_index import (KnowledgeGraphIndex)
from llama_index.storage.storage_context import StorageContext
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [2]:
from typing import Callable, Dict, Generator, List, Optional, Type
from pathlib import Path
import logging


logger = logging.getLogger(__name__)
class DirectorySearchSource():
    """Collect file paths from a directory, with optional filtering.

    Args:
        num_files_limit (Optional[int]): Maximum number of paths to return.
            ``None`` or a non-positive value means no limit.
        exclude_hidden (bool): Skip files whose own name, or the name of any
            directory between ``input_dir`` and the file, starts with a dot.
        required_exts (Optional[List[str]]): If given, keep only files whose
            suffix (leading dot included, e.g. ``".html"``) is in this list.
            The comparison is case-sensitive.
        recursive (bool): Descend into sub-directories when true.
    """

    def __init__(
        self,
        num_files_limit: Optional[int] = None,
        exclude_hidden: bool = True,
        required_exts: Optional[List[str]] = None,
        recursive: bool = True,
    ):
        super().__init__()

        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit

    @staticmethod
    def _is_hidden(ref: Path, root: Path) -> bool:
        """Return True if any path component of ``ref`` below ``root`` is dot-prefixed."""
        # Only components *below* root are inspected, so a dot-prefixed
        # input directory does not hide everything inside it.
        return any(part.startswith(".") for part in ref.relative_to(root).parts)

    def add_files(self, input_dir):
        """List the files under ``input_dir`` that pass the configured filters.

        Args:
            input_dir: Directory to scan (anything accepted by ``pathlib.Path``).

        Returns:
            List[str]: Matching file paths as strings, sorted by path and
            truncated to ``num_files_limit`` when that limit is positive.

        Raises:
            ValueError: If no file passes the filters.
        """
        root = Path(input_dir)
        candidates = root.rglob("*") if self.recursive else root.glob("*")

        selected = []
        for ref in candidates:
            # Directories are filtered here rather than in the glob pattern.
            if ref.is_dir():
                continue
            if self.exclude_hidden and self._is_hidden(ref, root):
                continue
            if self.required_exts is not None and ref.suffix not in self.required_exts:
                continue
            selected.append(ref)
        selected.sort()

        if not selected:
            raise ValueError(f"No files found in {input_dir}.")

        if self.num_files_limit is not None and self.num_files_limit > 0:
            selected = selected[: self.num_files_limit]

        # Lazy %-formatting: the message is only built when DEBUG is enabled.
        logger.debug(
            "> [DirectorySearchSource] Total files added: %d", len(selected)
        )

        return [str(path) for path in selected]

In [4]:
from llama_index.readers.base import BaseReader
from llama_index.schema import Document

class HtmlFilesReader(BaseReader):
    """Reader for HTML files stored on the local filesystem.

    Each input file becomes one ``Document`` whose text is the file content,
    optionally converted from HTML to text.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.

    """

    def __init__(self, html_to_text: bool = False):
        """Initialize with parameters.

        Raises:
            ImportError: If ``html_to_text`` is true and `html2text` is not
                installed.
        """
        # Only demand the optional dependency when it will actually be used.
        if html_to_text:
            try:
                import html2text  # noqa: F401
            except ImportError as exc:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                ) from exc
        self._html_to_text = html_to_text

    def load_data(self, input_files):
        """Load one document per input file.

        Args:
            input_files (List[str]): Paths of the files to read.

        Returns:
            List[Document]: One document per file, in input order, with
            ``metadata`` set to ``{'file_name': <path>}``.

        Raises:
            ValueError: If ``input_files`` is not a list.
        """
        if not isinstance(input_files, list):
            raise ValueError("input_files must be a list of strings.")

        if self._html_to_text:
            # Imported once here instead of once per file.
            import html2text

        documents = []
        for input_file in input_files:
            # errors="ignore" drops bytes that are not valid UTF-8 rather than
            # failing on a badly encoded page.
            with open(input_file, "r", errors="ignore", encoding='utf-8') as f:
                response = f.read()

            if self._html_to_text:
                response = html2text.html2text(response)

            doc = Document(text=response)
            doc.metadata = {'file_name': input_file}
            documents.append(doc)

        return documents

In [11]:
def clean_duplicate(documents):
    """Return ``documents`` without entries whose ``text`` repeats an earlier one.

    The first document carrying a given text is kept; later documents with
    exactly the same text are dropped. Relative order is preserved and the
    input is not modified.

    Args:
        documents: Iterable of objects exposing a ``text`` attribute.
            Assumes ``text`` is hashable (e.g. ``str``) so it can be tracked
            in a set.

    Returns:
        list: The retained documents (the same objects, not copies).
    """
    seen_texts = set()
    documents_clean = []
    for doc in documents:
        if doc.text in seen_texts:
            continue
        seen_texts.add(doc.text)
        documents_clean.append(doc)
    return documents_clean

In [12]:
# Load every file of the mirrored site as text, then drop documents whose
# text is identical to an earlier one.
input_dir = "./omniscien.com"


def filename_fn(filename):
    """Build the metadata dict for a file path."""
    return {'file_name': filename}


file_source = DirectorySearchSource()
lists_files = file_source.add_files(input_dir)

html_reader = HtmlFilesReader(html_to_text=True)
documents = html_reader.load_data(input_files=lists_files)

documents_clean = clean_duplicate(documents)

In [19]:
# Number of documents loaded from disk, before de-duplication.
len(documents)

169

In [15]:
# Number of documents left after dropping those with duplicate text.
len(documents_clean)

131

In [18]:
# Show which source file each retained document came from.
for doc in documents_clean:
    print(doc.metadata)

{'file_name': 'omniscien.com/about-us/careers/index.html'}
{'file_name': 'omniscien.com/about-us/company/index.html'}
{'file_name': 'omniscien.com/about-us/contact-us/index.html'}
{'file_name': 'omniscien.com/about-us/index.html'}
{'file_name': 'omniscien.com/about-us/internships/index.html'}
{'file_name': 'omniscien.com/about-us/legal/gdpr/index.html'}
{'file_name': 'omniscien.com/about-us/legal/index.html'}
{'file_name': 'omniscien.com/about-us/legal/privacy-policy/index.html'}
{'file_name': 'omniscien.com/about-us/news/index.html'}
{'file_name': 'omniscien.com/about-us/office-locations/index.html'}
{'file_name': 'omniscien.com/blog/ai-in-the-age-of-cybersecurity-tackling-the-most-relevant-ai-risk/index.html'}
{'file_name': 'omniscien.com/blog/breaking-language-barriers-39-world-leading-language-learning-organizations-using-technology-to-make-a-difference/index.html'}
{'file_name': 'omniscien.com/blog/hype-cycle-for-ai-technologies-in-business/index.html'}
{'file_name': 'omniscien.co

In [7]:
# Inspect the extracted text of the first loaded document.
documents[0].text

'\n\n[![](https://omniscien.com/wp-content/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/07/Logo_Web.jpg)](https://omniscien.com/)\n\n__\n\n  * [ Home](https://omniscien.com/)\n  * [Products __](/products/)\n\n    * [![Language Studio Logo](https://omniscien.com/wp-content/uploads/2020/10/LanguageStudio240min.png)](/lsev6/)\n\nPrivate and Secure Artificial Intelligence Tools for Enterprise\n\n[Overview](/lsev6/) | [Features](/lsev6/features/)\n\nEditions\n\n![](https://omniscien.com/wp-content/uploads/2020/08/SecureCloud45.png)\n\nSecure Cloud\n\n![](https://omniscien.com/wp-content/uploads/2020/08/Servers45.png)\n\nEnterprise\n\n[![Media Studio Logo](https://omniscien.com/wp-\ncontent/uploads/2020/10/MediaStudio240min.png)](/products/media-studio/)\n\nProject Management,  \nEditing & Subtitle Data Processing\n\n[Overview

In [9]:
# Inspect the extracted text of the second loaded document.
documents[1].text

'\n\n[![](https://omniscien.com/wp-content/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/07/Logo_Web.jpg)](https://omniscien.com/)\n\n__\n\n  * [ Home](https://omniscien.com/)\n  * [Products __](/products/)\n\n    * [![Language Studio Logo](https://omniscien.com/wp-content/uploads/2020/10/LanguageStudio240min.png)](/lsev6/)\n\nPrivate and Secure Artificial Intelligence Tools for Enterprise\n\n[Overview](/lsev6/) | [Features](/lsev6/features/)\n\nEditions\n\n![](https://omniscien.com/wp-content/uploads/2020/08/SecureCloud45.png)\n\nSecure Cloud\n\n![](https://omniscien.com/wp-content/uploads/2020/08/Servers45.png)\n\nEnterprise\n\n[![Media Studio Logo](https://omniscien.com/wp-\ncontent/uploads/2020/10/MediaStudio240min.png)](/products/media-studio/)\n\nProject Management,  \nEditing & Subtitle Data Processing\n\n[Overview

In [10]:
# Inspect the extracted text of the document at index 10.
documents[10].text

'\n\n[![](https://omniscien.com/wp-content/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/10/AW_Omniscien-\nLogo_RGB_Hi-01-croppedmin.png)![](https://omniscien.com/wp-\ncontent/uploads/2020/07/Logo_Web.jpg)](https://omniscien.com/)\n\n__\n\n  * [ Home](https://omniscien.com/)\n  * [Products __](/products/)\n\n    * [![Language Studio Logo](https://omniscien.com/wp-content/uploads/2020/10/LanguageStudio240min.png)](/lsev6/)\n\nPrivate and Secure Artificial Intelligence Tools for Enterprise\n\n[Overview](/lsev6/) | [Features](/lsev6/features/)\n\nEditions\n\n![](https://omniscien.com/wp-content/uploads/2020/08/SecureCloud45.png)\n\nSecure Cloud\n\n![](https://omniscien.com/wp-content/uploads/2020/08/Servers45.png)\n\nEnterprise\n\n[![Media Studio Logo](https://omniscien.com/wp-\ncontent/uploads/2020/10/MediaStudio240min.png)](/products/media-studio/)\n\nProject Management,  \nEditing & Subtitle Data Processing\n\n[Overview

In [20]:
# Inspect the extracted text of the last loaded document.
documents[-1].text

'\n\n'