In [1]:
from pathlib import Path
from langchain.vectorstores import FAISS
from dataset.parsing_data_utils import load_all_documents
import os
from tqdm import tqdm
import pinecone
from langchain.embeddings import GPT4AllEmbeddings
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv

# Dataset locations, resolved relative to the notebook's working directory.
raw_dataset_dir = Path("../../data/raw_dataset/html")            # raw HTML input
split_dataset_dir = Path("../../data/split_dataset/4")           # pre-split chunks that get embedded
embedded_dataset_dir = Path("../../data/embedded_dataset/faiss/1")  # where FAISS indexes are written

# Pull variables from a local .env file into os.environ
# (PINECONE_API_KEY / PINECONE_ENV are read from there further down).
load_dotenv()

  from tqdm.autonotebook import tqdm


In [2]:
# Load every document found under `split_dataset_dir` via the project helper.
# Later cells index and iterate `docs` and read `.page_content` / `.metadata`
# from each element (presumably LangChain Document objects; confirm against
# dataset.parsing_data_utils.load_all_documents).
docs = load_all_documents(split_dataset_dir)

Processing RFCs: 9266file [00:27, 336.44file/s]


In [None]:
# Build a FAISS index over all of `docs` using GPT4All embeddings, writing
# intermediate checkpoints so a long run does not have to restart from zero.
#
# Fixes relative to the previous version of this cell:
#   * the progress bar total no longer counts the seed document (the loop
#     iterates docs[1:], so the bar previously could never reach 100%);
#   * a checkpoint is no longer written after the very first document
#     (`not idx % N` is true for idx == 0);
#   * an empty dataset fails with a clear message instead of an IndexError.
checkpoint_every = 80000  # number of appended documents between checkpoints

if not docs:
    raise ValueError("docs is empty: nothing to embed")

embeddings = GPT4AllEmbeddings()

# Seed the store with the first document; the rest are appended one by one below.
global_db = FAISS.from_documents([docs[0]], embeddings)

remaining_docs = docs[1:]
# tqdm takes its total from len(remaining_docs), so it matches the loop exactly.
for n_added, doc in enumerate(tqdm(remaining_docs, desc="Embedding in progress"), start=1):
    global_db.add_documents([doc])
    # n_added counts documents appended after the seed document.
    if n_added % checkpoint_every == 0:
        global_db.save_local(embedded_dataset_dir / f'faiss_idx_ckpt_{n_added}')

# Final, complete index.
global_db.save_local(embedded_dataset_dir / 'faiss_idx')

Embedding in progress:   0%|          | 130/607285 [00:22<28:49:16,  5.85it/s]


In [4]:
# Split each document into its text and its metadata, keeping both lists
# index-aligned so position k in one corresponds to position k in the other.
contents, metadatas = [], []
for document in docs:
    contents.append(document.page_content)
    metadatas.append(document.metadata)

In [2]:
# Sentence-transformers model used to embed the document chunks.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is passed both when loading the model and when encoding.
model_options = {'device': device}
encode_options = {'device': device, 'batch_size': 32}

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=model_options,
    encode_kwargs=encode_options,
)

False

The following directories listed in your path were found to be non-existent: {WindowsPath('notebooks/dataset/embedding_data.ipynb')}
The following directories listed in your path were found to be non-existent: {WindowsPath('module'), WindowsPath('/matplotlib_inline.backend_inline')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}
DEBUG: Possible options found for libcudart.so: set()
CUDA SETUP: PyTorch settings found: CUDA_VERSION=118, Highest Compute Capability: 7.5.
CUDA SETUP: To manually override the PyTorch CUDA version please see:https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md
CUDA SETUP: Loading binary C:\Users\cubix\PycharmProjects\rag\venv39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.so...
argument of type 'WindowsPath' is not iterable
CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.
CUDA SETUP: Solution 


python -m bitsandbytes


  warn(msg)


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):

        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [6]:
# Name of the Pinecone index this notebook works against.
index_name = 'test'

# Credentials come from the environment (populated from .env by load_dotenv()).
# A missing variable yields None here rather than raising.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)

# Create the index only on first use.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # NOTE(review): dimension must equal the embedding model's output size;
    # 384 is presumably chosen for all-MiniLM-L6-v2. Confirm if the model changes.
    pinecone.create_index(index_name, dimension=384, metric='cosine')

In [7]:
# Client handle for the index named by `index_name`.
index = pinecone.Index(index_name)
# Left as the cell's final bare expression so the notebook displays the
# returned stats (dimension, fullness, namespaces, vector count).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
# Embed `contents` in fixed-size batches, keeping the matching metadata slice
# alongside each batch of vectors.
#
# Changes relative to the previous version of this cell:
#   * `tqdm` is no longer re-imported here; the notebook's first cell imports it;
#   * `i_end` was computed but never used; it now bounds both slices;
#   * the disabled upsert sketch zipped a single vector with a single metadata
#     dict, which does not form upsert records; it is replaced by a batched form.
batch_size = 32

for i in tqdm(range(0, len(contents), batch_size)):
    # Clamp the upper bound so the final batch stops at the end of the data.
    i_end = min(len(contents), i + batch_size)
    contents_batch = contents[i:i_end]
    metadatas_batch = metadatas[i:i_end]
    embeds = embed_model.embed_documents(contents_batch)

    # TODO: the Pinecone upsert is still disabled, so `embeds` is currently
    # discarded. When enabling it, send one (id, vector, metadata) record per
    # document, e.g.:
    #
    #     ids = [str(j) for j in range(i, i_end)]
    #     index.upsert(vectors=list(zip(ids, embeds, metadatas_batch)))
    #
    # NOTE(review): assumes every metadata value is a type Pinecone accepts and
    # that positional ids are acceptable. Confirm both before enabling.

  0%|          | 0/18978 [00:00<?, ?it/s]


NameError: name 'embed_model' is not defined

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

KeyboardInterrupt: 