# Create VectorDB

**TABLE OF CONTENTS**
1. Imports
2. Loading the Documents
3. Load the embedding
4. Create the vector DBs
5. Save

## 1. Imports

In [1]:
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np
import pickle
from transformers import AutoModel  # needed for the following workaround: https://github.com/langchain-ai/langchain/issues/6080

  from .autonotebook import tqdm as notebook_tqdm


## 2. Loading the Documents

In [2]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Args:
        path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point this at pickle files that come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


# loading the document chunks of size 700
# (the "700_200" suffix presumably encodes chunk size and overlap - TODO confirm)

# with register
docs_ed1_700_with_register = _load_pickle(
    "../data/pickles/ed1_docs_with_register_700_200.pickle"
)

# without register
docs_ed1_700_without_register = _load_pickle(
    "../data/pickles/ed1_docs_without_register_700_200.pickle"
)


# loading the document chunks of size 1500

# with register
docs_ed1_1500_with_register = _load_pickle(
    "../data/pickles/ed1_docs_with_register_1500_400.pickle"
)

# without register
docs_ed1_1500_without_register = _load_pickle(
    "../data/pickles/ed1_docs_without_register_1500_400.pickle"
)

In [2]:
# EDIT: cell added after hyperparameter selection; it loads the pickle that
# also features the 12th edition.
# NOTE(review): pickle.load must only be used on trusted files.
ed1_ed12_pickle_path = "../data/pickles/ed1_ed12_docs.pickle"
with open(ed1_ed12_pickle_path, 'rb') as pickle_file:
    docs_ed1_ed12 = pickle.load(pickle_file)

## 3. Load the embedding

In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# The access token is read from the HF_TOKEN environment variable so that no
# secret has to be written into the notebook; when the variable is unset (or
# empty) the token is requested interactively without echoing it.
# Previously `hf_logging_token` was never defined, so this cell raised a
# NameError on a fresh kernel.
hf_logging_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_logging_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/Onema/.cache/huggingface/token
Login successful


In [4]:
# Embedding model identifier and the keyword arguments passed on to it.
model_name = "jinaai/jina-embeddings-v2-base-de"
model_kwargs = {'device': 'cpu'}

# Load the model through transformers with trust_remote_code=True first; the
# AutoModel import exists for this workaround (langchain issue #6080).
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Embedding wrapper used to build every vector store below.
hf_jina = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  return self.fget.__get__(instance, owner)()


## 4. Create the vector DBs

In [5]:
# Build a FAISS vector store from the size-700 chunks (with register),
# embedding them with hf_jina.
vectordb_jina = FAISS.from_documents(
    documents=docs_ed1_700_with_register,
    embedding=hf_jina,
)

In [6]:
# Alias under a more descriptive name. This is NOT a copy: plain assignment
# binds a second name to the same object returned by FAISS.from_documents,
# so a change made through either name is visible through the other.
vectordb_700_with_reg = vectordb_jina

In [7]:
# Build a FAISS vector store from the size-1500 chunks (with register),
# embedding them with hf_jina.
vectordb_1500_with_reg = FAISS.from_documents(
    documents=docs_ed1_1500_with_register,
    embedding=hf_jina,
)

In [11]:
# Build a FAISS vector store from the size-700 chunks (without register),
# embedding them with hf_jina.
vectordb_700_without_reg = FAISS.from_documents(
    documents=docs_ed1_700_without_register,
    embedding=hf_jina,
)

In [12]:
# Build a FAISS vector store from the size-1500 chunks (without register),
# embedding them with hf_jina.
vectordb_1500_without_reg = FAISS.from_documents(
    documents=docs_ed1_1500_without_register,
    embedding=hf_jina,
)

In [5]:
# EDIT: cell added after hyperparameter selection; it builds the FAISS vector
# store from the documents that also feature the 12th edition.
vectordb_ed1_ed12 = FAISS.from_documents(
    documents=docs_ed1_ed12,
    embedding=hf_jina,
)

## 5. Save

In [6]:
# Base directory (relative path) under which each vector store is written
# with save_local; every store gets its own sub-folder name appended.
dir_path = "../data/vectorDB"

In [14]:
# Write each of the four vector stores to its own sub-folder of dir_path.
vectordb_700_with_reg.save_local("/".join((dir_path, "faiss_vecDB_700_reg")))
vectordb_1500_with_reg.save_local("/".join((dir_path, "faiss_vecDB_1500_reg")))
vectordb_700_without_reg.save_local("/".join((dir_path, "faiss_vecDB_700_without_reg")))
vectordb_1500_without_reg.save_local("/".join((dir_path, "faiss_vecDB_1500_without_reg")))

In [7]:
# EDIT: cell added after hyperparameter selection; it saves the vector store
# that also features the 12th edition to its own sub-folder of dir_path.
vectordb_ed1_ed12.save_local("/".join((dir_path, "faiss_vecDB_ed1_ed12")))