# Data preparation for developing vector database

In [1]:
!pip install langchain pypdf --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.9/277.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Importing necessary dependencies

In [2]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
from transformers import pipeline
import torch
from langchain.document_loaders import PyPDFLoader
from tqdm import tqdm
import re

# Loading and cleaning papers

In [4]:
def clean_dataset(entire_text):
    text = entire_text.page_content
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.compile(r'(\w+)\s*-\s*(\w+)').sub(lambda match: match.group(1) + match.group(2), text)
    text = re.sub(r'^Investors’ Handbook\s*(?:\d+|[IVXLCDM]+)\s*', '', text, flags=re.IGNORECASE)
    entire_text.page_content = text
    return entire_text

In [5]:
%%time


# Loading the pdf of NEPSE booklet
pdf_name = "NEPSE_booklet.pdf"
loader = PyPDFLoader(pdf_name)
pages = loader.load_and_split()

CPU times: user 2.72 s, sys: 17.2 ms, total: 2.74 s
Wall time: 3.68 s


In [6]:
# these pages don't contain any relevant information

useful_pages = [3, 4, 5]
start_page = 12
end_page = -3

page_numbers = list(range(12, 125))
[page_numbers.append(i) for i in useful_pages];
page_numbers = sorted(page_numbers)
pages1 = [pages[i] for i in page_numbers]
pages1 = [clean_dataset(i.copy()) for i in tqdm(pages1, desc="Cleaning Pages")]

Cleaning Pages: 100%|██████████| 116/116 [00:00<00:00, 657.66it/s]


In [7]:
len(pages1)

116

# Splitting text

In [10]:
from langchain.text_splitter import CharacterTextSplitter

In [11]:
'''
This is very important. The two parameters, chunk_size and chunk_overlap, are extremely important in terms of retreival. 
There is no hard and fast rule for setting the values for these parameters but I found the below value okay for this dataset.

chunk_size -> The size of each chunk.(maximum number of character each chunk can contain).
chunk_overlap -> number of characters being overlapped over each chunk_size.
'''

text_splitter = CharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap = 150,
    separator="."
)

docs = text_splitter.split_documents(pages1)



In [12]:
len(docs)

349

In [13]:
print(docs[0].page_content)

The Nepalese securities markets is being modernised due to some structural changes in the recent years. The fullfledged dematerialised transaction of securities, the introduction of ASBA, CASBA and Meroshare system in the primary market enabling the applicants from 77 districts to access the service through more than 2500 BFIs as service providers, branch expansion of merchant bankers and stockbrokers outside of Kathmandu valley and adoption of online trading system have made Nepalese securities markets technofriendly, investment friendly and countrywide resulting increased attraction of public towards the securities markets in recent days. Low level of participation of institutional investors in the markets, lack of diversified instruments and low level of understanding and awareness in securities markets continues to be a cause of concern


In [14]:
print(docs[1].page_content)

Investors and even students tend to use thumb rules or seek advice from friends, market intermediaries and relatives, which are often poor approximations compared to those that follow from a scientific analysis. They will tend to make bad choices, contribute insufficiently, begin saving late, stay away from modern finance, or fall prey to fraud or misselling of financial instruments. If they get bad advice, the outcomes will be poor, and they will lose faith in the market system. Taking note of aforementioned facts, Securities Board of Nepal (SEBON) has been focusing on the investor education and awareness building programme Editorial in securities markets and commodity derivatives markets in order to empower and protect the interest of general investors at large. As a continuation of this, World Investor Week (WIW) 2020 Organising Committee is pleased to bring out “Investors’ Handbook on Securities Markets and Commodity Derivatives Markets”


# Sentence transformer

In [15]:
!pip install sentence-transformers --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [16]:
from langchain.embeddings import HuggingFaceEmbeddings

<b> I have utilized an open-source sentence transformer from huggingface to generate embeddings.

In [2]:
# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
sentence_transformer = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-mpnet-base-v2',# 'sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device':'cuda'},
)

# FAISS Vector Database

FAISS, also known as Facebook AI Similarity Search, is a vector database that can be used to store embeddings and perform quick search to find similar embeddings or relevant documents.

I will utilize langchain to implement FAISS vector store.

In [6]:
!pip install faiss-gpu 

In [20]:
from langchain.vectorstores import FAISS

In [30]:
%%time

# creating and storing embeddings generated via sentence_transformer in the FAISS vector store
vector_db = FAISS.from_documents(docs, sentence_transformer)

CPU times: user 7.16 s, sys: 209 ms, total: 7.37 s
Wall time: 8.46 s


In [31]:
# query = 'Thank you for helping me out'

# searchDocs = vector_db.similarity_search(query, k = 1)

In [33]:
vector_db.save_local("vector_db_NEPSE_GPU")

In [34]:
docsearch = FAISS.load_local("vector_db_NEPSE_GPU", sentence_transformer)

In [36]:
!pip show faiss-gpu

Name: faiss-gpu
Version: 1.7.2
Summary: A library for efficient similarity search and clustering of dense vectors.
Home-page: https://github.com/kyamagu/faiss-wheels
Author: Kota Yamaguchi
Author-email: KotaYamaguchi1984@gmail.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 
