In [1]:
import PyPDF2
import faiss
import pdfreader
from PyPDF2 import PdfReader
import pandas as pd

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS, VectorStore
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from openai.embeddings_utils import get_embedding, cosine_similarity

In [2]:
doc = PdfReader(r"creditcard1.pdf")

In [3]:
raw_text = " "
for i, page in enumerate(doc.pages):
    text = page.extract_text()
    if text:
        raw_text +=text
        
print(len(raw_text))

18933


In [4]:
text_splitter = CharacterTextSplitter(
    separator= "\n\n",
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len)

texts = text_splitter.split_text(raw_text)

In [5]:
import re

def clean_text_format(s, sep_token = " \n"):
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r". ,", " ", s)
    s = s.replace(". .", ".")
    s = s.replace("..", ".")
    s = s.replace("\n", " ")
    s = s.strip()
    return s

texts = list(map(clean_text_format, texts))

In [6]:
with open("clean_text.txt", "w") as f:
    for line in texts:
        f.write(f"{line}\n")

In [7]:
query1 = "What is billing cycle"
query2 = "What is dual branding"
embeddings = OpenAIEmbeddings()

#### Sample embeddings

In [8]:
embed1 = get_embedding("risk based pricing", engine = "text-embedding-ada-002")
embed2 = get_embedding("risk ranking applicants", engine = "text-embedding-ada-002")
cosine_similarity(embed1, embed2)

0.8340266372173663

In [9]:
chain = load_qa_chain(OpenAI(), chain_type="stuff")

#### Embeddings from FAISS

In [None]:
db_faiss = FAISS.from_texts(texts, embeddings)

In [None]:
#### Simple similarity
faiss_q1 = db_faiss.similarity_search(query1)

In [None]:
print(faiss_q1[0].page_content)

In [None]:
similar_query1 = db_faiss.similarity_search_with_score(query1)
similar_query1

In [None]:
faiss_q2 = db_faiss.similarity_search(query2)
chain.run(input_documents = faiss_q2, question = query2)

#### Embeddings from Chromadb

In [19]:
import chromadb
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [68]:
loader = TextLoader("clean_text1.txt")
documents = loader.load()

#Split into chunks
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap=0)
docs1 = text_splitter.split_documents(documents)

In [69]:
embedding_function = SentenceTransformerEmbeddings(model_name= "all-MiniLM-L6-v2", )

<b2>Sentence Transformer Embedding Performance comparison

|Model|Score|Speed|
|:----:|:----:|:----:|
|multi-qa-MiniLM-L6-cos-v1	          | 64.33 | 14200|
|paraphrase-multilingual-mpnet-base-v2| 65.83 |2500  |
|paraphrase-albert-small-v2	          |64.46|5000    |
|paraphrase-multilingual-MiniLM-L12-v2|	64.25 |7500  |

In [22]:
db_chroma = Chroma.from_documents(docs1, embedding_function)

In [23]:
chroma_q1 = db_chroma.similarity_search(query1)
print(chroma_q1[0])

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


page_content='Risk Management Examination Manual of Credit Card Activities Chapter II II. CREDIT CARDS – GENERAL OVERVIEW WHAT IS A CREDIT CARD In its non-physical form, a credit card repres ents a payment mechanism which facilitates both consumer and commercial business transactions, including purchases and cash advances . A credit card generally operates as a substitute for cash or a check and most often provides an unsecured revolving line of credit. The borrower is required to pay at least part of the card’s outstanding balance each billing cycl  depending on the terms as set forth in the cardholder agreement . As the debt reduces, the available credit increases for accounts in good standing. These complex financial arrangements have ever-shifting terms and prices. A charge card differs from a credit card in that the charge card must be paid in full each month. In physical form, a credit card traditionally is a thi  rectangular plastic card. The front of the card contains a series 

In [24]:
chroma_q2 = db_chroma.similarity_search(query2)
print(chroma_q2[0])

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


page_content='Risk Management Examination Manual of Credit Card Activities Chapter II II. CREDIT CARDS – GENERAL OVERVIEW WHAT IS A CREDIT CARD In its non-physical form, a credit card repres ents a payment mechanism which facilitates both consumer and commercial business transactions, including purchases and cash advances . A credit card generally operates as a substitute for cash or a check and most often provides an unsecured revolving line of credit. The borrower is required to pay at least part of the card’s outstanding balance each billing cycl  depending on the terms as set forth in the cardholder agreement . As the debt reduces, the available credit increases for accounts in good standing. These complex financial arrangements have ever-shifting terms and prices. A charge card differs from a credit card in that the charge card must be paid in full each month. In physical form, a credit card traditionally is a thi  rectangular plastic card. The front of the card contains a series 

In [25]:
chain.run(input_documents = chroma_q2, question = query2)

" Dual branding is a process in which Visa and MasterCard banks also offer American Express and/or Discover cards. This was enabled by court rulings in 2004 that said the Associations' policies barring member banks from contracting with American Express and Discover violated antitrust rules."

In [None]:
query_custom = "What are the benefits of risk-based pricing"
db_chroma1 = Chroma.from_documents(documents=docs1, embedding=embeddings, collection_metadata={"hnsw:space": "ip"})
chroma_cos = db_chroma1.similarity_search(query_custom)

In [None]:
try:
    docsearch = Chroma.from_documents(documents=docs1, embedding=embedding_function)
except InvalidDimensionException:
    Chroma().delete_collection()
    docsearch = Chroma.from_documents(documents=docs1, embedding=embedding_function)

#### Pinecone

##### OpenAI API key + Pinecone API
Certificate > Add certificate in Windows

python -m pip install --upgrade certifi

proxy setup check


In [58]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

In [28]:
# pinecone.init(api_key="*****", environment="***")

In [80]:
index_name = "firstdemo"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name = index_name, metric= "euclidean", dimension = 1536)

In [81]:
db_pinecone = Pinecone.from_documents(docs1, embeddings, index_name = "firstdemo")
# pc_q2 = db_pinecone.similarity_search(query2)
# chain.run(input_documents = pc_q2, question = query2)

#### Max Marginal relevance
**Marginal relevance** = relevance of the document to the query - similarity of the document to the documents that have already been retrieved

**Relevance of doc to query** = similarity metric. E.g., Cosine similarity

**Similarity of a document to the documents that have already been retrieved** = Distance metric. E.g., Euclidean distance

Denoted by **λ** Ranges between 0 and 1: 
λ close to 1 Emphasis on relevance, λ close to 0 emphasis on diversity.

#### Pros & cons
Pros:
Trade off between  relevance and diversity.
Relatively simple to implement
Proven effective in a variety of retrieval tasks

Cons:
Computationally expensive to calculate the marginal relevance of all documents
Sensitive to the choice of the similarity metric and the distance metric.

In [82]:
### Search for MMR
"""
    k = No of docs to return, defaults 4
    fetch_k = No of docs to be passed for MMR, defaults 20
"""

mmr_docs = db_pinecone.max_marginal_relevance_search(query2, k=2, fetch_k=10) 
for i, doc in enumerate(mmr_docs):
    print(f"{i + 1}.", doc.page_content, "\n")

In [83]:
query_mmr = "What is biometric format"
chain.run(input_documents = mmr_docs, question = query_mmr) ###Max Marginal Relevance

'\n\nBiometric format is a type of authentication system that uses physical characteristics, such as fingerprints, eye scans, or facial recognition, to verify the identity of a user.'

In [84]:
pc_q2 = db_pinecone.similarity_search(query_mmr)
chain.run(input_documents = pc_q2, question = query_mmr) ###Cosine similarity

" Biometric format is a type of identification technique that relies on a cardholder's physical or biological features, such as fingerprints, iris scans, or voice scans."

### Appendix

#### Cosine similarity

In [None]:
import math

def euclidean_distance(x1, y1, x2, y2):
    return math.sqrt(((x2 - x1) ** 2) - ((y2 - y1) ** 2))

euclidean_distance(5,4,30,25)

#### Euclidean distance

In [None]:
def cosine_similarity(vector1, vector2):
    sum_of_products = 0
    magnitude1 = 0
    magnitude2 = 0

    for i in range(len(vector1)):
        sum_of_products += vector1[i] * vector2[i]
        magnitude1 += vector1[i] ** 2
        magnitude2 += vector2[i] ** 2

    if magnitude1 == 0 or magnitude2 == 0:
        return 0

    return sum_of_products / (math.sqrt(magnitude1) * math.sqrt(magnitude2))

In [None]:
u = [1, 2, 4]
v = [2, 3, 20]
cosine_similarity(u, v)