In [None]:
!pip3 install  -r requirements.txt

In [None]:
from dotenv import dotenv_values
import openai, os
import numpy as np
from numpy.linalg import norm

# Load the key/value pairs declared in the local .env file.
secrets = dotenv_values(dotenv_path=".env")

In [None]:
# Export the OpenAI key for the libraries used below. Prefer the value from
# the .env file, fall back to a key already present in the environment, and
# fail with an explicit message instead of a bare KeyError/TypeError when the
# key is missing or empty.
openai_api_key = secrets.get('OPENAI_API_KEY') or os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    raise KeyError('OPENAI_API_KEY is not defined in .env or in the environment')
os.environ['OPENAI_API_KEY'] = openai_api_key

# Indexing Data

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Location of the PDF to index. It can be overridden through the
# RUST_ESSENTIALS_PDF environment variable so the notebook is not tied to a
# single machine; the original location remains the default.
file_path = os.environ.get(
    'RUST_ESSENTIALS_PDF',
    '/Users/rayanaay/Desktop/projects/langchain/summarizing_project/mix_data/rust_essentials.pdf'
)

loader = PyPDFLoader(file_path=file_path)

# Splitter configured with chunk_size=500 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0
)

# `data` is the list of split documents that later cells index.
data = loader.load_and_split(text_splitter=text_splitter)

# Preview the first few chunks instead of dumping the whole list.
data[:3]

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client shared by the following cells (progress bar enabled).
embeddings = OpenAIEmbeddings(show_progress_bar=True)

# Embed one sample sentence and inspect how many components the vector has.
vector1 = embeddings.embed_query(text='what do you think about yourself?')

len(vector1)

The embedded vector has a length of 1536.

In [None]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two vectors.

    The result is the dot product of ``vec1`` and ``vec2`` divided by the
    product of their norms as computed by ``numpy.linalg.norm``.
    """
    dot_product = np.dot(vec1, vec2)
    magnitude = norm(vec1) * norm(vec2)
    return dot_product / magnitude
    
# Embed two programming-language names and compare them.
vector1, vector2 = (
    embeddings.embed_query(term) for term in ('Rustlang', 'C++')
)
cosine = get_cosine(vector1, vector2)
cosine

In [None]:
# Compare the 'Rustlang' embedding (vector1, computed in the previous cell)
# with the embedding of a third word.
vector3 = embeddings.embed_query(text='kimora')
cosine = get_cosine(vec1=vector1, vec2=vector3)
cosine

Interestingly, even though the third word ('kimora') is unrelated to programming languages, its cosine similarity with 'Rustlang' is still high, though lower than that of the second word ('C++').

# FAISS (Vector Database)

FAISS is a library for efficient similarity search and clustering of dense vectors.

In [None]:
from langchain.vectorstores import FAISS

# Build the FAISS vector store from the split documents and the embedding client.
index = FAISS.from_documents(documents=data, embedding=embeddings)

In [None]:
# Query the store for the chunks most relevant to a single keyword,
# returned together with their relevance scores.
index.similarity_search_with_relevance_scores(query="Concurrency")

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

# Retrieve with maximal marginal relevance (MMR): fetch 20 candidates and keep
# the 10 selected by MMR. MMR is chosen through `search_type`; a
# `maximal_marginal_relevance` entry in `search_kwargs` is not an option the
# FAISS retriever acts on, so setting it there leaves plain similarity search
# in effect.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 10, 'fetch_k': 20},
)

llm = ChatOpenAI()

# Question-answering chain that answers from the retrieved chunks.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

# Stream the chain's intermediate steps to standard output.
handler = StdOutCallbackHandler()

chain.run(
    'What is concurrency in Rust ?',
    callbacks=[handler]
)