In [90]:
import openai
import langchain

import os
from dotenv import load_dotenv
from pinecone import Pinecone as PNCone
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.chains.question_answering.chain import load_qa_chain
from langchain.docstore.document import Document

In [41]:
# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()

# Variables this notebook expects to be set. OPENAI_API_KEY and PINECONE_API_KEY
# are read explicitly further down; the LANGCHAIN_* values are presumably
# consumed by LangChain itself (tracing) — confirm if they are truly required.
REQUIRED_ENV_VARS = (
    "OPENAI_API_KEY",
    "LANGCHAIN_API_KEY",
    "LANGCHAIN_PROJECT",
    "PINECONE_API_KEY",
)

# Copying os.getenv(name) back into os.environ changes nothing when the
# variable exists and raises an opaque TypeError when it does not, so check
# explicitly and report exactly which names are missing.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )



In [19]:
# Load the PDF files found in the relative "assets/" directory into a list of
# Document objects.
file_loader = PyPDFDirectoryLoader("assets/")
documents = file_loader.load()


# Number of Document objects that were loaded (displayed as the cell output).
len(documents)

44

In [20]:
# Break the loaded documents into overlapping chunks before embedding them:
# at most 1000 units per chunk, with 100 units shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)

# Display the resulting chunks.
docs

[Document(metadata={'source': 'assets/final_year_project.pdf', 'page': 0, 'page_label': '1'}, page_content='SAVITRIBAI PHULE PUNE UNIVERSITY\nA PROJECT REPORT ON\nTrust Based Carbon Offsetting using IOT\nand Blockchain for a Decentralized carbon\nEconomy\nSUBMITTED TOWARDS THE PARTIAL FULFILLMENT OF THE\nREQUIREMENTS OF\nBACHELOR OF ENGINEERING (Computer\nEngineering)\nBY\nSwapnil Shinde Roll No: B5539\nAnkit Kokane Roll No: B5479\nShivraj Sakunde Roll No: B5527\nShruti Sharma Roll No: B5534\nUnder The Guidance of\nProf. S. P. Bendale\nDEPARTMENT OF COMPUTER ENGINEERING\nNBN Sinhgad School Of Engineering\n1'),
 Document(metadata={'source': 'assets/final_year_project.pdf', 'page': 1, 'page_label': '2'}, page_content='NBN SINHGAD SCHOOL OF ENGINEERING\nDEPARTMENT OF COMPUTER ENGINEERING\nCERTIFICATE\nThis is to certify that the Project Entitled\nTrust Based Carbon Offsetting using IOT and Blockchain for a\nDecentralized Carbon Economy\nSubmitted by\nSwapnil Shinde Roll No: B5539\nAnkit K

In [21]:
# Build the OpenAI embedding client, passing the API key taken from the
# process environment.
embeddings = OpenAIEmbeddings(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Display the client configuration.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x12a7d2540>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x12af557c0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [23]:
# Smoke test: embed one short string and inspect the length of the result.
# The recorded output is 1536, the same value as the 'dimension' reported by
# the Pinecone index stats in this notebook.
vectors = embeddings.embed_query("Hello, world!")
len(vectors)

1536

In [58]:
# Vector store connection: create a Pinecone client with the API key from the
# environment (raises KeyError if PINECONE_API_KEY is unset) and get a handle
# to the index named "langchainvector".
pc = PNCone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("langchainvector")
# Display the index statistics (dimension, namespaces, total vector count).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [65]:
# Embed the chunks and write them to the Pinecone index in slices of 100.
# Each record id is derived from the chunk's position in `docs`
# ("vec_<position>").
for i in range(0, len(docs), 100):
    batch = docs[i:i+100]

    # One embedding per chunk text, in the same order as `batch`.
    texts = [doc.page_content for doc in batch]
    embeddings_batch = embeddings.embed_documents(texts)

    # Pinecone record: id, embedding values, and metadata holding the chunk
    # text plus its page and source (both stored as strings, "" when absent).
    vectors = [
        {
            "id": f"vec_{i+j}",
            "values": embeddings_batch[j],
            "metadata": {
                "text": doc.page_content,
                "page": str(doc.metadata.get("page", "")),
                "source": str(doc.metadata.get("source", "")),
            },
        }
        for j, doc in enumerate(batch)
    ]

    index.upsert(vectors=vectors)

In [94]:
# Chat model used to generate the answers.
# NOTE(review): temperature is non-zero, so answers presumably vary between runs — confirm this is intended.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
# Question-answering chain that is later called with the retrieved documents and a question.
# NOTE(review): per LangChain docs, chain_type="stuff" places all input documents
# into a single prompt — confirm for the installed version.
chain = load_qa_chain(llm, chain_type="stuff")

In [96]:
# Retrieval + answer: embed the question, fetch the 5 closest records from the
# Pinecone index, and pass their stored text to the QA chain as context.
question = "Who are the writers of this paper?"
query_embedding = embeddings.embed_query(question)
matching_results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True
)

# Keep the retrieved chunks under their own name. Rebinding `docs` here would
# replace the split PDF chunks that the ingestion loop iterates over, so
# re-running that loop afterwards would upsert these few matches instead of
# the real corpus.
context_docs = [
    Document(
        page_content=match['metadata']['text'],
        metadata={"score": match['score']}
    )
    for match in matching_results['matches']
]

# invoke() replaces the deprecated Chain.run(); for this chain the generated
# answer is returned under the "output_text" key.
chain.invoke({"input_documents": context_docs, "question": question})["output_text"]

'The writers of the paper titled "Trust Based Carbon Offsetting using IOT and Blockchain for a Decentralized Carbon Economy" are Swapnil Shinde, Ankit Kokane, Shivraj Sakunde, and Shruti Sharma.'