In [None]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
import tiktoken
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as LangChainPinecone
from tqdm.auto import tqdm
from uuid import uuid4
import re



In [None]:
## Get env variables
# The .env file is expected one directory above the notebook's working directory.
current_directory = os.getcwd()
env_path = os.path.join(current_directory, '..', '.env')
load_dotenv(dotenv_path=env_path)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# Not referenced by the other cells visible in this notebook, so it is optional.
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')

# Fail here, with the variable names spelled out, instead of letting a None key
# surface later as an opaque authentication error from a client library.
missing_keys = [
    name
    for name, value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('PINECONE_API_KEY', PINECONE_API_KEY),
    )
    if not value
]
if missing_keys:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_keys)}. "
        f"Set them in the environment or in '{env_path}'."
    )


In [None]:
# Build the path of the PDF to ingest, relative to the working directory.
current_dir = os.path.abspath(os.getcwd())
relative_path = "../data/"  # the data folder lives in the parent directory
filename = "your_data_here.pdf"
file_path = os.path.join(current_dir, relative_path, filename)

# Echo both locations so a wrong working directory is spotted immediately.
for shown_path in (current_dir, file_path):
    print(shown_path)

In [None]:
def extract_page_data(file_path):
    """Load a PDF with ``PyMuPDFLoader`` and return one record per loaded document.

    Args:
        file_path: Path of the PDF passed to ``PyMuPDFLoader``.

    Returns:
        A list of dicts with the keys ``'text'`` (page content with every
        newline replaced by a space), ``'page'`` and ``'title'`` (taken from
        the document metadata). ``'title'`` is an empty string when the
        metadata carries no title.

    Raises:
        KeyError: If a document's metadata has no ``'page'`` entry.
    """
    docs = PyMuPDFLoader(file_path).load()

    return [
        {
            'text': doc.page_content.replace('\n', ' '),
            'page': doc.metadata['page'],
            # A PDF without a title must not abort the whole extraction;
            # `or ''` also normalises an explicit None to an empty string.
            'title': doc.metadata.get('title') or '',
        }
        for doc in docs
    ]

In [None]:
# Extract every page, then print a readable preview of each one.
data = extract_page_data(file_path)
for entry in data:
    print(f"Page: {entry['page']}")
    print(f"Title: {entry['title']}")
    # Start a new line before each bullet, then break the text on '. '
    # so each sentence-like fragment is printed on its own line.
    lines = entry['text'].replace(' ● ', '\n● ').split('. ')
    print(*lines, sep="\n")
    print("\n" + "-"*50 + "\n")

In [None]:
# create the length function
def tiktoken_len(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    ``tokenizer`` is looked up at call time, so it only has to exist before
    the first call. An empty ``disallowed_special`` tuple is passed to
    ``encode`` (in tiktoken this turns off the check that rejects text
    containing special-token strings).
    """
    return len(tokenizer.encode(text, disallowed_special=()))
  
# tokenizer setup
# Load the cl100k_base encoding directly by name; tiktoken_len reads this
# module-level `tokenizer` every time it is called.
tokenizer = tiktoken.get_encoding('cl100k_base')

# chunk_size and chunk_overlap are measured by tiktoken_len, i.e. in tokens
# rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)



In [None]:
# OpenAI embedding model shared by the indexing and the retrieval cells
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

In [None]:
from pinecone.exceptions import PineconeApiException

# Pinecone client authenticated with the key loaded from the environment
pc = Pinecone(api_key=PINECONE_API_KEY)

if not data:
    raise ValueError("No pages were extracted; cannot derive an index name.")

# Derive the index name from the first record's title: lowercase, spaces to
# hyphens, drop everything outside [a-z0-9-], then cap the length at 45.
index_name = data[0]['title'].lower().replace(' ', '-')
index_name = re.sub(r'[^a-z0-9\-]', '', index_name)[:45]
# Pinecone index names must start and end with an alphanumeric character.
# Strip after truncating, because the cut itself can leave a trailing hyphen.
index_name = index_name.strip('-')
if not index_name:
    raise ValueError(
        "Could not derive a valid Pinecone index name from the document title "
        f"{data[0]['title']!r}; set index_name manually."
    )

# list_indexes() yields index descriptions rather than plain strings, so a
# string membership test against it never matches. Compare against names().
current_indexes = pc.list_indexes().names()

if index_name not in current_indexes:
    try:
        pc.create_index(
            name=index_name,
            dimension=1536, # 1536 dim of text-embedding-ada-002
            metric="cosine", # Replace with your model metric
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    except PineconeApiException as e:
        # Tolerate an index created between the listing and this call;
        # anything else is a real failure and is re-raised.
        if "ALREADY_EXISTS" in str(e):
            print(f"Index '{index_name}' already exists.")
        else:
            raise
else:
    print(f"Index '{index_name}' already exists.")

# Access the index using the Index class
index = pc.Index(index_name)


In [None]:
def _embed_and_upsert(texts, metadatas, embed, index):
    """Embed one batch of chunks and upsert them under freshly generated UUID ids."""
    ids = [str(uuid4()) for _ in texts]
    embeds = embed.embed_documents(texts)
    # Hand over a concrete list rather than a one-shot zip iterator, so the
    # client can measure, slice or re-read the batch.
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))


def process_data_in_batches(data, text_splitter, embed, index, batch_limit=100):
    """Split each record into chunks, embed them and upsert them into ``index``.

    Args:
        data: Iterable of records, each a mapping with ``'text'``, ``'title'``
            and ``'page'`` entries.
        text_splitter: Object whose ``split_text(text)`` returns the chunks.
        embed: Object whose ``embed_documents(texts)`` returns one vector per text.
        index: Object whose ``upsert(vectors=...)`` stores
            ``(id, vector, metadata)`` tuples.
        batch_limit: Flush threshold. Chunks are buffered across records and
            sent once at least this many are pending. A record's chunks are
            never split across flushes, so a batch can exceed the limit.

    Returns:
        None. Every chunk is upserted with metadata holding its index within
        the record (``'chunk'``), its text, and the record's title and page.
    """
    texts = []
    metadatas = []

    for record in tqdm(data):
        # Chunk the record text, then attach per-chunk metadata.
        record_texts = text_splitter.split_text(record['text'])
        texts.extend(record_texts)
        metadatas.extend(
            {
                "chunk": j,
                "text": text,
                'title': record['title'],
                'page': record['page'],
            }
            for j, text in enumerate(record_texts)
        )

        if len(texts) >= batch_limit:
            _embed_and_upsert(texts, metadatas, embed, index)
            # Rebind instead of clearing, so the lists just handed to
            # embed/upsert are not mutated afterwards.
            texts, metadatas = [], []

    # Send whatever is left below the threshold.
    if texts:
        _embed_and_upsert(texts, metadatas, embed, index)
        
# Chunk, embed and upsert every extracted page into the Pinecone index.
process_data_in_batches(
    data=data,
    text_splitter=text_splitter,
    embed=embed,
    index=index,
    batch_limit=100,
)

In [None]:
# retrieving vectorstore from pinecone
# Metadata key under which each chunk's raw text was stored during upsert.
text_field = "text"
# Re-fetch the index handle so this cell can be re-run on its own.
index = pc.Index(index_name)
vectorstore = LangChainPinecone(
    index=index,
    # Pass the Embeddings object itself; handing over the bound
    # `embed.embed_query` callable is the deprecated form.
    embedding=embed,
    text_key=text_field
)

In [None]:
# query retrieval
# Chat model that writes the final answer
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain that pulls its context from the Pinecone vectorstore
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)


In [None]:
import pprint

# Put the question to the retrieval chain and pretty-print the answer text.
question = "Write your question here"
response = qa(question)
# The chain's output is indexed by key; the answer sits under 'result'.
answer = response['result']
pprint.pprint(answer)