## Loading a PDF file

In [7]:
import os
from dotenv import load_dotenv, find_dotenv
# Populate os.environ from a .env file located by find_dotenv(); override=True
# lets values from that file replace variables already set in the process.
# The Pinecone helpers in this notebook read PINECONE_API_KEY and PINECONE_ENV
# from the environment.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
def load_document(file):
    """Load a PDF or DOCX file through the matching LangChain loader.

    The loader is chosen from the file extension, which is compared
    case-insensitively so names such as ``REPORT.PDF`` are accepted too.

    Args:
        file: Path of the document to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a message) when the extension is neither ``.pdf`` nor
        ``.docx``.
    """
    import os

    # Only the extension is needed; lower-case it so '.PDF' / '.Docx' match.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        print('Not supported format!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

# Wikipedia Loader
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia content for ``query`` via LangChain's WikipediaLoader.

    Args:
        query: Search text handed to the loader.
        lang: Language code forwarded to the loader (``'en'`` by default).
        load_max_docs: Forwarded to the loader as its ``load_max_docs`` limit.

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

In [None]:
# Test load pdf data
# documento = load_document('docs/CLT.pdf')
# documento[100].page_content

In [None]:
# Test load data from wikipedia
# data = load_from_wikipedia('GPT-4')
# print(data[0].page_content)

In [4]:
def chunk_data(data, chunk_size=1000):
    """Split loaded documents into chunks with no overlap between them.

    Args:
        data: Documents to split, as returned by the loaders in this notebook.
        chunk_size: Passed to the splitter as its ``chunk_size``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': 0}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(data)

In [5]:
def embedding_cost(texts):
    """Print the total token count of ``texts`` and an estimated cost in USD.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-ada-002``; the estimate charges 0.0001 USD per 1000
    tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')

    token_count = 0
    for doc in texts:
        token_count += len(encoder.encode(doc.page_content))

    print(f'Total tokens: {token_count}')
    cost_usd = token_count / 1000 * 0.0001
    print(f'Cost Embedding (USD): {cost_usd:.6f}')

In [None]:
import os
# Resolve the project folder inside the current user's OneDrive directory.
project_dir = os.path.join(
    os.path.expanduser("~"),
    "OneDrive",
    "Project_Code",
    "Project-LinuxTips-LLM_2025",
)
# The PDF to index lives in the project's docs/ sub-folder.
file_path = os.path.join(project_dir, "docs", "l-2.pdf")

# Load the document, then split it into chunks for embedding.
data = load_document(file_path)
chunks = chunk_data(data)

In [None]:
# Quick sanity checks on the loaded document.
# NOTE(review): in a notebook cell only the last expression is echoed, so the
# first value is presumably never shown; index 100 also assumes `data` holds
# more than 100 entries — confirm for the file being loaded.
len(data)                    # check number of pages
len(data[100].page_content)  # count how many characters

In [None]:
# Test print chunks
# print(chunks[100].page_content)
# print(len(chunks))

In [None]:
# Print the token count and the estimated USD cost of embedding every chunk.
embedding_cost(chunks)

## Sending data to Pinecone

In [8]:
def insert_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for ``index_name``, creating it if absent.

    When the index already exists it is opened as-is. Otherwise a new index is
    created and filled with the embeddings of ``documents``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        documents: Documents to embed when the index has to be created. When
            omitted, the module-level ``chunks`` variable is used, which keeps
            existing ``insert_embeddings(index_name)`` calls working.

    Returns:
        The LangChain ``Pinecone`` vector store bound to the index.

    Raises:
        NameError: If the index must be created, ``documents`` is omitted and
            no module-level ``chunks`` exists.
    """
    # Imported locally so the function does not rely on a notebook-level import.
    import os

    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name}')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # Resolve the documents before touching Pinecone so a missing `chunks`
        # fails fast instead of leaving an empty index behind.
        if documents is None:
            documents = chunks
        print(f'Creating index {index_name}')
        # NOTE(review): 1536 presumably matches the vector size of the default
        # OpenAIEmbeddings model — confirm if the embedding model changes.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(documents, embeddings, index_name=index_name)
        print('Ok')
    return vector_store

In [9]:
def delete_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value ``'all'``
            deletes every index returned by ``pinecone.list_indexes()``.
    """
    import pinecone

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    # Single-index case first, so the bulk path reads without nesting.
    if index_name != 'all':
        print(f'Deleting index {index_name}...')
        pinecone.delete_index(index_name)
        return

    targets = pinecone.list_indexes()
    print('Deleting all indexes...')
    for target in targets:
        pinecone.delete_index(target)

In [None]:
# Test deleting an existing index
# delete_index()

In [None]:
# Open the 'linuxtips' index (creating and filling it if it does not exist)
# and keep the resulting vector store for the question-answering cells.
index_name = 'linuxtips'
vector_store = insert_embeddings(index_name)

## Asking questions and getting answers

In [None]:
def get_answer(vector_store, q):
    """Answer ``q`` with a RetrievalQA chain backed by ``vector_store``.

    The chain uses ``gpt-3.5-turbo`` at temperature 1 and a similarity
    retriever configured with ``k=3``, combined with the 'stuff' chain type.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        q: The question text.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type='stuff',
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': 3},
        ),
    )
    return qa_chain.run(q)

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a conversational retrieval chain with history.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            A fresh list is started when omitted; a list passed in is extended
            in place with the new turn.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result mapping (its
        ``'answer'`` entry holds the reply) and the updated history.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be created once and shared by every call that omits
    # the argument, leaking history between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

In [None]:
# One-off question against the indexed document; prints the chain's answer.
q = 'when was the labour laws created?'
answer = get_answer(vector_store, q)
print(answer)

In [None]:
import time

# Interactive question loop: keep asking until the user types "exit".
i = 1
print('Type exit to finish.')
while True:
    q = input(f'Question: #{i}: ')
    i += 1
    # Ignore surrounding whitespace and letter case when checking for "exit".
    if q.strip().lower() == 'exit':
        print('Finishing...')
        time.sleep(2)
        break

    answer = get_answer(vector_store, q)
    # '\n' replaces the invalid '\A' escape, which printed a literal backslash
    # and triggers an invalid-escape warning on recent Python versions.
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')