<a href="https://colab.research.google.com/github/ObjectMatrix/google-colab-notebook/blob/main/embeddings_pdf_w_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title 🌿 Keys
import os
import json

drive_mount_path = '/content/drive'

# Check if the drive is already mounted
if not os.path.ismount(drive_mount_path):
    from google.colab import drive
    drive.mount(drive_mount_path)
else:
    print("Drive is already mounted.")
with open('/content/drive/MyDrive/secrets.json', 'r') as f:
  secrets = json.load(f)
KEY=secrets['SECRET_KEY']
os.environ["OPENAI_API_KEY"] = KEY

Drive is already mounted.


In [4]:
# Requirement
!pip install openai -q
!pip install langchain -q
!pip install chromadb -q
!pip install tiktoken -q
!pip install pypdf -q
!pip install unstructured[local-inference] -q
!pip install gradio -q
!pip install fastcore -U

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature=0,model_name="gpt-4")

# Data Ingestion
from langchain.document_loaders import DirectoryLoader
pdf_loader = DirectoryLoader('/content/drive/MyDrive/pdfs/', glob="**/*.pdf")
excel_loader = DirectoryLoader('/content/drive/MyDrive/pdfs/', glob="**/*.txt")
word_loader = DirectoryLoader('/content/drive/MyDrive/pdfs/', glob="**/*.docx")
loaders = [pdf_loader, excel_loader, word_loader]
documents = []
for loader in loaders:
    documents.extend(loader.load())

# Chunk and Embeddings
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)

# Initialise Langchain - Conversation Retrieval Chain
qa = ConversationalRetrievalChain.from_llm(ChatOpenAI(temperature=0), vectorstore.as_retriever())


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




In [7]:
chat_history = []
user_message = "get the probability the attacker could still catch up now,"
response = qa({"question": user_message, "chat_history": chat_history})
print(response)

{'question': 'get the probability the attacker could still catch up now,', 'chat_history': [], 'answer': 'I would need to know the values of q and z to calculate the probability using the formula provided.'}


In [None]:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

# 🌿 exploring some idea with **Towhee**

In [None]:
!pip install towhee

In [None]:
from towhee import AutoPipes

In [None]:
p = AutoPipes.pipeline('sentence_embedding')
output = p('Hello World.').get()
print(output)

In [None]:
from towhee import pipe, ops

img_embedding = (
    pipe.input('url')
        .map('url', 'img', ops.image_decode.cv2())
        .map('img', 'embedding', ops.image_embedding.timm(model_name='resnet50'))
        .output('embedding')
)

url = 'https://github.com/towhee-io/towhee/raw/main/towhee_logo.png'
res = img_embedding(url).get()

In [None]:
print(res)

In [None]:
quit()