# Load Uber and Lyft 10-K PDF files into a vector store
## Using LlamaIndex


>For Google Colab only

>>git clone https://github.com/OperationalizingAI/Hackathon-2-22-24.git

In [None]:
# Install LlamaIndex and its MongoDB vector-store integration into the kernel's environment.
%pip install llama_index llama-index-vector-stores-mongodb


In [None]:
import pymongo
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader

### Google Only Code

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id, version="latest"):
  """Read one secret from Google Cloud Secret Manager and return it as text.

  Args:
    secrets_name: Name of the secret inside the project.
    project_id: Google Cloud project that owns the secret.
    version: Secret version to read. Defaults to "latest", which is the
      version this function always used before the parameter existed.

  Returns:
    The secret payload decoded as a UTF-8 string.
  """
  # Authenticate the Colab user before the Secret Manager client is created.
  auth.authenticate_user()
  client = secretmanager.SecretManagerServiceClient()
  # Fully-qualified resource path of the requested secret version.
  resource_name = f"projects/{project_id}/secrets/{secrets_name}/versions/{version}"
  response = client.access_secret_version(request={"name": resource_name})
  return response.payload.data.decode('UTF-8')

In [None]:
# Google Cloud project passed to load_secrets for every secret read below.
project_id = 'botchagalupep1'
openai_api_key = load_secrets("openai_api_key",project_id)
# Expose the key through the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = openai_api_key
# Alternative secret name for the cluster URI, kept for reference:
#MONGODB_ATLAS_CLUSTER_URI = load_secrets("mdb_uri",project_id)
MONGODB_ATLAS_CLUSTER_URI = load_secrets("MDB_CLUSTER0_URI",project_id)
# NOTE(review): langsmith_api_key is not referenced by any other visible cell — confirm it is still needed.
langsmith_api_key = load_secrets("langsmith_api_key",project_id)
# Debug only: uncommenting these would write the secrets into the notebook output.
#print(langsmith_api_key )
#print(MONGODB_ATLAS_CLUSTER_URI)

In [None]:
!mkdir -p 'data/10k/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'

In [None]:
# Our variables

DB_NAME = "rag1"                # database selected on the MongoDB client
COLLECTION_NAME = "10k"         # collection inside that database
INDEX_NAME = "idx_embedding"    # index name handed to the vector store

In [None]:
mongodb_client = pymongo.MongoClient(MONGODB_ATLAS_CLUSTER_URI)

# MongoClient does not connect when it is constructed, so issue a cheap
# round-trip here; a bad URI or unreachable cluster then fails in this cell
# instead of after the success message has been printed.
mongodb_client.admin.command("ping")

print ("Atlas client initialized")

Atlas client initialized


In [None]:
# Handles to the target database and collection.
database = mongodb_client[DB_NAME]
collection = database[COLLECTION_NAME]

# Report how many documents exist, then remove all of them (empty filter
# matches every document) so the load below starts from an empty collection.
doc_count = collection.count_documents({})
print(f"Document count before delete : {doc_count:,}")

result = collection.delete_many({})
print(f"Deleted docs : {result.deleted_count}")

Document count before delete : 0
Deleted docs : 0


In [None]:
%%time

data_dir = 'data/10k/'

# Read everything found in the directory. To load a single filing instead,
# pass input_files=["./data/10k/uber_2021.pdf"] in place of input_dir.
docs = SimpleDirectoryReader(input_dir=data_dir).load_data()

print(f"Loaded {len(docs)} chunks from '{data_dir}'")

Loaded 307 chunks from 'data/10k/'
CPU times: user 14.5 s, sys: 72.4 ms, total: 14.6 s
Wall time: 15.5 s


In [None]:
# Vector store on top of the MongoDB collection. The column-name options are
# left at their defaults:
#   embedding_key='embedding', text_key='text', metadata_key='metadata'
vector_store = MongoDBAtlasVectorSearch(
    mongodb_client=mongodb_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name=INDEX_NAME,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the index from the loaded documents, using the storage context that
# wraps the MongoDB vector store.
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)

In [None]:
from IPython.display import Markdown
from pprint import pprint

In [None]:
# Ask the question, print the answer followed by a blank line, then dump the
# full response object.
response = index.as_query_engine().query("What was Uber's revenue?")
print(response, end="\n\n")
pprint(response, indent=4)

In [None]:
%%time

# Print the answer followed by a blank line, then dump the full response object.
response = index.as_query_engine().query("How much money did Lyft make in 2020?")
print(response, end="\n\n")
pprint(response, indent=4)

In [None]:
%%time

## The Lyft_10k filing does not contain the answer to this question,
## so this shows what the query engine returns in that case.
response = index.as_query_engine().query("What was Lyft's revenue for 2018?")
print(response, end="\n\n")
pprint(response, indent=4)

In [None]:
%%time

# Print the answer followed by a blank line, then dump the full response object.
response = index.as_query_engine().query("When did Uber go IPO?")
print(response, end="\n\n")
pprint(response, indent=4)

In [None]:
%%time

# Print the answer followed by a blank line, then dump the full response object.
response = index.as_query_engine().query("What were the Stock-based compensation for Lyft?")
print(response, end="\n\n")
pprint(response, indent=4)