In [1]:
!pip install -q cassio datasets langchain openai tiktoken

In [1]:
# Langchain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

In [4]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.8 (from langchain-community)
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain-community)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [2]:
# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
from PyPDF2 import PdfReader

In [5]:
import os


def secret_from_env(name: str, default: str = "") -> str:
    """Return environment variable ``name`` with surrounding whitespace removed.

    Args:
        name: Name of the environment variable to read.
        default: Value used when the variable is not set.

    Returns:
        The stripped value of the variable, or the stripped ``default`` when
        the variable is unset.
    """
    return os.environ.get(name, default).strip()


# Credentials are taken from the environment rather than typed into the
# notebook, so they are never saved with the file. Each value falls back to
# "" when its variable is not set; export the three variables (named exactly
# like the Python constants) before starting the kernel.
ASTRA_DB_APPLICATION_TOKEN = secret_from_env("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = secret_from_env("ASTRA_DB_ID")
OPENAI_API_KEY = secret_from_env("OPENAI_API_KEY")

In [6]:
# Open the guide PDF. The path is relative, so the file must be present in
# the kernel's current working directory.
pdfreader = PdfReader("SF_Guide.pdf")

In [7]:
# NOTE(review): Concatenate is not used by any visible cell; the import is
# kept in case another part of the notebook relies on it.
from typing_extensions import Concatenate

# Read the text of every page and concatenate it into one string. A page
# whose extract_text() result is empty or None contributes nothing.
raw_text = "".join(page.extract_text() or "" for page in pdfreader.pages)

In [8]:
# Display the extracted text to sanity-check the PDF parsing.
# NOTE(review): this shows the entire string; consider a slice such as
# raw_text[:500] if a shorter preview is enough.
raw_text

'SAN FRANCISCO GUIDE\nSAN FRANCISCO GUIDE\n2\nMoney 4\nCommunication 5\nHolidays 6\nTransportation 7\nFood 9\nEvents During The Year 10\nThings to do 11\nDOs and DO NOTs 12\nActivities 17\n.\nEmergency Contacts\n911: emergencynumber\nSan Francisco Police Department: +1 415\n5538090\nEssential Information\nSAN FRANCISCO GUIDE\n3\nThere are many reasons why tourists from all\novertheworldvisitSanFrancisco. Itsbeautiful\nVictorianarchitectureandproximitytothePa-\ncificOceanarejustacoupleofthem. Thecityis\ndividedintoelevendistricts,eachofthemwith\ntheirownuniqueatmosphere, andeverybody\nwillfindtheirfavoriteneighborhood!\nSanFranciscoisanidealplacetovisitanytime;\nitsmildwintersandwarmdrysummersattract\ntourists throughout the year. There is a lot of\ntosee –TheGoldenGateisprobablythemost\nfamousbridgeintheworldandbelongstothe\nSeven Wonders of the Modern World. Fisher-\nman’s Wharf is an admirable seaside prome-\nnade with restaurants and shops. And if you\nwant to see it all at once, cl

In [9]:
# Initialise CassIO's database connection with the Astra DB application token
# and database ID defined in the credentials cell.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [10]:
# Build the LangChain LLM and embedding objects once, both authenticated with
# the same OpenAI API key, so that later cells can reuse them.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [None]:
# LangChain vector store backed by Astra DB. Texts are stored in the
# "qa_mini_demo" table and embedded with the `embedding` object built above.
# NOTE(review): session and keyspace are passed as None — presumably they are
# then resolved from the connection set up by cassio.init(); confirm.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Break the raw PDF text into pieces so that no single piece grows too large
# in tokens. Splitting happens on newlines; chunk_size and chunk_overlap are
# measured with len(), i.e. in characters.
text_splitter = CharacterTextSplitter(
    length_function=len,
    separator="\n",
    chunk_overlap=200,
    chunk_size=800,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Upper bound on how many text chunks are inserted in this demo run.
MAX_CHUNKS = 50

# Insert the first MAX_CHUNKS chunks of the PDF text into the Astra DB-backed
# vector store. The slice is taken once so the stored texts and the reported
# count cannot drift apart.
texts_to_index = texts[:MAX_CHUNKS]
astra_vector_store.add_texts(texts_to_index)

# The inserted items are chunks of the PDF text, not headlines.
print("Inserted %i text chunks." % len(texts_to_index))

# Wrap the store in the index helper whose query() method the question loop
# uses to answer questions with the LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Interactive question/answer loop: keeps prompting until the user types
# 'quit' (case-insensitive). Blank input is ignored and asked again.
first_question = True
while True:
    # The prompt wording differs between the first and the later questions.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break

    if not query_text:
        # Nothing was typed: prompt again without counting it as a question.
        continue

    first_question = False

    # Answer the question through the index wrapper and the LLM.
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("\nANSWER: \"%s\"" % answer)

    # List the four best-matching stored chunks with their scores, showing
    # only the first 84 characters of each.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    matches = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in matches:
        print("....[%0.4f] \"%s ...\""% (score, doc.page_content[:84]))
