In [None]:
!pip install -q cassio datasets langchain tiktoken

: 

In [1]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import cassio
from dotenv import load_dotenv
load_dotenv()



True

In [3]:
documents = []
doc_path = "documents"
for file in os.listdir(doc_path):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(doc_path, file))
        docs = loader.load()
        documents.extend(docs)
splitter = RecursiveCharacterTextSplitter(
chunk_size=500, chunk_overlap=100)

In [4]:
documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-04-13T19:02:10+05:30', 'author': 'Rajeev Goel', 'moddate': '2025-04-13T19:02:10+05:30', 'source': 'documents\\Detail_file 1.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="I'm a passionate and driven individual with a deep interest in the fields of Artificial Intelligence, \nMachine Learning, and Full Stack Web Development. My journey so far has been an enriching blend \nof hands-on learning, impactful internships, leadership roles, and community involvement. \n       Projects & Technical Experience \n• VAE + 1D-CNN for Tool Condition Monitoring (TIH-IoT, IIT Bombay): Designed a deep \nlearning pipeline for detecting tool breakage using a combination of Variational \nAutoencoders and 1D CNNs. Developed an intuitive interface for real-time monitoring and \nanalysis. Worked extensively with PyTorch, MQTT, and MERN stack. \n• NoQs Digital (Data Analyst Inte

In [5]:
DATABASE_TOKEN = os.getenv("DATABASE_TOKEN")
DATABASE_ID = os.getenv("DATABSE_ID")

In [6]:
cassio.init(token = DATABASE_TOKEN, database_id = DATABASE_ID)

In [7]:
llm = ChatGoogleGenerativeAI(
    model = 'gemini-2.0-flash-001',
    api_key= os.getenv('GOOGLE_API_KEY'),
    streaming = True,
)
embeddings = GoogleGenerativeAIEmbeddings(
    model = 'models/text-embedding-004',
    google_api_key= os.getenv('GOOGLE_API_KEY')
)

Unexpected argument 'streaming' provided to ChatGoogleGenerativeAI. Did you mean: 'disable_streaming'?
                streaming was transferred to model_kwargs.
                Please confirm that streaming is what you intended.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [42]:
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="test1",
    session=None,
    keyspace=None
)

In [43]:
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="test2",
    session=None,
    keyspace=None
)

In [40]:
prompt = ChatPromptTemplate.from_template(
        "Answer the following question based on the provided context.\n"
        "In the given context consider I as Rajeev Goel..\n"
        "<context>\n{context}\n</context>\n\n"
        "Question: {input}"
    )

In [44]:
astra_vector_store.add_documents(documents)
print("Inserted")
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted


In [45]:
query = "Who is Rajeev Goel?"
answer = astra_vector_index.query(query, llm=llm).strip()

for doc, score in astra_vector_store.similarity_search_with_score(query=query, k=4):
    print(score, doc.page_content[:10])

0.8330926680733981 Rajeev Goe
0.7771210647660665 A B C F I 
0.7765393890017585 Date: Janu
0.7711983014144657 Date: May 


In [46]:
answer

'Rajeev Goel is a B.Tech student in Data Science and Artificial Intelligence at the Indian Institute of Technology, Bhilai, expected to graduate in 2026. He has a CGPA of 9.12. He has experience as a Project Intern at TIH-IoT IIT Bombay (Chanakya), a Data Analyst Intern at NoQs Digital, and an Application Developer Intern at RecogX Bhilai.'

In [47]:
vector_store = Cassandra(
    embedding=embeddings,
    table_name="test2",
    session=None,
    keyspace=None
)

In [48]:
vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
query = "What is the name of the company?"
answer = vector_index.query(query, llm=llm).strip()
answer

'What company are you referring to? Rajeev Goel has worked at NoQs Digital, TIH-IoT IIT Bombay (Chanakya), and RecogX Bhilai. He has also contributed to the Polaris Website.'

In [49]:
!pip install --upgrade astrapy

Collecting astrapy
  Downloading astrapy-2.0.1-py3-none-any.whl.metadata (23 kB)
Collecting deprecation<2.2.0,>=2.1.0 (from astrapy)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pymongo>=3 (from astrapy)
  Downloading pymongo-4.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting uuid6>=2024.1.12 (from astrapy)
  Downloading uuid6-2024.7.10-py3-none-any.whl.metadata (8.6 kB)
Collecting h2<5,>=3 (from httpx[http2]<1,>=0.25.2->astrapy)
  Downloading h2-4.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo>=3->astrapy)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting hyperframe<7,>=6.1 (from h2<5,>=3->httpx[http2]<1,>=0.25.2->astrapy)
  Downloading hyperframe-6.1.0-py3-none-any.whl.metadata (4.3 kB)
Collecting hpack<5,>=4.1 (from h2<5,>=3->httpx[http2]<1,>=0.25.2->astrapy)
  Downloading hpack-4.1.0-py3-none-any.whl.metadata (4.6 kB)
Downloading astrap

In [50]:
from astrapy import DataAPIClient

# Initialize the client
client = DataAPIClient(os.getenv("DATABASE_TOKEN"))
db = client.get_database_by_api_endpoint(
  "https://17683a4b-c09d-4cab-9266-5f462c7fbecd-us-east1.apps.astra.datastax.com"
)

print(f"Connected to Astra DB: {db.list_collection_names()}")

Connected to Astra DB: []


In [51]:
!pip install cassandra-driver langchain langchain-community




In [8]:
import os
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT, ProtocolVersion
from cassandra.auth import PlainTextAuthProvider
import json

cloud_config= {
    'secure_connect_bundle': "secure-connect-dbtest.zip",
    'connect_timeout': 30
}
auth_provider=PlainTextAuthProvider("token", os.environ["DATABASE_TOKEN"])
profile = ExecutionProfile(request_timeout=30)
cluster = Cluster(
    cloud=cloud_config,
    auth_provider=auth_provider,
    execution_profiles={EXEC_PROFILE_DEFAULT: profile},
    protocol_version=ProtocolVersion.V4
)
session = cluster.connect()

In [9]:
vector_store = Cassandra(
    embedding=embeddings,
    table_name="test2",
    session=session,
    keyspace="default_keyspace"
)

In [10]:
vector_store.add_documents(documents)
print("Inserted")
vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

Inserted


In [12]:
vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
query = "What company is Rajeev Goel working as internship?"
answer = vector_index.query(query, llm=llm).strip()
answer

'Based on the provided information, Rajeev Goel is currently working as a Gen AI Specialist Intern at Assurant and as a Contributor at Code4GovTech - DMP 2025. He is also a Project Intern at TIH-IoT IIT Bombay (Chanakya).'

In [None]:
from langchain.memory import ConversationSummaryMemory
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

retriever = vector_index.vectorstore.as_retriever()

# === Memory ===
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True
)

prompt = ChatPromptTemplate.from_template(
        "You are a friendly and knowledgeable AI assistant designed to help users explore Rajeev Goel’s professional portfolio. Provide concise, accurate, and engaging answers based on the provided context documents, which include details about Rajeev’s projects, experiences, skills, and achievements. If the answer is not in the context, respond honestly and say you don't have that information. Always maintain a helpful and approachable tone. Refer to Rajeev in the third person (e.g., \"Rajeev has worked on...\").\n\n"
        "<context>\n{context}\n</context>\n\n"
        "You may also refer the memory provided by the user.\n\n"
        "Memory: {memory}\n\n"
        "Question: {input}"
    )

# Combine documents + retrieval
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, combine_docs_chain)

def ask_question(question: str) -> str:
    global memory
    response = chain.invoke({"input": question, "memory": memory.buffer})
    return response["answer"]

question = "What is Rajeev's favorite movie?"
answer = ask_question(question)
print("Answer:", answer)
memory.save_context({"input": question}, {"output": answer})

question = "Rajeev goel's favorite movie is \"rab ne bana di jodi\". Remember it."
answer = ask_question(question)
print("Answer:", answer)
memory.save_context({"input": question}, {"output": answer})

question = "What is Rajeev's favorite movie?"
answer = ask_question(question)
print("Answer:", answer)
memory.save_context({"input": question}, {"output": answer})

question = "Who is Rajeev's favorite actor?"
answer = ask_question(question)
print("Answer:", answer)

Answer: I'm designed to provide information based on Rajeev Goel's portfolio and experiences. His favorite movie is not mentioned in the provided documents.
Answer: Okay, I will remember that Rajeev Goel's favorite movie is "Rab Ne Bana Di Jodi". However, keep in mind that this information is not derived from his professional portfolio.
Answer: Rajeev's favorite movie is "Rab Ne Bana Di Jodi." However, please note that this information is not derived from his professional portfolio.
Answer: I don't have information about Rajeev's favorite actor in his portfolio.


In [33]:
memory.clear()

In [34]:
question = "What is Rajeev's favorite movie?"
answer = ask_question(question)
print("Answer:", answer)
memory.save_context({"input": question}, {"output": answer})

Answer: I am designed to provide information based on Rajeev Goel's professional portfolio. I do not have access to information about his favorite movie.


In [29]:
memory.buffer

"The human asks what Rajeev's favorite movie is, and the AI responds that it does not have that information."