<a href="https://colab.research.google.com/github/SpandanaKalakonda/LLMS/blob/main/pdf_query_langchain_cassandra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install Necessary Libraries
!pip install -q langchain_openai cassio datasets tiktoken langchain_community langchain PyPDF2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#Import necessary libraries
from langchain_openai import OpenAI, OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from datasets import load_dataset
from PyPDF2 import PdfReader
import cassio

In [None]:
# Load the Astra DB and OpenAI credentials from the Colab `userdata` store
from google.colab import userdata

# One lookup per secret, performed in the order the names are listed
ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, OPENAI_API_KEY = (
    userdata.get(secret_name)
    for secret_name in (
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_ID",
        "OPENAI_API_KEY",
    )
)

In [None]:
# Create a PdfReader for the course PDF.
# The path is hard-coded under /content (presumably the Colab runtime's working
# directory — the file has to be present there before this cell runs).
pdfreader = PdfReader("/content/[THE] A Course in Machine Learning (2013).pdf")

In [None]:
# Reading the contents of the pdf file
from typing_extensions import Concatenate
# Concatenate the extracted text of every page into one string.
# Pages whose extract_text() result is falsy (empty string or None) contribute
# nothing, matching the original `if text:` guard. Joining once avoids
# re-copying an ever-growing string with `+=` on each page.
raw_text = "".join(page.extract_text() or "" for page in pdfreader.pages)

In [None]:
# Echo the full extracted text as the cell result to sanity-check the extraction
raw_text



In [None]:
# Initialize cassio with the Astra DB (Cassandra) token and database id
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [None]:
# Chat model used to answer questions about the PDF
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    top_p=1,
    max_tokens=256,  # upper bound on the length of each generated answer
)

# Embedding model used by the vector store for the text chunks
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
# Create the Cassandra-backed vector store that will hold the PDF chunks.
# session and keyspace are passed as None — presumably so the connection
# configured by cassio.init() is used; confirm against the cassio docs.
astra_vector_store = Cassandra(
    table_name="qa_ml1",
    embedding=embedding,
    session=None,
    keyspace=None,
)

In [None]:
# Split the extracted text into overlapping chunks for embedding
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    chunk_size=1000,      # target chunk length, measured with length_function
    chunk_overlap=200,    # overlap carried over between neighbouring chunks
    separator="\n",       # split on line breaks
    length_function=len,  # length is counted in characters
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Preview the first 50 chunks produced by the splitter
texts[:50]

['A Course in\nMachine Learning\nHal Daumé IIICopyright © 2013 –2017 Hal Daumé III\nSelf-published\nhttp://ciml.info/\nTODO. . . .\nSecond printing, January 2017For my students and teachers.\nOften the same.TABLE OF CONTENTS\nAbout this Book 6\n1 Decision Trees 8\n2 Limits of Learning 19\n3 Geometry and Nearest Neighbors 29\n4 The Perceptron 41\n5 Practical Issues 55\n6 Beyond Binary Classification 73\n7 Linear Models 87\n8 Bias and Fairness 104\n9 Probabilistic Modeling 116\n10 Neural Networks 1295\n11 Kernel Methods 141\n12 Learning Theory 154\n13 Ensemble Methods 164\n14 Efficient Learning 171\n15 Unsupervised Learning 178\n16 Expectation Maximization 186\n17 Structured Prediction 195\n18 Imitation Learning 212\nCode and Datasets 222\nBibliography 223\nIndex 225ABOUT THIS BOOK\nMachine learning is a broad and fascinating field . Even\ntoday, machine learning technology runs a substantial part of your\nlife, often without you knowing it. Any plausible approach to artiﬁ-',
 'Machine l

In [None]:
# Add the text chunks to the vector store
astra_vector_store.add_texts(texts)
# The reported count is len(texts), i.e. the number of chunks passed in
print("Inserted %i chuncks." % len(texts))
# Wrap the store in an index wrapper; the chatbot loop queries through it
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 642 chuncks.


In [None]:
# Interactive question-answering loop over the indexed PDF.
# The prompt switches to the follow-up wording once a question has been
# answered; blank input re-prompts, and "quit" (any casing) ends the loop.
first_question = True
while True:
    query = input(
        "Ask a question about the book (Or type 'quit' to exit):"
        if first_question
        else "Ask a follow up question:"
    ).strip()
    if query.lower() == "quit":
        break
    if not query:
        continue
    first_question = False
    print("\nQuestion: %s" % query)
    answer = astra_vector_index.query(query, llm=llm)
    print("\nAnswer: %s" % answer)

Ask a question about the book (Or type 'quit' to exit):machine learning

Question: machine learning





Answer: Machine learning is a broad and fascinating field that involves using algorithms and statistical models to enable computers to learn from and make predictions or decisions based on data without being explicitly programmed. It plays a significant role in various aspects of our lives, often without us realizing it. The field raises philosophical questions about learning and success in tasks, and it is essential for any approach to artificial intelligence.
Ask a follow up question:random forest

Question: random forest





Answer: Random forests are a type of ensemble learning method that consists of multiple decision trees. Each tree is built independently with random features, and the final prediction is made by aggregating the predictions of all the trees (voting). This method is efficient and effective, especially for classification and regression tasks.
Ask a follow up question:quit
