In [1]:
import os
from dotenv import load_dotenv
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain.chains import LLMChain

In [2]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI key (stored under the variable name 'OpenAI_API').
# The value is None when that variable is not defined.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OpenAI_API')

In [4]:
# Load the PDF.
# The location can be overridden with the PDF_PATH environment variable so the
# notebook is not tied to one machine; the original path remains the fallback.
file_path = os.getenv(
    "PDF_PATH",
    r"C:\Users\Rajmohan\Desktop\POC-PDF-Summarizer\data\stats.pdf",
)
loader = PyPDFLoader(file_path)
# `data` is iterated page by page further down (page_content / metadata["page"]).
data = loader.load()


In [5]:
# Initialize the text splitter (chunk_size=500, chunk_overlap=100, measured
# with the splitter's default length function)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)


In [6]:
# Split every page into chunks and record where each chunk came from:
# the page number (0 when the loader supplied none) and the chunk's
# position within that page.
all_chunks = [
    {
        "text": piece,
        "page": page.metadata.get("page", 0),
        "chunk_index": position,
    }
    for page in data
    for position, piece in enumerate(text_splitter.split_text(page.page_content))
]

In [7]:
# Wrap each chunk record in a Document: the text becomes page_content and the
# page / chunk_index fields are carried over as metadata.
docs_for_qdrant = [
    Document(
        page_content=record["text"],
        metadata={field: record[field] for field in ("page", "chunk_index")},
    )
    for record in all_chunks
]

In [8]:

# OpenAI embeddings client used to vectorise the chunks and the queries
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model="text-embedding-3-small",
)

  embeddings = OpenAIEmbeddings(model="text-embedding-3-small",openai_api_key=OPENAI_API_KEY)


In [9]:
# Initialize the Qdrant client.
# SECURITY: the API key used to be hard-coded in this cell. It is now read from
# the environment (for example from the .env file loaded at the top of the
# notebook). The previously committed key should be treated as compromised
# and rotated.
QDRANT_URL = os.getenv(
    "QDRANT_URL",
    "https://94dd3679-7d4c-4265-b988-ab8f5196d435.us-east-1-1.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    # Fail here with a clear message instead of an opaque auth error later.
    raise RuntimeError(
        "QDRANT_API_KEY is not set; add it to the environment or the .env file"
    )

qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Connectivity check: list the collections visible to this key.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='pdf_summaries')]


In [10]:
# Create the collection from scratch.
# recreate_collection() is deprecated in qdrant-client, so the same
# drop-then-create behaviour is spelled out explicitly. Dropping first keeps
# re-runs of the notebook from piling duplicate points into the collection.
if qdrant_client.collection_exists("pdf_summaries_OpenAI"):
    qdrant_client.delete_collection("pdf_summaries_OpenAI")

qdrant_client.create_collection(
    collection_name="pdf_summaries_OpenAI",
    vectors_config=VectorParams(
        size=1536,   # <-- must match OpenAI embedding size
        distance=Distance.COSINE,
    ),
)


  qdrant_client.recreate_collection(


True

In [11]:
# Bind a LangChain vector store to the collection, then upload the documents
qdrant_store = Qdrant(
    embeddings=embeddings,
    collection_name="pdf_summaries_OpenAI",
    client=qdrant_client,
)

# Left as the last expression so the returned ids show up as the cell output
qdrant_store.add_documents(docs_for_qdrant)

  qdrant_store = Qdrant(


['16dadf7f5de94b03aef258ba151eb959',
 '094b5ad72f8542aa918b1f437bd9ad2c',
 '755d2340c5db41b2b568044d2c76b2b3',
 'b7d375c0a75f47c3a6ad01c5357f02ed',
 '88467306352648a395f7cda72131b358',
 '3f2f6bb73395441c803b064ccf9efcc9',
 'ba59a8c1c6694364988f4b70b06706e0',
 '400e7dbf6f004caab677298e7a795bd6',
 'ad75d80b4f454cba97e7bc3ef8760c08',
 '9d7df1bee0ee4d54986c6edb1f9ee348']

In [12]:
# Chat model used for answer generation (temperature=0)
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0,
)

  llm = ChatOpenAI(


In [13]:
# Prompt template: the retrieved text fills {context}, the user query fills {question}
prompt_template = """
You are a helpful AI assistant. Use the following context to answer the question concisely:

Context:
{context}

Question:
{question}

Answer in clear, structured language.
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)


In [14]:
# Generation chain: formats PROMPT with its inputs and sends it to the chat model
generation_chain = LLMChain(prompt=PROMPT, llm=llm)

  generation_chain = LLMChain(


In [15]:
# Retriever over the Qdrant store; k=3 is passed through as a search kwarg
retriever = qdrant_store.as_retriever(search_kwargs=dict(k=3))
def generate_answer(question):
    """Answer ``question`` using the retriever and the generation chain.

    Args:
        question: Query text, passed both to the retriever and to the prompt.

    Returns:
        A tuple ``(answer, docs)``: the text produced by ``generation_chain``
        and the documents the retriever returned for the question.
    """
    # 1. Retrieve relevant documents. invoke() replaces the deprecated
    #    get_relevant_documents().
    docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in docs)

    # 2. Generate the answer. invoke() replaces the deprecated run(); it
    #    returns a dict, and the generated text sits under the chain's output key.
    result = generation_chain.invoke({"context": context, "question": question})
    answer = result[generation_chain.output_key]
    return answer, docs

In [16]:
# Query 1: definition of correlation
question = "What is correlation in statistics?"
answer, docs = generate_answer(question=question)

print("Answer:\n", answer)


  docs = retriever.get_relevant_documents(question)
  answer = generation_chain.run(context=context, question=question)


Answer:
 Correlation in statistics is a measure that indicates the extent to which two variables are linearly related. It helps describe simple relationships between variables without implying causation. Correlation coefficients range from -1 to 1, with 0 indicating no linear relationship, 1 indicating a perfect positive linear relationship, and -1 indicating a perfect negative linear relationship. Correlation is important in statistics as it can be used to identify relationships between variables, whether they are positive or negative. This information is valuable for understanding the connections between different factors in a complex system. Overall, correlation is a powerful tool in statistics that can help make predictions, develop causal models, and improve decision-making in various fields.


In [17]:
# Show the source of each retrieved chunk, its text, and a separator line
for doc in docs:
    print(
        f"Page {doc.metadata['page']} - Chunk {doc.metadata['chunk_index']}",
        doc.page_content,
        "------",
        sep="\n",
    )

Page 0 - Chunk 0
ImportanceandtheuseofcorrelationinStatistics
Introduction
Correlationisastatistical measurethat expressestheextent towhichtwovariablesarelinearlyrelated. It isacommontool fordescribingsimplerelationshipswithout makingastatement about causeandeffect.Correlationcoefficientsrangefrom-1to1, withavalueof 0indicatingnolinearrelationshipbetweenthetwovariables, avalueof 1indicatingaperfect positivelinearrelationship, andavalueof -1indicatingaperfectnegativelinearrelationship.
------
Page 0 - Chunk 1
Correlationisimportantinstatisticsbecauseitcanbeusedto
1. Identifyrelationshipsbetweenvariables:Correlationcanbeusedtoidentifywhetherthereisarelationshipbetweentwovariables, andif so, whethertherelationshipispositiveornegative. Thisinformationcanbeuseful forunderstandingtherelationshipsbetweendifferent factorsinacomplexsystem.
------
Page 2 - Chunk 2
Overall,correlationisapowerfulstatisticaltoolthatcanbeusedtoidentifyrelationshipsbetweenvariables,makepredictions,anddevelopcausalmod

In [18]:
# Query 2: identifying relationships between variables
question = "How to Identify relationships between variables"
answer, docs = generate_answer(question=question)
print(answer)

To identify relationships between variables, correlation analysis can be used. Correlation helps determine if there is a relationship between two variables and whether it is positive or negative. This information is valuable for understanding the connections between different factors in a system. Additionally, correlation analysis can be applied in various fields such as business, finance, medicine, and psychology to uncover relationships and make predictions based on the data.


In [19]:
# Query 3: uses of correlation
question = "What are the uses of correlation"
answer, docs = generate_answer(question=question)
print(answer)

The uses of correlation include identifying relationships between variables in fields such as business and finance. In business, correlation can help in making better decisions by understanding the relationships between variables like sales, advertising spending, and customer satisfaction. In finance, correlation can be used to identify relationships between financial assets like stocks, bonds, and commodities to build diversified portfolios that reduce risk. Overall, correlation is a valuable statistical tool for describing relationships between variables without implying causation.


In [20]:
# Query 4: fields in which correlation is applied
question = "In which fields correlation is used?"
answer, docs = generate_answer(question=question)
print(answer)

Correlation is used in a wide variety of fields, including business and finance. In business, correlation can be used to identify relationships between different variables such as sales, advertising spending, and customer satisfaction. This information can help make better business decisions, such as how to allocate marketing resources. In finance, correlation can be used to identify relationships between different financial assets like stocks, bonds, and commodities. This information can be used to build diversified portfolios that reduce risk.


In [21]:
# Query 5: real-world use of correlation
question = "How correlation is used in real world?"
answer, docs = generate_answer(question=question)
print(answer)

Correlation is used in the real world to identify relationships between different variables in various fields such as business and finance. For example, a marketing manager can use correlation to understand the relationship between advertising spending and sales, helping them make decisions on how to allocate resources effectively. Similarly, a financial analyst can use correlation to analyze the relationship between the returns of different stocks, aiding in the construction of diversified portfolios to reduce risk. Overall, correlation is a valuable statistical tool that helps professionals make informed decisions based on the relationships between variables in their respective fields.
