In [None]:
!pip install PyMuPDF langchain faiss-cpu sentence-transformers

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF, faiss-cpu
Successfully installed PyMuPDF-1.24.13 faiss-cpu-1.9.0


In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [None]:
import fitz  # PyMuPDF
import langchain
from langchain.schema import Document  # Updated import for Document class
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import pandas as pd

# Function to load PDF and extract text
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text("text")`` output in
        page order, with a newline character between consecutive pages.
    """
    # The context manager closes the document once all pages have been read.
    with fitz.open(pdf_path) as document:
        page_texts = [
            document.load_page(index).get_text("text")
            for index in range(document.page_count)
        ]
    return "\n".join(page_texts)

# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Break ``text`` into chunks using a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 1000).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 100).

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Function to perform retrieval-augmented generation with LangChain
def analyze_pdf_for_investor(
    pdf_path,
    questions=None,
    chunk_size=1000,
    chunk_overlap=100,
    top_k=5,
    max_new_tokens=256,
):
    """Answer investor-oriented questions about a PDF with a local RAG pipeline.

    The PDF text is extracted, split into chunks, embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer into a FAISS index, and each
    question is answered by a ``google/flan-t5-base`` pipeline through a
    "stuff" ``RetrievalQA`` chain.

    Args:
        pdf_path: Path of the PDF to analyze.
        questions: Optional iterable of question strings (a single string is
            treated as one question). When ``None``, the built-in list of 23
            investor questions is used.
        chunk_size: Chunk size forwarded to ``split_text_into_chunks``.
        chunk_overlap: Chunk overlap forwarded to ``split_text_into_chunks``.
        top_k: Number of chunks retrieved per question and stuffed into the
            prompt.
        max_new_tokens: Generation length passed to the Hugging Face pipeline.

    Returns:
        A ``pandas.DataFrame`` with columns ``"Question"`` and ``"Insight"``,
        one row per question, in the order the questions were given.

    Raises:
        ValueError: If no text chunks could be produced from the PDF (for
            example a scanned document without a text layer).
    """
    if questions is None:
        questions = [
            "What are the company's primary growth drivers for the upcoming fiscal year?",
            "How is the company positioned to capture market share within its industry?",
            "Are there any new markets or regions the company plans to enter?",
            "What are the key strategic changes the company has implemented recently?",
            "Are there any upcoming changes in business operations or management structure?",
            "How is the company planning to adapt to industry trends and regulatory changes?",
            "What financial guidance has been provided for revenue, profit, and margins?",
            "Are there any major investments or expenditures planned for next year?",
            "How does the company plan to manage costs and optimize margins moving forward?",
            "What are the biggest risks the company currently faces?",
            "Has the company highlighted any potential challenges that could impact earnings?",
            "What new products or services is the company planning to launch?",
            "Are there any significant innovations or R&D projects underway?",
            "How does the company view its competition, and what measures are in place to remain competitive?",
            "Are there any industry disruptions or competitors that could affect the company's market position?",
            "How is the company’s relationship with key customers and partners evolving?",
            "Are there any new partnerships or collaborations that could boost growth?",
            "What are the company’s long-term goals for the next 3-5 years?",
            "How is the company addressing environmental, social, and governance (ESG) concerns?",
            "Are there any specific events or catalysts anticipated that could impact next year’s performance?",
            "Has the company disclosed any mergers, acquisitions, or divestitures that might affect its future growth?",
            "How is the company managing its debt and liquidity?",
            "Are there any recent changes in the company’s capital structure or funding sources?",
        ]
    elif isinstance(questions, str):
        # A bare string would otherwise be iterated character by character.
        questions = [questions]
    else:
        questions = list(questions)

    # Step 1: Extract text
    text = extract_text_from_pdf(pdf_path)

    # Step 2: Split text into chunks
    chunks = split_text_into_chunks(
        text, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    if not chunks:
        # Fail early with a clear message instead of an obscure error from
        # building a vector index over zero documents.
        raise ValueError(
            f"No extractable text found in {pdf_path!r}; cannot build a retrieval index."
        )

    # Step 3: Load the chunks into LangChain Document format
    langchain_docs = [Document(page_content=chunk) for chunk in chunks]

    # Step 4: Embed chunks and create FAISS vector store
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(langchain_docs, embeddings)

    # Step 5: Set up the Hugging Face model for RAG.
    # An explicit generation length is passed because, without one, the
    # pipeline uses the model's default generation settings, which are
    # typically very short and cut answers off — TODO confirm against the
    # installed transformers version.
    generation_pipeline = pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        max_new_tokens=max_new_tokens,
    )
    hf_llm = HuggingFacePipeline(pipeline=generation_pipeline)

    # Initialize RetrievalQA with Hugging Face model and FAISS retriever.
    # NOTE(review): the recorded run of this notebook logged a tokenizer
    # warning (1005 > 512 tokens) for the stuffed prompt; lowering top_k or
    # chunk_size shortens the prompt.
    retriever = vectorstore.as_retriever(
        search_type="similarity", search_kwargs={"k": top_k}
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=hf_llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # Step 6: Run each question through the RAG pipeline.
    # Records are kept in a list (not a dict keyed by question) so repeated
    # questions are not silently collapsed; `.invoke` replaces the deprecated
    # direct call on the chain object.
    records = []
    for question in questions:
        response = qa_chain.invoke({"query": question})
        records.append((question, response["result"]))

    # Convert insights into a structured DataFrame
    return pd.DataFrame(records, columns=["Question", "Insight"])

# Specify PDF path
pdf_path = '/content/SJSTranscriptCall.pdf'

# Run analysis and display insights
df_insights = analyze_pdf_for_investor(pdf_path)
# Lift pandas' column-width and line-wrapping limits for this print only, so
# each question and insight is shown in full instead of being cut off with "...".
with pd.option_context("display.max_colwidth", None, "display.expand_frame_repr", False):
    print(df_insights)

# Optionally, save insights to CSV
df_insights.to_csv('sjs_transcript_insights.csv', index=False)


Token indices sequence length is longer than the specified maximum sequence length for this model (1005 > 512). Running this sequence through the model will result in indexing errors


                                             Question  \
0   What are the company's primary growth drivers ...   
1   How is the company positioned to capture marke...   
2   Are there any new markets or regions the compa...   
3   What are the key strategic changes the company...   
4   Are there any upcoming changes in business ope...   
5   How is the company planning to adapt to indust...   
6   What financial guidance has been provided for ...   
7   Are there any major investments or expenditure...   
8   How does the company plan to manage costs and ...   
9   What are the biggest risks the company current...   
10  Has the company highlighted any potential chal...   
11  What new products or services is the company p...   
12  Are there any significant innovations or R&D p...   
13  How does the company view its competition, and...   
14  Are there any industry disruptions or competit...   
15  How is the company’s relationship with key cus...   
16  Are there any new partnersh