In [1]:
# !pip install langchain faiss-cpu pdfplumber scikit-learn transformers sentence-transformers accelerate bitsandbytes
# !pip install "pydantic>=2.0"
# !pip install -U langchain-community


In [1]:
# !pip install numpy==1.24.4 scipy==1.10.1 scikit-learn==1.2.2


In [1]:
#%restart_python


In [1]:
import os
import pdfplumber


In [1]:
import requests

# URLs of the annual reports. These are signed, read-only links; the `se`
# query parameter carries an expiry date, after which the downloads will fail.
pdf_urls = {
    "HPS_Annual_Report_2022.pdf": "https://bytesizecomposableaisa.blob.core.windows.net/gen-ai-assignment/HPS%20Annual%20Report%202022.pdf?sv=2025-01-05&st=2025-06-22T08%3A17%3A06Z&se=2025-09-20T08%3A17%3A06Z&sr=b&sp=r&sig=CfGcnbLmN6v%2BjIdvDtVl2guq4kSqcUn2QIkdeQ3Wir0%3D",
    "Hubble_Annual_Report.pdf": "https://bytesizecomposableaisa.blob.core.windows.net/gen-ai-assignment/Hubble%20Annual%20Report.pdf?sv=2025-01-05&st=2025-06-22T08%3A17%3A06Z&se=2025-09-20T08%3A17%3A06Z&sr=b&sp=r&sig=cQVYGjqGeIz2opXLGF%2FzYdCchZiQa%2FdqzdVVBES363o%3D",
    "Schneider_Electric_Annual_Report.pdf": "https://bytesizecomposableaisa.blob.core.windows.net/gen-ai-assignment/Schneider%20Electric%20Annual%20Report.pdf?sv=2025-01-05&st=2025-06-22T08%3A17%3A06Z&se=2025-09-20T08%3A17%3A06Z&sr=b&sp=r&sig=N6byFRCtk4TdzokyfCaBKuzGJEuPU8af8pLECXvu3IE%3D"
}

# Seconds to wait on a single request before giving up.
DOWNLOAD_TIMEOUT_S = 60

# Download and save each PDF next to the notebook. A failure on one file is
# reported and the loop moves on, so one bad link does not abort the rest.
for filename, url in pdf_urls.items():
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_S)
    except requests.RequestException as exc:
        print(f"Failed to download: {filename} ({exc})")
        continue

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Downloaded and saved: {filename}")
    else:
        # Include the status so an expired link (403) is distinguishable
        # from a missing file (404).
        print(f"Failed to download: {filename} (HTTP {response.status_code})")


In [1]:
# === Step 1: Load PDF Text ===
def load_pdf_text(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of each page that yielded any, each followed by a newline.
        Pages whose ``extract_text()`` result is ``None`` or ``""`` are
        skipped. An empty string is returned when no page has text.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Terminate every page with a newline so the last word of one page
        # does not fuse with the first word of the next. Joining once also
        # avoids rebuilding the string on every page.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

# === Step 2: Define PDF Files ===
# Short company label -> file name of that company's annual report.
pdf_files = dict(
    HPS="HPS_Annual_Report_2022.pdf",
    Hubble="Hubble_Annual_Report.pdf",
    Schneider="Schneider_Electric_Annual_Report.pdf",
)


In [1]:
# Read the Hugging Face token from the environment instead of embedding it in
# the notebook: a token stored in source is visible to every reader of the
# file and should be treated as compromised and revoked.
# The value is None when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")


In [1]:
import os
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from huggingface_hub import login

# === STEP 1: Load PDF Text ===
def load_pdf_text(file_path):
    """Return the text of a PDF, one newline-terminated piece per page.

    Args:
        file_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``. Pages for
        which ``extract_text()`` gives ``None`` or ``""`` contribute nothing.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = [page.extract_text() for page in pdf.pages]
    # Falsy results are pages with no extractable text.
    return "".join(content + "\n" for content in extracted if content)

# === STEP 2: Split Text into Chunks ===
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Cut ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# === STEP 3: Convert to LangChain Documents ===
def prepare_documents(pdf_files):
    """Turn each company's PDF into a flat list of chunk-level Documents.

    Args:
        pdf_files: Mapping of company label to PDF path.

    Returns:
        A list of ``Document`` objects in mapping order. Each one holds a
        single chunk as ``page_content`` and ``{"company": <label>}`` as
        metadata.
    """
    documents = []
    for company, file_path in pdf_files.items():
        print(f"📄 Processing {company}")
        for chunk in split_text(load_pdf_text(file_path)):
            # A fresh metadata dict per chunk keeps the documents independent.
            documents.append(
                Document(page_content=chunk, metadata={"company": company})
            )
    return documents

# === STEP 4: Load Embeddings ===
def load_embeddings():
    """Create the ``HuggingFaceEmbeddings`` object used for the vector index.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=embedding_model_name)

# === STEP 5: Load LLM (Optional Token) ===
def load_local_llm(model_name="tiiuae/falcon-7b-instruct", hf_token=None, device=-1):
    """Build a LangChain LLM wrapper around a local text-generation pipeline.

    Args:
        model_name: Hugging Face model id, used for both model and tokenizer.
        hf_token: Optional Hugging Face access token. When truthy,
            ``login`` is called with it before the model is loaded.
        device: Forwarded to ``transformers.pipeline`` as ``device``. The
            default ``-1`` keeps the previous behavior; pass ``0`` to use
            a GPU.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    if hf_token:
        login(hf_token)  # Authenticate only when a token is provided

    pipe = pipeline(
        "text-generation",
        model=model_name,
        tokenizer=model_name,
        max_new_tokens=512,
        temperature=0.3,
        top_p=0.95,
        do_sample=True,
        # Caller-selectable instead of requiring an edit to this function.
        device=device,
    )
    return HuggingFacePipeline(pipeline=pipe)

# === STEP 6: Build Vectorstore ===
def build_vectorstore(docs, persist_path="faiss_index"):
    """Embed ``docs`` into a FAISS index and save it to disk.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        persist_path: Location handed to the index's ``save_local``.

    Returns:
        The newly built FAISS vector store, after it has been saved.
    """
    index = FAISS.from_documents(docs, load_embeddings())
    index.save_local(persist_path)
    return index

# === STEP 7: Load QA Chain ===
def build_qa_chain(vectorstore, model_name="tiiuae/falcon-7b-instruct", hf_token=None):
    """Assemble a ``RetrievalQA`` chain over ``vectorstore``.

    Args:
        vectorstore: Store whose ``as_retriever`` supplies the retriever.
        model_name: Forwarded to ``load_local_llm``.
        hf_token: Forwarded to ``load_local_llm``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``, using a
        retriever created with ``search_kwargs={"k": 5}``.
    """
    search_settings = {"k": 5}
    return RetrievalQA.from_chain_type(
        llm=load_local_llm(model_name=model_name, hf_token=hf_token),
        retriever=vectorstore.as_retriever(search_kwargs=search_settings),
    )

# === MAIN ===
if __name__ == "__main__":
    # Company label -> annual-report PDF expected in the working directory.
    pdf_files = {
        "HPS": "HPS_Annual_Report_2022.pdf",
        "Hubble": "Hubble_Annual_Report.pdf",
        "Schneider": "Schneider_Electric_Annual_Report.pdf"
    }

    # Path to persist vectorstore
    persist_path = "faiss_index"

    # Optional: Hugging Face token for gated/private models. Read from the
    # environment so no credential is stored in the notebook; None if unset.
    hf_token = os.getenv("HF_TOKEN")

    # Step 1–3: Load and chunk documents, or reuse an index saved by an earlier run
    if not os.path.exists(persist_path):
        print("📚 Building vectorstore from documents...")
        documents = prepare_documents(pdf_files)
        vectorstore = build_vectorstore(documents, persist_path)
    else:
        print("📦 Loading existing vectorstore...")
        embeddings = load_embeddings()
        # The saved index includes a pickle file, and current
        # langchain-community releases refuse to load it unless the caller
        # opts in. Opting in is acceptable here only because the index at
        # persist_path was written by this notebook; never enable this for
        # an index obtained from an untrusted source.
        vectorstore = FAISS.load_local(
            persist_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    # Step 7: QA Chain (replace model name if needed)
    qa_chain = build_qa_chain(vectorstore, model_name="tiiuae/falcon-7b-instruct", hf_token=hf_token)

    # Step 8: Ask questions
    questions = [
        "What was HPS's total revenue in 2022?",
        "What are the strategic priorities of Schneider Electric?",
        "Which company had the highest profitability?",
        "How does Hubble handle sustainability?"
    ]

    for q in questions:
        print(f"\n❓ Question: {q}")
        answer = qa_chain.run(q)
        print(f"💡 Answer: {answer}")

