In [13]:
# %%
# Updated: GPT-4o-powered RAG pipeline that scores pitch decks against a weighted VC scorecard
import json
import os
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from openai import OpenAI

In [None]:
# %%
# === CONFIGURATION ===
# Folder holding the pitch-deck PDFs. The path below is machine-specific, so it
# can be overridden without editing the notebook by setting PITCHDECK_DIR;
# when the variable is unset or empty the original default is used.
FOLDER_PATH = os.getenv("PITCHDECK_DIR") or "/Users/equilibrium/MMC/pitchdecks"
EMBED_MODEL = "all-MiniLM-L6-v2"  # sentence-transformers model used for chunk embeddings
GPT_MODEL = "gpt-4o"              # chat model used for scoring

# The API key is read from the environment so it never lands in the notebook;
# fail immediately with a clear message instead of on the first API call.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY is not set.")
client = OpenAI(api_key=api_key)


In [4]:
# %%
# === EXTRACT TEXT FROM PDF ===
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    The document is opened in a ``with`` block so the underlying file handle
    is released as soon as extraction finishes (or fails), rather than
    staying open for the lifetime of the kernel.
    """
    with fitz.open(path) as doc:
        # join() consumes the generator before the with-block exits,
        # so every page is read while the document is still open.
        return "\n".join(page.get_text() for page in doc)

def extract_all_texts_with_source(folder_path):
    """Extract the text of every PDF directly inside ``folder_path``.

    Returns a list of ``(filename, text)`` tuples, ordered by filename.

    - Entries are visited in sorted order because ``os.listdir`` gives no
      ordering guarantee; sorting keeps downstream chunk ids reproducible.
    - The extension check is case-insensitive so ``Deck.PDF`` is not skipped.
    - Only regular files are parsed; a sub-directory whose name happens to
      end in ``.pdf`` is ignored instead of being passed to the PDF parser.
    """
    all_texts = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(file_path):
            continue
        all_texts.append((filename, extract_text_from_pdf(file_path)))
    return all_texts

In [5]:
# %%
# === CHUNK TEXT ===
# Load every deck from FOLDER_PATH, then split each deck's text with the
# splitter below (chunk_size=500, chunk_overlap=50). Every resulting piece is
# stored together with the filename it came from.
raw_docs = extract_all_texts_with_source(FOLDER_PATH)
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

chunks_with_source = []
for filename, text in raw_docs:
    chunks_with_source.extend(
        {"source": filename, "content": piece}
        for piece in splitter.split_text(text)
    )


In [6]:
# %%
# === EMBEDDING AND INDEXING ===
# Embed every chunk and store the vectors in a flat L2 FAISS index.
# `metadata` maps a chunk's position (the same position it has in the
# index) back to its source filename and text.
embedder = SentenceTransformer(EMBED_MODEL)
texts = [entry["content"] for entry in chunks_with_source]
sources = [entry["source"] for entry in chunks_with_source]

if len(texts) == 0:
    raise ValueError("No text chunks found. Please check your folder path or PDF parsing.")

embeddings = embedder.encode(texts, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])  # dimension = width of one embedding row
index.add(embeddings)
metadata = {
    position: {"source": origin, "content": body}
    for position, (origin, body) in enumerate(zip(sources, texts))
}

In [None]:
# === Scorecard Criteria ===
# Scorecard definition, written as one row per category:
#   (category name, weight, sub-criteria shown to the model).
# The weight is the category's share of the total weighted score; every
# category is scored on the same 0.._MAX_CATEGORY_SCORE scale.
_MAX_CATEGORY_SCORE = 3
_SCORECARD_ROWS = (
    ("Strength of the Management Team", 0.30, [
        "Founders' experience and credibility",
        "Number of founders (preferably 2)",
        "Track record of previous ventures",
    ]),
    ("Size of the Opportunity", 0.25, [
        "Market size",
        "Market segmentation",
        "Immediate addressable market",
    ]),
    ("Product/Technology", 0.15, [
        "Innovation and uniqueness",
        "Technology viability",
        "Stage of development",
    ]),
    ("Competitive Environment", 0.10, [
        "Market competition analysis",
        "Differentiation from competitors",
        "Competitive advantages",
    ]),
    ("Marketing/Sales Channels/Partnerships", 0.10, [
        "Go-to-market strategy",
        "Existing partnerships",
        "Scalability of marketing",
    ]),
    ("Need for Additional Investment", 0.05, [
        "Clear investment requirements",
        "Use of funds",
        "ROI projections",
    ]),
    ("Other Factors", 0.05, [
        "Fit with investment thesis",
        "Cap table analysis",
        "Revenue model",
    ]),
)

# Expand the rows into the dict shape the rest of the notebook reads:
# {category: {"weight": ..., "max_score": ..., "subcriteria": [...]}}
scorecard_criteria = {
    category: {
        "weight": weight,
        "max_score": _MAX_CATEGORY_SCORE,
        "subcriteria": subcriteria,
    }
    for category, weight, subcriteria in _SCORECARD_ROWS
}

# === Prompt Builder ===
def build_score_prompt(company, category, subcriteria, context, max_score=3):
    """Build the user prompt asking the model to score one scorecard category.

    Args:
        company: Company name, interpolated into the first sentence.
        category: Name of the scorecard category being scored.
        subcriteria: Iterable of criterion strings, rendered as a "- " bullet list.
        context: Pitch-deck text the model must base its score on; it is
            placed between triple-quote delimiters.
        max_score: Upper end of the score range stated in the prompt.
            Defaults to 3, which reproduces the previously hard-coded "0–3";
            pass a category's own ``max_score`` if it differs.

    Returns:
        The prompt string, ending with the JSON response template.
    """
    bullet = "\n".join(f"- {s}" for s in subcriteria)
    return f"""You are a VC analyst evaluating the pitch deck for {company}.

Category: {category}
Criteria:
{bullet}

Context:
\"\"\"
{context}
\"\"\"

Give a score from 0–{max_score} based on the context.
Respond only in this JSON format:
{{
  "score": <integer>,
  "rationale": <short explanation based on evidence>
}}
"""

# === Query Top Chunks for a Company ===
def get_context_for_company(company_name, top_k=3):
    """Return up to ``top_k`` chunks of context for ``company_name``.

    Lookup order:
      1. Chunks whose source filename contains the company name
         (case-insensitive substring match), in index order.
      2. If no filename matches, the nearest chunks to the embedded company
         name in the FAISS index.

    Args:
        company_name: Name typed by the user.
        top_k: Maximum number of chunks to return.

    Returns:
        The selected chunks joined by blank lines; an empty string when
        nothing could be retrieved.
    """
    needle = company_name.lower()
    matches = [v["content"] for v in metadata.values() if needle in v["source"].lower()]
    if not matches:
        print("⚠️ No filename matches, using embedding search")
        # Never ask FAISS for more neighbours than the index holds.
        k = min(top_k, index.ntotal)
        if k <= 0:
            return ""
        embed = embedder.encode([company_name], convert_to_numpy=True)
        _, neighbours = index.search(embed, k)
        # FAISS pads missing results with -1, which is not a metadata key.
        matches = [metadata[int(i)]["content"] for i in neighbours[0] if i >= 0]
    return "\n\n".join(matches[:top_k])

# === Score Evaluation ===
def _coerce_score(raw_score, max_score):
    """Validate the model's ``score`` value and clamp it into ``[0, max_score]``.

    Ints, floats and numeric strings are accepted (a model may quote the
    number). Anything else raises ValueError so a malformed reply cannot
    silently distort the weighted total.
    """
    # bool is a subclass of int, but `true`/`false` is never a valid score.
    if isinstance(raw_score, bool) or not isinstance(raw_score, (int, float, str)):
        raise ValueError(f"Model returned a non-numeric score: {raw_score!r}")
    try:
        value = float(raw_score)
    except (ValueError, OverflowError):
        raise ValueError(f"Model returned a non-numeric score: {raw_score!r}") from None
    if value != value:  # NaN is the only value unequal to itself
        raise ValueError(f"Model returned a non-numeric score: {raw_score!r}")
    if value.is_integer():
        value = int(value)  # keep whole scores as ints in the JSON summary

    bounded = min(max(value, 0), max_score)
    if bounded != value:
        print(f"⚠️ Score {value} is outside 0-{max_score}; clamped to {bounded}.")
    return bounded


def evaluate_pitchdeck_scorecard(company_name, context):
    """Score ``company_name``'s pitch deck on every scorecard category.

    One chat-completion request is made per category in ``scorecard_criteria``.
    Each reply is parsed as JSON, its score validated, and the category's
    contribution computed as ``score / max_score * weight``.

    Args:
        company_name: Company name inserted into each prompt.
        context: Pitch-deck text the model is asked to judge.

    Returns:
        Dict mapping each category to ``{"score", "rationale",
        "weighted_score"}``, plus a ``"total_weighted_score"`` entry holding
        the rounded sum of all contributions.

    Raises:
        json.JSONDecodeError: A reply could not be parsed as JSON.
        ValueError: A reply has no usable numeric ``score``.
    """
    results = {}
    total = 0.0

    for cat, data in scorecard_criteria.items():
        prompt = build_score_prompt(company_name, cat, data["subcriteria"], context)

        response = client.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": "You are a VC analyst scoring pitch decks."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=700,  # safer buffer
            # JSON mode: ask the API for a syntactically valid JSON object.
            # extract_json_block below still runs as a safety net.
            response_format={"type": "json_object"},
        )

        # content can be None (e.g. a refusal); treat that as an empty reply
        # so the failure surfaces as a JSON error with the raw output logged.
        raw_output = (response.choices[0].message.content or "").strip()
        clean_output = extract_json_block(raw_output)
        print(f"\n Cleaned LLM Response for '{cat}':\n{clean_output}\n")

        try:
            parsed = json.loads(clean_output)
        except json.JSONDecodeError:
            print(" JSON decoding failed. Raw output was:\n", repr(raw_output))
            raise  # bare raise keeps the original traceback

        if not isinstance(parsed, dict) or "score" not in parsed:
            raise ValueError(f"LLM response for '{cat}' has no 'score' field: {clean_output!r}")

        score = _coerce_score(parsed["score"], data["max_score"])
        weight = (score / data["max_score"]) * data["weight"]
        total += weight

        results[cat] = {
            "score": score,
            # A missing rationale should not discard an otherwise valid score.
            "rationale": parsed.get("rationale", ""),
            "weighted_score": round(weight, 3)
        }

    results["total_weighted_score"] = round(total, 3)
    return results




📄 Using context for: Trulite


📩 Cleaned LLM Response for 'Strength of the Management Team':
{
  "score": 2,
  "rationale": "The management team consists of two co-founders, which is ideal, and includes a CEO and CTO with advanced titles, suggesting a strong academic or professional background. However, there is no specific information provided about their previous ventures or track record, which limits the ability to fully assess their experience and credibility."
}


📩 Cleaned LLM Response for 'Size of the Opportunity':
{
  "score": 2,
  "rationale": "The pitch deck highlights significant health disparities affecting various demographic groups and quantifies the economic impact, indicating a large market size. However, it lacks specific market segmentation and details on the immediate addressable market."
}


📩 Cleaned LLM Response for 'Product/Technology':
{
  "score": 1,
  "rationale": "The pitch deck highlights significant health equity issues in the US, which is a critical area 

In [20]:
import re

def extract_json_block(text):
    """Pull the JSON object out of an LLM reply and return it as a string.

    Strategies, in order:
      1. A fenced code block (```json ... ``` or a bare ``` ... ```)
         whose content is a ``{...}`` object.
      2. The first substring starting at a ``{`` that parses as a complete
         JSON value. This copes with prose, or stray braces, after the object,
         where a greedy ``{.*}`` match would swallow the trailing text.
      3. The greedy first-``{``-to-last-``}`` span (the legacy fallback).
      4. The stripped input, when it contains no braces at all.

    The result is not guaranteed to be valid JSON in cases 1, 3 and 4; the
    caller is expected to run ``json.loads`` on it and handle failures.
    """
    fenced = re.search(r"```(?:json)?\s*({.*?})\s*```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        return fenced.group(1).strip()

    # Try each opening brace until one starts a well-formed JSON value.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        start = brace.start()
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        return text[start:end]

    # Fallback: try to find first JSON-looking object
    loose = re.search(r"{.*}", text, re.DOTALL)
    if loose:
        return loose.group(0).strip()

    return text.strip()

In [None]:
# Interactive loop: read a company name, retrieve its context, score it on
# every scorecard category and print the results. A failure on one company
# is reported and the loop continues with the next prompt.
if __name__ == "__main__":
    print("📊 Pitch Deck Evaluator – Interactive Mode")
    print("Type a company name to evaluate its pitch deck, or type 'exit' to quit.\n")

    while True:
        try:
            company = input("🔍 Enter company name: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session cleanly.
            print("\n👋 Exiting.")
            break

        if company.lower() in {"exit", "quit"}:
            print("👋 Exiting.")
            break

        # An empty string is a substring of every filename, so a blank entry
        # would otherwise "match" an arbitrary deck and spend API calls on it.
        if not company:
            print("⚠️ Please enter a company name.\n")
            continue

        print(f"\n📄 Fetching context for: {company} ...")
        try:
            context = get_context_for_company(company, top_k=3)
        except Exception as e:
            print("❌ Error while retrieving context:", str(e))
            continue

        if not context.strip():
            print("⚠️ No context found for this company; skipping evaluation.\n")
            continue
        print("✅ Context retrieved.\n")

        print("🤖 Evaluating pitch deck...\n")
        try:
            score_result = evaluate_pitchdeck_scorecard(company, context)
        except Exception as e:
            print("❌ Error during evaluation:", str(e))
            continue

        print("\n📈 Evaluation Summary:")
        print(json.dumps(score_result, indent=2))

        print("\n📝 Feedback per category:")
        for category, details in score_result.items():
            # The total is stored alongside the categories; it is printed last.
            if category == "total_weighted_score":
                continue
            print(f"\n📌 {category}")
            print(f"Score: {details['score']} / {scorecard_criteria[category]['max_score']}")
            print(f"Rationale: {details['rationale']}")

        print("\n🎯 Total Weighted Score:", score_result["total_weighted_score"])
        print("-" * 60 + "\n")
