In [None]:
#MAIN MAIN MAIN MAIN
import requests
import json
import io
import time
import pandas as pd
from pypdf import PdfReader
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# "SAAS & ENTERPRISE TECH"
# "CONSUMER (D2C) & RETAIL"
# "MANUFACTURING & CHEMICALS"
# "PHARMA & LIFE SCIENCES"
# "LOGISTICS & SUPPLY CHAIN"

# Run-level inputs. `company` is interpolated into the intermediate file names
# (input_data<company>.json, layer2_withplot<company>.json) used further down.
company = "KSolves"
# sector = "SAAS & ENTERPRISE TECH"
# path = f"/workspaces/codespaces-blank/sector_visuals/{sector}/image_1.jpg"
# Markdown one-pager that the scraped public data is appended to and that the
# analyst agent later reads.
md = "Ksolves-OnePager.md"
# ==========================================
# 1. CONFIGURATION (Fill these in)
# ==========================================
COMPANY_NAME = company  # <--- CHANGE THIS INPUT
# Bearer tokens sent by get_smart_urls() and scrape_web() respectively.
PERPLEXITY_API_KEY = "YOUR API KEY"
FIRECRAWL_API_KEY = "YOUR API KEY"

# For Google Sheets (optional - see section 5, log_to_sheets, below).
# When the credentials file is missing, log_to_sheets() falls back to a CSV.
GOOGLE_CREDS_FILE = "google_creds.json" 
SHEET_NAME = "Hackathon_Citations"

# ==========================================
# 2. MODULE: SMART SEARCH (Perplexity)
# ==========================================
def get_smart_urls(company):
    """Ask Perplexity for research sources on ``company`` and return the cited URLs.

    Args:
        company: Company name interpolated into the research prompt.

    Returns:
        The ``citations`` list from the API response, in the order returned.
        An empty list is returned when the request fails, times out, or the
        response carries no citations.
    """
    print(f"[Phase 1] Searching for top sources for: {company}...")

    url = "https://api.perplexity.ai/chat/completions"
    payload = {
        "model": "sonar-pro",
        "messages": [
            {
                "role": "system", 
                "content": "You are a financial researcher. Return high-quality, diverse URLS."
            },
            {
                "role": "user", 
                "content": f"Conduct deep research on {company} to gather high-quality sources regarding its Business Model, Product/Service Portfolio, and current Market Sentiment. Ensure the sources used are diverse, covering official documentation, industry analysis, and recent financial news."
            }
        ]
    }
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }

    try:
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the whole pipeline.
        response = requests.post(url, json=payload, headers=headers, timeout=120)
        response.raise_for_status()

        # A null/missing "citations" field is normalised to an empty list.
        urls = response.json().get("citations") or []
        print(f"Found {len(urls)} URLs.")
        return urls
    except Exception as e:
        # Best-effort step: report the failure and let the caller continue
        # with zero sources.
        print(f"Search failed: {e}")
        return []

# ==========================================
# 3. MODULE: PDF HANDLER (The "N8n Fixer")
# ==========================================
def scrape_pdf(url):
    """Download a PDF and return its extracted text behind a source header.

    Args:
        url: Direct link to the PDF document.

    Returns:
        ``"SOURCE (PDF): <url>\\nCONTENT:\\n<text>..."`` with the text of every
        page (one trailing newline per page), or ``None`` if the download or
        the parsing fails.
    """
    print(f"Detected PDF. Downloading locally: {url}...")
    headers = {"User-Agent": "Mozilla/5.0"}  # browser-like User-Agent

    try:
        response = requests.get(url, headers=headers, timeout=15)
        # Fail on 4xx/5xx here instead of feeding an HTML error page to the
        # PDF parser and reporting a misleading parse error.
        response.raise_for_status()
        reader = PdfReader(io.BytesIO(response.content))

        # All pages are read (there is no page cap). `or ""` keeps the join
        # safe should a page yield no text object.
        text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        return f"SOURCE (PDF): {url}\nCONTENT:\n{text}..."
    except Exception as e:
        # Best-effort: a single bad PDF must not abort the extraction loop.
        print(f"PDF Error: {e}")
        return None

# ==========================================
# 4. MODULE: WEB SCRAPER (Firecrawl)
# ==========================================
def scrape_web(url):
    """Scrape a web page through the Firecrawl API and return it as markdown.

    Args:
        url: Page to scrape.

    Returns:
        ``"SOURCE (WEB): <url>\\nCONTENT:\\n<markdown>..."`` on success, or
        ``None`` when Firecrawl reports failure, returns no markdown, or the
        request itself fails.
    """
    print(f"Firecrawling: {url}...")

    try:
        response = requests.post(
            "https://api.firecrawl.dev/v1/scrape",
            headers={"Authorization": f"Bearer {FIRECRAWL_API_KEY}"},
            json={
                "url": url,
                "formats": ["markdown"],
                "onlyMainContent": True,
                "timeout": 30000  # Firecrawl-side scrape budget, in milliseconds
            },
            # Client-side limit, set above the 30 s scrape budget so the API
            # can answer first; without it requests would wait indefinitely.
            timeout=60,
        )
        data = response.json()

        markdown = None
        if data.get("success"):
            # Tolerate a success payload that lacks "data"/"markdown".
            markdown = (data.get("data") or {}).get("markdown")

        if markdown is None:
            print(f"Firecrawl failed for {url}")
            return None

        return f"SOURCE (WEB): {url}\nCONTENT:\n{markdown}..."
    except Exception as e:
        # Best-effort: one failing page must not abort the extraction loop.
        print(f"Scrape Error: {e}")
        return None

# ==========================================
# 5. MODULE: GOOGLE SHEETS LOGGER
# ==========================================
def log_to_sheets(citations):
    """Append one row per citation to the Google Sheet, falling back to CSV.

    Args:
        citations: Iterable of dicts providing at least ``'url'`` and
            ``'snippet'``. Each becomes a ``[COMPANY_NAME, url, snippet]`` row.

    Any failure (missing credentials file or any other error) is reported on
    stdout and the citations are written with ``save_to_csv`` instead.
    """
    print(f"[Phase 3] Logging {len(citations)} rows to Google Sheets...")

    try:
        # Authorise with the service-account key file.
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        gs_client = gspread.authorize(
            ServiceAccountCredentials.from_json_keyfile_name(GOOGLE_CREDS_FILE, scope)
        )

        # First worksheet of the configured spreadsheet.
        worksheet = gs_client.open(SHEET_NAME).sheet1

        # Build all rows, then upload them in a single call.
        worksheet.append_rows(
            [[COMPANY_NAME, entry['url'], entry['snippet']] for entry in citations]
        )
        print("Successfully uploaded to Google Sheets.")

    except FileNotFoundError:
        print("'google_creds.json' not found. Saving to CSV instead.")
        save_to_csv(citations)
    except Exception as e:
        print(f"Sheets API Error: {e}. Saving to CSV instead.")
        save_to_csv(citations)

def save_to_csv(citations):
    """Write the citation records to ``<COMPANY_NAME>_citations.csv``.

    Args:
        citations: Records accepted by ``pandas.DataFrame`` (the caller
            passes a list of dicts).
    """
    frame = pd.DataFrame(citations)
    filename = f"{COMPANY_NAME}_citations.csv"
    frame.to_csv(filename, index=False)
    print(f"Saved locally to {filename}")

# ==========================================
# MAIN EXECUTION FLOW
# ==========================================
if __name__ == "__main__":
    # Local import: only this driver needs URL parsing.
    from urllib.parse import urlparse

    # Step 1: Search
    target_urls = get_smart_urls(COMPANY_NAME)

    final_data_pack = []
    citation_log = []

    # Step 2: Iterate & Scrape
    print(f"[Phase 2] Starting Extraction for {len(target_urls)} links...")

    for link in target_urls:
        # ROUTING LOGIC: PDF or web page? Test the URL *path* so links such as
        # "report.pdf?ver=2" are still routed to the PDF handler.
        if urlparse(link).path.lower().endswith(".pdf"):
            content = scrape_pdf(link)
        else:
            content = scrape_web(link)

        # Scrapers return None on failure; skip those links.
        if not content:
            continue

        final_data_pack.append(content)

        # Both scrapers emit "SOURCE (...): <url>\nCONTENT:\n<body>...".
        # Split on the marker rather than a hand-counted offset so the preview
        # starts at the first character of the body.
        body = content.partition("\nCONTENT:\n")[2]
        citation_log.append({
            "company": COMPANY_NAME,
            "url": link,
            "snippet": body[:130].replace("\n", " "),  # Quick preview
        })

    # Step 3: Save Raw Data for LLM (The "Fusion" Input)
    with open(f"input_data{company}.json", "w", encoding="utf-8") as f:
        json.dump({"company": COMPANY_NAME, "sources": final_data_pack}, f, indent=4)
    print(f"Saved full scraped text to 'input_data{company}.json'")

    # Step 4: Update Citations
    log_to_sheets(citation_log)

    print("\nWORKFLOW COMPLETE.")

import json

# Reload the scraped sources written by the extraction step. The file was
# written as UTF-8, so read it back with the same explicit encoding.
with open(f"input_data{company}.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(len(data["sources"]))

# Append the public data to the one-pager under a marker line. The file is
# opened in append mode, so re-running this step appends the sources again.
with open(f"{md}", "a", encoding="utf-8") as f:
    f.write("\n\nPUBLIC DATA STARTS HERE\n\n")
    # Sources are written back-to-back, with no separator between them.
    f.writelines(data["sources"])

# layer2_advanced_agent.py added gemini plotting_visuals
import os
import json
import math
import datetime
from typing import Optional, List, Dict
from dataclasses import dataclass, field
from enum import Enum
from decimal import Decimal

from pydantic import BaseModel, Field, validator
from google import genai

# -----------------------------
# CONFIG
# -----------------------------
# The model can be overridden through the GENAI_MODEL environment variable;
# the default below is a Flash model (a Pro model may reason better).
MODEL_NAME = os.environ.get("GENAI_MODEL", "gemini-2.5-flash")

# Read the key from GOOGLE_API_KEY, as the error message below instructs.
# The inline placeholder is kept as the fallback so a key pasted here still
# works when the variable is not set.
API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR API KEY")

if not API_KEY:
    # Reached when GOOGLE_API_KEY is set but empty.
    raise RuntimeError("Set GOOGLE_API_KEY environment variable before running.")

client = genai.Client(api_key=API_KEY)


# -----------------------------
# UTILITIES
# -----------------------------
def safe_number(x):
    """Coerce common numeric spellings to ``float``; return ``None`` if impossible.

    Handles plain numbers, thousands separators (``"1,234"``), accounting
    negatives (``"(5)"`` -> ``-5.0``) and a trailing magnitude unit, with or
    without a space and in any letter case: ``k``, ``m``/``mn``, ``b``/``bn``,
    ``cr``/``crore``/``crores``.

    Args:
        x: ``None``, a number (``int``/``float``/``Decimal``) or anything whose
            ``str()`` form is a number optionally followed by a unit.

    Returns:
        The value as a ``float``, or ``None`` when it cannot be parsed.
    """
    if x is None:
        return None
    if isinstance(x, (int, float, Decimal)):
        return float(x)

    units = {
        "k": 1e3,
        "m": 1e6, "mn": 1e6,
        "b": 1e9, "bn": 1e9,
        "cr": 1e7, "crore": 1e7, "crores": 1e7,
    }
    cleaned = str(x).strip().replace(",", "").replace("(", "-").replace(")", "")
    lowered = cleaned.lower()

    # Try the longest unit first so "bn" is not mistaken for a stray "n"
    # and "crore" is not cut down to "cr" + "ore".
    for unit in sorted(units, key=len, reverse=True):
        if lowered.endswith(unit):
            try:
                return float(lowered[: -len(unit)].strip()) * units[unit]
            except ValueError:
                # Not "<number><unit>" (e.g. an ordinary word ending in "m");
                # fall through to the plain-number attempt.
                break

    try:
        return float(cleaned)
    except ValueError:
        return None

# -----------------------------
# ENHANCED Pydantic Schemas
# -----------------------------
# NOTE: these models are passed to the model call as `response_schema`, so
# field names, types and Field descriptions are part of the prompt contract.
# Comments are used instead of class docstrings so the schema is untouched.

# One sourced claim. `confidence` is lower-cased and compared against
# "high" / "low" when the agent derives the report's overall confidence.
class Citation(BaseModel):
    claim: str
    source_document: str
    confidence: str  # High / Medium / Low


# Qualitative view of the business: moat, positioning, management, risks.
class StrategicAssessment(BaseModel):
    executive_synthesis: str = Field(description="A 3-4 sentence high-level summary of the opportunity, synthesizing the key strengths and overall appeal.")
    economic_moat: str = Field(description="Details on durability of competitive advantage (e.g., switching costs, network effects).")
    market_position: str
    management_quality: str = Field(description="Assessment of leadership capability, track record, and alignment with shareholders.")
    key_risks: List[str]
    growth_catalysts: List[str]
    upside_scenario: str = Field(description="What needs to go right for a multi-bagger return? (Second-order implications).")


# Narrative financial analysis. The two Optional fields have no default.
# NOTE(review): under Pydantic v2 that makes the key required (value may be
# null) - confirm this is intended.
class FinancialHealth(BaseModel):
    revenue_quality: str = Field(description="Analysis of recurring vs one-time, customer concentration, and stickiness.")
    margin_trajectory_analysis: str = Field(description="Detailed analysis of why margins are moving (operating leverage, pricing power, etc.).")
    working_cap_cycle: str
    capex_intensity: str
    solvency_risk: str = Field(description="Analysis of debt levels, coverage ratios, and ability to service debt.")
    revenue_cagr_3yr: Optional[str]
    ebitda_margin_avg: Optional[str]


# Numeric counterparts of the headline metrics. Note the near-duplicate name:
# `revenue_cagr_3y` (float) here vs `FinancialHealth.revenue_cagr_3yr` (str).
class DerivedMetrics(BaseModel):
    revenue_cagr_3y: Optional[float]
    ebitda_margin_latest: Optional[float]
    gross_margin_latest: Optional[float]
    working_cap_days_latest: Optional[float]
    roic_estimate: Optional[float]


# Declarative description of a chart attached to a slide.
class ChartSpec(BaseModel):
    chart_type: str  # e.g., "bar", "line", "area"
    x_label: str
    y_label: str
    series_names: List[str]
    notes: Optional[str] = None


# One slide: title, bullets, image cues, speaker notes and an optional chart.
# The `visuals` strings are what extract_slide_visuals() collects later.
class SlideBlock(BaseModel):
    title: str
    bullets: List[str]
    visuals: List[str]  # image cues e.g., "factory R&D lab"
    speaker_notes: str = Field(description="Comprehensive paragraph explaining the slide's narrative to an investor, covering nuances not in the bullets.")
    chart_spec: Optional[ChartSpec] = None


# -----------------------------
# Sector-specific typed containers
# -----------------------------
# One container per sector; every metric is a free-text Optional[str].
# EquityResearchReport exposes each container as an optional field, and the
# system prompt asks the model to populate only the one matching the sector.
# NOTE(review): the fields have no default, so under Pydantic v2 each key is
# presumably required whenever a container is present - confirm.

# SaaS / enterprise software metrics.
class SaaSMetrics(BaseModel):
    arr_current: Optional[str]
    net_dollar_retention: Optional[str]
    logo_churn: Optional[str]
    magic_number: Optional[str]
    rule_of_40_score: Optional[str]
    cac_payback_months: Optional[str]
    acv_trend: Optional[str]


# Manufacturing & chemicals metrics.
class ManufacturingMetrics(BaseModel):
    capacity_utilization_rate: Optional[str]
    order_book_visibility: Optional[str]
    asset_turnover: Optional[str]
    raw_material_volatility: Optional[str]
    environmental_compliance: Optional[str]
    top_5_client_concentration: Optional[str]


# Consumer (D2C) & retail metrics.
class D2CMetrics(BaseModel):
    cm1_cm2_margins: Optional[str]
    customer_cohort_retention: Optional[str]
    marketing_efficiency: Optional[str]
    omnichannel_mix: Optional[str]
    inventory_turnover: Optional[str]
    nps_sentiment: Optional[str]


# Pharma & life sciences metrics.
class PharmaMetrics(BaseModel):
    clinical_pipeline_stage: Optional[str]
    patent_cliff_analysis: Optional[str]
    addressable_patient_population: Optional[str]
    regulatory_track_record: Optional[str]
    cdmo_contract_structure: Optional[str]


# Logistics & supply chain metrics.
class LogisticsMetrics(BaseModel):
    yield_per_ton_km: Optional[str]
    fleet_ownership_mix: Optional[str]
    reverse_logistics_capability: Optional[str]
    warehousing_utilization: Optional[str]


# Closed set of supported sectors. Members are also `str`, and the values are
# the display names; the image-generation step upper-cases the serialized
# value to look up its prompt list.
class SectorType(str, Enum):
    MANUFACTURING = "Manufacturing & Chemicals"
    SAAS = "SaaS & Enterprise Tech"
    D2C = "Consumer (D2C) & Retail"
    PHARMA = "Pharma & Life Sciences"
    LOGISTICS = "Logistics & Supply Chain"


# Top-level structured output of the analyst agent. It is passed to the model
# call as `response_schema`, and the reply is parsed back with
# `model_validate_json`.
class EquityResearchReport(BaseModel):
    # meta
    sector: SectorType
    # The system prompt instructs the model to use a project codename here
    # rather than the real company name.
    company_pseudonym: str
    anonymization_notes: Optional[str] = None

    # core narrative
    investment_thesis_summary: str = Field(description="A deep, argumentative paragraph on why to invest. Focus on the core variant perception.")
    strategic_assessment: StrategicAssessment
    financial_health: FinancialHealth

    # derived metrics
    derived_metrics: DerivedMetrics

    # sector blocks (the prompt asks for only the relevant one to be filled)
    saas_metrics: Optional[SaaSMetrics] = None
    manufacturing_metrics: Optional[ManufacturingMetrics] = None
    d2c_metrics: Optional[D2CMetrics] = None
    pharma_metrics: Optional[PharmaMetrics] = None
    logistics_metrics: Optional[LogisticsMetrics] = None

    # slide-ready content
    slide_blocks: List[SlideBlock]

    # NEW FIELD FOR PLOTTING
    # Free-text trend descriptions; a later step feeds them to a second model
    # that turns them into chart JSON.
    plotting_visuals: List[str] = Field(description="A list of text descriptions describing specific data trends found in the text that can be visualized as charts. Format: 'Metric X grew/declined from A to B between Year Y and Year Z'.")

    # citations
    citations: List[Citation]

    # confidence & audit
    # `overall_confidence` is overwritten by the agent after parsing, based on
    # the per-citation confidences.
    overall_confidence: str
    audit_flags: List[str] = []


# -----------------------------
# SectorSpec (internal, for prompts)
# -----------------------------
# Internal, prompt-side description of a sector. Only `name` and `heuristics`
# are read by AdvancedAnalystAgentV2.analyze_markdown in the visible code;
# the remaining fields are carried as reference data.
@dataclass
class SectorSpec:
    name: SectorType                # sector this spec describes
    required_metrics: List[str]     # metrics expected in the source document
    derived_metrics: List[str]      # metrics to compute from the required ones
    heuristics: List[str]           # rules of thumb injected into the system prompt
    slide_template: List[dict]      # dicts with "title" and "bullets" keys


# Registry of sector knowledge keyed by SectorType. The agent looks up the
# spec for the detected (or overridden) sector and injects its heuristics
# into the system prompt, so every SectorType member needs an entry here.
SECTOR_SPECS = {
    SectorType.SAAS: SectorSpec(
        name=SectorType.SAAS,
        required_metrics=["ARR", "NDR", "Logo churn", "CAC", "Gross margin"],
        derived_metrics=["Rule of 40", "Magic Number", "CAC payback"],
        heuristics=[
            "NDR > 100% indicates expansion revenue",
            "Rule of 40 = growth% + margin, target > 40",
            "CAC payback under 18 months is healthy"
        ],
        slide_template=[
            {"title": "Business Snapshot", "bullets": ["ARR, GTM, product mix"]},
            {"title": "Growth & Unit Economics", "bullets": ["ARR growth, NDR, CAC payback"]},
            {"title": "Investment Highlights", "bullets": ["Moat, risks, catalysts"]}
        ]
    ),
    SectorType.MANUFACTURING: SectorSpec(
        name=SectorType.MANUFACTURING,
        required_metrics=["Revenue", "EBITDA", "Capacity utilization", "Order book"],
        derived_metrics=["Asset turnover", "Working capital days"],
        heuristics=[
            "Capacity utilization > 80% indicates pricing power",
            "Order book coverage >6 months de-risks demand",
            "Asset turnover > 1.5x implies efficient capital use"
        ],
        slide_template=[
            {"title": "Profile & Infrastructure", "bullets": ["facilities, certifications"]},
            {"title": "Financial & Operational Scale", "bullets": ["revenue growth, margins"]},
            {"title": "Investment Highlights", "bullets": ["moat, client base, export mix"]}
        ]
    ),
    SectorType.D2C: SectorSpec(
        name=SectorType.D2C,
        required_metrics=["Repeat rate", "AOV", "CAC", "Gross margin"],
        derived_metrics=["LTV/CAC", "Unit economics"],
        heuristics=[
            "Repeat rate > 30% and LTV/CAC > 3x are healthy",
            "AOV and CAC tradeoff critical to scale"
        ],
        slide_template=[
            {"title": "Brand & Channels", "bullets": ["portfolio, marketplace presence"]},
            {"title": "Unit Economics", "bullets": ["repeat rate, LTV/CAC, AOV"]},
            {"title": "Highlights", "bullets": ["growth levers, whitespace"]}
        ]
    ),
    SectorType.PHARMA: SectorSpec(
        name=SectorType.PHARMA,
        required_metrics=["Pipeline by phase", "Patent cliff dates", "R&D spend"],
        derived_metrics=["Addressable population", "Peak sales potential"],
        heuristics=[
            "Phase III readouts materially de-risk revenue",
            "Patent cliffs within 3 years are negative",
            "R&D spend > 15% of revenue is typical for innovation-led firms"
        ],
        slide_template=[
            {"title": "R&D & Pipeline", "bullets": ["lead assets, timelines"]},
            {"title": "Regulatory & Financial", "bullets": ["R&D intensity, patent cliffs"]},
            {"title": "Highlights", "bullets": ["licensing, partnerships"]}
        ]
    ),
    SectorType.LOGISTICS: SectorSpec(
        name=SectorType.LOGISTICS,
        required_metrics=["Ton-km yield", "Fleet utilization", "Warehouse utilization"],
        derived_metrics=["Yield per ton-km", "Operating leverage"],
        heuristics=[
            "Owned fleet increases capex but lowers unit cost at scale",
            "Warehouse utilization > 85% indicates asset scarcity"
        ],
        slide_template=[
            {"title": "Operational Footprint", "bullets": ["fleet, warehouses"]},
            {"title": "Unit Economics", "bullets": ["yield/ton-km, utilization"]},
            {"title": "Highlights", "bullets": ["contract tenor, client concentration"]}
        ]
    )
}


# -----------------------------
# AdvancedAnalystAgent (core)
# -----------------------------
class AdvancedAnalystAgentV2:
    """Turns a company markdown document into a structured EquityResearchReport.

    The agent detects (or is told) the sector, builds a sector-aware prompt,
    asks the generative model for JSON matching ``EquityResearchReport`` and
    applies a few post-processing safeguards to the parsed result.
    """

    def __init__(self, genai_client: genai.Client, model: str = MODEL_NAME):
        """Store the client, the model name and the sector registry."""
        self.client = genai_client
        self.model = model
        self.sector_specs = SECTOR_SPECS

    def detect_sector(self, markdown_content: str) -> SectorType:
        """Guess the sector from keywords in the document.

        Keywords are matched at word boundaries: acronyms must be whole words,
        ordinary terms may carry a suffix ("plant" also matches "plants").
        A bare substring test would let "arr" match "carry" or "array" and
        classify almost any document as SaaS.

        Sectors are tested in a fixed order (SaaS, Pharma, D2C, Logistics,
        Manufacturing); the first hit wins and Manufacturing is the fallback.
        """
        import re  # local import: only this method needs regular expressions

        text = markdown_content.lower()

        def has_acronym(*terms: str) -> bool:
            # Whole-word match, for short tokens that occur inside other words.
            return any(re.search(rf"\b{re.escape(term)}\b", text) for term in terms)

        def has_term(*terms: str) -> bool:
            # Word-start match; inflected forms still count.
            return any(re.search(rf"\b{re.escape(term)}", text) for term in terms)

        if has_acronym("arr", "saas") or has_term("subscription"):
            return SectorType.SAAS
        if has_term("phase iii", "clinical", "patent") or has_acronym("fda"):
            return SectorType.PHARMA
        if has_acronym("aov", "d2c") or has_term("repeat rate", "e-commerce"):
            return SectorType.D2C
        if has_term("fleet", "ton-km", "logistics"):
            return SectorType.LOGISTICS
        if has_term("plant", "capacity utilization", "factory"):
            return SectorType.MANUFACTURING
        # Default fallback
        return SectorType.MANUFACTURING

    def analyze_markdown(self, markdown_path: str, sector_override: Optional[SectorType] = None) -> EquityResearchReport:
        """Analyze a markdown file and return the validated report.

        Args:
            markdown_path: Path of the UTF-8 markdown document to analyze.
            sector_override: Sector to use instead of keyword detection.

        Returns:
            The parsed ``EquityResearchReport`` with safeguards applied:
            a fallback slide when none were produced, and an
            ``overall_confidence`` recomputed from the citations.

        Raises:
            RuntimeError: If ``markdown_path`` does not exist.
            Exception: Whatever parsing/validation error the model response
                triggers is printed and re-raised unchanged.
        """
        try:
            with open(markdown_path, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        except FileNotFoundError as err:
            raise RuntimeError(f"Markdown file not found at: {markdown_path}") from err

        # Detect sector from markdown content
        sector = sector_override or self.detect_sector(markdown_content)
        spec = self.sector_specs[sector]

        # ENHANCED SYSTEM INSTRUCTION WITH PLOTTING DIRECTIVE
        system_instruction = f"""
You are a Senior Partner at a top-tier Private Equity firm (e.g., Blackstone, Sequoia).
Your task is to produce a deep, critical investment memo based on the provided raw markdown document about a company.

Target sector: {sector.value}

INSTRUCTIONS FOR DEPTH & QUALITY:
1. **Go Beyond Facts:** Do not just summarize. Analyze. If revenue grew, explain *why* (price vs volume) and if it is sustainable.
2. **Second-Order Thinking:** Connect disparate facts. Example: "High R&D spend (Fact A) combined with new patent filings (Fact B) suggests a defensive moat is forming."
3. **Critical & Skeptical:** Highlight inconsistencies. If margins are high but cash flow is low, flag it in the financial health analysis.
4. **Slide Narrative:** The 'bullets' must be punchy (no word limit, but keep them impactful), but the 'speaker_notes' must provide the full, nuanced argument for the investor to read.
5. **Pseudonym:** NEVER reveal the real company name. Create a realistic project codename.

**SPECIAL INSTRUCTION: DATA VISUALIZATION EXTRACTION**
Scan the text for any data that represents a trend, time-series, or comparison. Extract these into the 'plotting_visuals' field as descriptive strings.
Example formats:
- "Revenue grew linearly from 100M in 2020 to 150M in 2023."
- "EBITDA margins fluctuated: 10% (2021), 12% (2022), 8% (2023)."
- "Segment breakdown: Pharma (60%), Consumer (30%), Others (10%)."

SECTOR INTELLIGENCE ({spec.name.value}):
Apply these heuristics to your analysis:
{chr(10).join([f"- {h}" for h in spec.heuristics])}

OUTPUT RULES:
- Populate the EquityResearchReport JSON schema exactly.
- For every factual numeric claim in 'citations', identify the source from the markdown.
- Populate ONLY the relevant sector_metrics field (e.g., 'saas_metrics' if sector is SaaS) and leave others as None.
"""

        # Assemble the prompt
        prompt_body = {
            "markdown_document": markdown_content,
            "instructions": "Analyze the document. Prioritize insight over brevity. Fill all 'analysis' and 'notes' fields with detailed reasoning.",
            "target_schema_example_sector": spec.name.value,
        }

        # The system instruction travels inside the JSON contents payload.
        contents = json.dumps({"system_instruction": system_instruction, "payload": prompt_body}, indent=2)

        # Call the model, constraining the reply to the report schema.
        response = self.client.models.generate_content(
            model=self.model,
            contents=contents,
            config={
                "response_mime_type": "application/json",
                "response_schema": EquityResearchReport,
            },
        )

        # Validate and parse
        try:
            text_out = response.text
            report = EquityResearchReport.model_validate_json(text_out)
        except Exception as e:
            # Surface malformed/invalid JSON to the caller after logging it.
            print(f"Error parsing model response: {e}")
            raise

        # Post-processing / Safeguards
        if not report.slide_blocks:
            report.slide_blocks = [
                SlideBlock(title="Overview", bullets=["Key information from markdown."], visuals=[], speaker_notes="GenAI failed to produce specific slides."),
            ]

        if report.citations:
            # All "high" -> High; any "low" -> Low; everything else -> Medium.
            confidences = [c.confidence.strip().lower() for c in report.citations]
            if all(c == "high" for c in confidences):
                report.overall_confidence = "High"
            elif any(c == "low" for c in confidences):
                report.overall_confidence = "Low"
            else:
                report.overall_confidence = "Medium"
        else:
            # No citations came back: keep the list non-empty for consumers,
            # but record the gap and do not let the placeholder lift the
            # confidence rating.
            report.citations = [Citation(claim="Information from markdown document", source_document="markdown_input.md", confidence="Medium")]
            report.overall_confidence = "Low"
            report.audit_flags.append("No citations found")

        return report


# -----------------------------
# Example runner
# -----------------------------
if __name__ == "__main__":
    agent = AdvancedAnalystAgentV2(client, model=MODEL_NAME)

    # Example path - change this to your actual file
    markdown_file_path = f"/workspaces/codespaces-blank/PROJECT KELP/{md}"

    if os.path.exists(markdown_file_path):
        report = agent.analyze_markdown(markdown_file_path)
        report_json = report.model_dump_json(indent=2, ensure_ascii=False)
        print(report_json)

        # Persist only when a report was actually produced; `report` does not
        # exist on the file-not-found path.
        with open(f"layer2_withplot{company}.json", "w", encoding="utf-8") as f:
            f.write(report_json)
    else:
        print(f"File not found: {markdown_file_path}")
from groq import Groq
import json
# Load the layer-2 report; its "plotting_visuals" list feeds the chart prompt.
with open(f"layer2_withplot{company}.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Read the Groq key from the GROQ_API_KEY environment variable so that no
# secret is stored in the notebook. Note: this rebinds the module-level
# `client` name that previously held the GenAI client.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Prompt for the chart-extraction model. The report's "plotting_visuals" list
# is interpolated as its Python repr; doubled braces are literal braces in the
# JSON example.
# NOTE(review): the "Structure for KPIs" rule asks for `label` and `value` on
# the entry, while the example nests them in a `data` list - confirm which
# shape the chart renderer expects.
prompt = f"""
**Role:**
You are a Senior Data Analyst and JSON Architect.

**Objective:**
I have a list of raw text strings containing financial data, market growth stats, and operational metrics. Your task is to:
1. **Analyze** each string to understand the data relationship (Time Series, Comparison, Part-to-Whole, or Single KPI).
2. **Classify** the appropriate chart type for visualization based on the rules below.
3. **Extract** the data into a structured JSON format.

**Input Data:**
{data["plotting_visuals"]}

**Chart Selection Rules (Heuristics):**
- **"line_chart"**: For multi-year trends (3+ data points). Example: Revenue from 2016-2024.
- **"column_chart"**: For comparing values across 2-3 specific periods. Example: Sales in 2024 vs 2025.
- **"bar_chart"**: For comparing different categories or market sizes. Example: Market A vs Market B size.
- **"kpi_card"**: For single distinct numbers or percentage jumps. Example: "Net profit rose 832%".

**Output Requirements:**
- Return **ONLY valid JSON**. No markdown formatting, no explanations.
- The root object should be a dictionary where keys are descriptive IDs (e.g., "revenue_trend", "market_growth").
- Each entry MUST have a `type` field (line_chart, column_chart, bar_chart, or kpi_card).
- **Structure for Charts (line/column/bar):** Must include `title`, `labels` (x-axis), and `datasets` (y-axis values).
- **Structure for KPIs:** Must include `label` and `value`.

**Target JSON Structure Example:**
{{
    "revenue_trend": {{
        "type": "line_chart",
        "title": "Revenue Growth (2016-2024)",
        "labels": ["2016", "2017", "2018", "2022", "2023", "2024"],
        "datasets": [
            {{
                "label": "Revenue (INR Cr)",
                "data": [3077.48, 2672.03, 2705.71, 4017.6, 4109.58, 5022.48]
            }}
        ]
    }},
    "market_snapshot": {{
        "type": "bar_chart",
        "title": "Nutraceutical Market Sizes (2024)",
        "labels": ["Plant Based", "Total Nutraceutical"],
        "datasets": [
            {{
                "label": "Market Size ($M)",
                "data": [100, 6110]
            }}
        ]
    }},
    "profit_jump_kpi": {{
        "type": "kpi_card",
        "data": [
            {{"label": "Q1 FY25 Net Profit Growth", "value": "832.98%"}}
        ]
    }}
}}

"""
# Ask the model to turn the trend descriptions into chart-ready JSON.
groq_card = client.chat.completions.create(
    messages=[
        {
            # System message first, and aligned with the role the prompt
            # itself assigns, so the two do not contradict each other.
            "role": "system",
            "content": "You are a Senior Data Analyst and JSON Architect. Output JSON",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ],
    model="openai/gpt-oss-120b",
    # JSON mode: the reply is requested as a single JSON object.
    response_format={"type": "json_object"},
)

print(groq_card.choices[0].message.content)
    # return groq_card.choices[0].message.content
import json

# 1. Read the existing file
# Merge the chart JSON returned by the model into the layer-2 report file.

# 1. Read the existing report; start from an empty dict when the file is
#    missing or does not contain valid JSON.
try:
    with open(f"layer2_withplot{company}.json", "r", encoding="utf-8") as f:
        existing_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    existing_data = {}  # Start fresh if file doesn't exist or is empty

# 2. Parse the model reply (a JSON string) into a dict.
new_data_str = groq_card.choices[0].message.content
try:
    new_data = json.loads(new_data_str)
except json.JSONDecodeError:
    # Fallback: strip a surrounding ```json ... ``` code fence and retry.
    # A second failure propagates to the caller.
    import re
    clean_str = re.sub(r"^```json\s*|\s*```$", "", new_data_str.strip(), flags=re.MULTILINE)
    new_data = json.loads(clean_str)

# 3. Store the charts under a dedicated "zplotting" key instead of merging
#    them into the top level of the report.
# existing_data.update(new_data)
existing_data["zplotting"] = new_data

# 4. Write back to file (overwrites it with the combined JSON).
with open(f"layer2_withplot{company}.json", "w", encoding="utf-8") as f:
    json.dump(existing_data, f, indent=4, ensure_ascii=False)
import json

def extract_slide_visuals(json_file_path):
    """Collect every ``visuals`` entry from the report's slide blocks.

    Args:
        json_file_path: Path to a JSON report with an optional top-level
            ``"slide_blocks"`` list.

    Returns:
        A flat list with the visuals of all slides, in slide order. The list
        is empty when the file is missing, is not valid JSON, or has no
        ``"slide_blocks"`` key; each of these cases is reported on stdout.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            report = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File '{json_file_path}' not found.")
        return []
    except json.JSONDecodeError:
        print(f"Error: Failed to parse JSON in '{json_file_path}'.")
        return []

    collected = []

    # Guard clause: nothing to scan without slide blocks.
    if "slide_blocks" not in report:
        print("No 'slide_blocks' found in the JSON.")
        return collected

    slides = report["slide_blocks"]
    print(f"Found {len(slides)} slide blocks. Scanning for visuals...")

    for slide_number, slide in enumerate(slides, start=1):
        # Skip slides whose 'visuals' key is absent or empty.
        if "visuals" not in slide or not slide["visuals"]:
            continue

        cues = slide["visuals"]
        title = slide.get('title', 'Untitled')
        print(f"  -> Found {len(cues)} items in Slide {slide_number} ('{title}')")

        # Append this slide's cues to the master list.
        collected.extend(cues)

    return collected

# --- Run the Extraction ---
# Collect the image cues from the merged report and list them, one bullet per
# cue. extract_slide_visuals() returns [] on any read/parse problem, so the
# loop below simply prints nothing in that case.
all_visual_strings = extract_slide_visuals(f"layer2_withplot{company}.json")

print("\n--- Extracted Visuals Content ---")
for item in all_visual_strings:
    print(f"• {item}")

import os
import json
from huggingface_hub import InferenceClient

# ===============================
# CONFIG
# ===============================
# Report produced by the analysis steps; its "sector" field selects the prompts.
INPUT_JSON = f"layer2_withplot{company}.json"
# Root folder for generated images (one sub-folder per sector).
OUTPUT_DIR = "sector_visuals"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"

# The token is read from the HF_TOKEN environment variable so that no secret
# is stored in the notebook. Note: this rebinds the module-level `client` name.
client = InferenceClient(
    provider="nscale",
    api_key=os.environ.get("HF_TOKEN"),
)


# ===============================
# LOAD SECTOR
# ===============================
# Read the sector chosen by the analysis step. It is upper-cased because the
# SECTOR_PROMPTS keys below are upper-case.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

sector = data["sector"].upper()


# ===============================
# PROMPT MAP (PHYSICAL IMAGES ONLY)
# ===============================


# Mixed-case sector display names (same text as the SectorType enum values).
# NOTE(review): these constants are not referenced in the visible code -
# confirm whether they are still needed.
MANUFACTURING = "Manufacturing & Chemicals"
SAAS = "SaaS & Enterprise Tech"
D2C = "Consumer (D2C) & Retail"
PHARMA = "Pharma & Life Sciences"
LOGISTICS = "Logistics & Supply Chain"


# Upper-cased sector name -> text-to-image prompts; generate_sector_images()
# renders one image per prompt.
SECTOR_PROMPTS = {
    "SAAS & ENTERPRISE TECH": [
        "Modern SaaS company office, developers collaborating, laptops, large digital screens, startup environment, realistic photography",
        "High-tech software workspace, cloud infrastructure theme, professionals working together, cinematic lighting"
    ],

    "MANUFACTURING & CHEMICALS": [
        "Large-scale manufacturing facility, heavy machinery, industrial workers wearing safety helmets, realistic industrial photography",
        "Automated production line inside a factory, metal components, sparks, cinematic realism"
    ],

    "CONSUMER (D2C) & RETAIL": [
        "Direct-to-consumer warehouse, branded packaging, workers preparing online orders, modern ecommerce logistics",
        "Lifestyle product shoot, premium consumer goods, clean studio lighting, modern brand aesthetic"
    ],

    "PHARMA & LIFE SCIENCES": [
        "Pharmaceutical manufacturing plant, sterile clean room, scientists in lab coats, advanced machinery, realistic lighting",
        "Medicine production laboratory, quality control testing, professional pharmaceutical environment"
    ],

    "LOGISTICS & SUPPLY CHAIN": [
        "Large logistics hub with cargo trucks and warehouses, sunrise lighting, realistic commercial photography",
        "Shipping port in India, cargo containers, cranes, ships in blue ocean, global trade atmosphere"
    ]
}


# ===============================
# IMAGE GENERATION
# ===============================
def generate_sector_images(sector: str):
    """Generate and save one JPEG per prompt configured for ``sector``.

    Images are written to ``<OUTPUT_DIR>/<sector>/image_<n>.jpg`` with ``n``
    starting at 1; the directory is created if it does not exist.

    Args:
        sector: Key into ``SECTOR_PROMPTS``.

    Raises:
        ValueError: If ``sector`` has no (or an empty) entry in
            ``SECTOR_PROMPTS``.
    """
    prompt_list = SECTOR_PROMPTS.get(sector)
    if not prompt_list:
        raise ValueError(f"Unknown sector: {sector}")

    target_dir = os.path.join(OUTPUT_DIR, sector)
    os.makedirs(target_dir, exist_ok=True)

    for number, text in enumerate(prompt_list, start=1):
        print(f"Generating image {number} for sector: {sector}")

        # The returned object is saved through its `.save(...)` method
        # (presumably a PIL image — confirm against the client library).
        picture = client.text_to_image(prompt=text, model=MODEL_ID)

        destination = os.path.join(target_dir, f"image_{number}.jpg")
        picture.save(destination, format="JPEG", quality=95)

        print(f"Saved → {destination}")



[Phase 1] Searching for top sources for: KSolves...
Found 10 URLs.
[Phase 2] Starting Extraction for 10 links...
Firecrawling: https://www.ksolves.com...
Firecrawling: https://www.ksolves.com/about-us-ksolves...
Firecrawling: https://www.zoominfo.com/c/ksolves-llc/368475708...
Firecrawling: https://www.ksolves.com/blog/ksolves/ksolves-in-2025-a-year-that-redefined-digital-transformation...
Firecrawling: https://www.ksolves.com/investors...
Firecrawling: https://www.ksolves.com/java-development-company...
Detected PDF. Downloading locally: https://www.ksolves.com/wp-content/uploads/2025/05/Ksolves-Investor-Presentation-FY-2024-25.pdf...
Firecrawling: https://www.ksolves.com/blog/ksolves/it-culture-in-indore-find-amazing-opportunities-at-every-corner...
Firecrawling: https://www.ksolves.com/request-for-proposal...
Firecrawling: https://store.ksolves.com/aboutus...
Saved full scraped text to 'input_dataKSolves.json'
[Phase 3] Logging 10 rows to Google Sheets...
'google_creds.json' not fou

In [44]:
# Render and save the prompt images for the current value of the `sector` global.
generate_sector_images(sector)

Generating image 1 for sector: SAAS & ENTERPRISE TECH
Saved → sector_visuals/SAAS & ENTERPRISE TECH/image_1.jpg
Generating image 2 for sector: SAAS & ENTERPRISE TECH
Saved → sector_visuals/SAAS & ENTERPRISE TECH/image_2.jpg


In [41]:
# Manual override of the notebook globals: `company` is interpolated into the
# deck/JSON file names and `sector` into the sector_visuals image paths.
# `sector` must be one of the upper-case keys of SECTOR_PROMPTS.
company = "Connplex"
sector = "CONSUMER (D2C) & RETAIL"

In [45]:
import json
import os
import re
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.shapes import MSO_SHAPE
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.chart.data import CategoryChartData
from pptx.enum.chart import XL_CHART_TYPE, XL_LEGEND_POSITION
import random

# =============================================================================
# 1. DESIGN SYSTEM (KELP BRANDING)
# =============================================================================
# Brand palette used by every slide element.
_KELP_COLORS = {
    "primary": RGBColor(75, 0, 130),        # Dark Indigo
    "accent_pink": RGBColor(255, 20, 147),  # Pink
    "accent_cyan": RGBColor(0, 255, 255),   # Cyan
    "dark_grey": RGBColor(64, 64, 64),      # Text
    "light_grey": RGBColor(245, 245, 245),  # Backgrounds
    "white": RGBColor(255, 255, 255),
}

# Typefaces for headings and body copy.
_KELP_FONTS = {
    "header": "Arial",
    "body": "Arial",
}

# Single lookup table consumed as THEME['colors'][...] / THEME['fonts'][...].
THEME = {
    "colors": _KELP_COLORS,
    "fonts": _KELP_FONTS,
}

class KelpDynamicGenerator:
    """Build the three-slide Kelp teaser deck from a layer-2 JSON file.

    Typical use::

        gen = KelpDynamicGenerator(json_path)
        gen.create_slide_1()
        gen.create_slide_2()
        gen.create_slide_3()
        gen.save()

    The class reads the module-level ``THEME`` for colours/fonts and, in
    ``create_slide_1``, the module-level ``sector`` for the image paths.
    """

    # Matches signed decimals and integers. A sign in front of an integer is
    # honoured only when it is not glued to a preceding digit, so "-5" parses
    # as -5 while a range such as "10-20" still yields 10 and 20.
    _NUMBER_RE = re.compile(r"[-+]?\d*\.\d+|(?:(?<!\d)[-+])?\d+")

    # Substrings of a zplotting item's "type" that mark it as a chart.
    _CHART_TYPE_TAGS = ("line_chart", "bar_chart", "column_chart")

    def __init__(self, json_path, output_file=None):
        """Load the JSON payload and create an empty 16:9 presentation.

        Args:
            json_path: Path of the JSON file to read. If the file does not
                exist an error is printed and ``self.data`` is left empty, so
                the slide builders fall back to their defaults.
            output_file: Destination .pptx path. When omitted it is derived
                from the current value of the module-level ``company`` at
                construction time (not at class-definition time, which would
                go stale when the notebook global changes).
        """
        if output_file is None:
            output_file = self._default_output_file()
        self.output_file = output_file

        # Load JSON Data
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                self.data = json.load(f)
        except FileNotFoundError:
            print(f"❌ Error: File '{json_path}' not found.")
            self.data = {}

        self.prs = Presentation()
        # Set to Widescreen (16:9)
        self.prs.slide_width = Inches(13.333)
        self.prs.slide_height = Inches(7.5)

    @staticmethod
    def _default_output_file():
        """Return the default deck name built from the ``company`` global."""
        return f"Kelp_Auto_Teaser{globals().get('company', '')}.pptx"

    # -------------------------------------------------------------------------
    # LAYOUT & BRANDING HELPERS
    # -------------------------------------------------------------------------
    def _add_master_elements(self, slide, title_text):
        """Add the logo text, slide title, pink accent line and footer.

        Args:
            slide: Slide to decorate.
            title_text: Title shown upper-cased; a falsy value is replaced by
                "INVESTMENT BRIEF".
        """
        # 1. Logo
        box = slide.shapes.add_textbox(Inches(0.5), Inches(0.2), Inches(2.0), Inches(0.5))
        p = box.text_frame.paragraphs[0]
        p.text = "KELP"
        p.font.bold = True
        p.font.size = Pt(22)
        p.font.name = THEME['fonts']['header']
        p.font.color.rgb = THEME['colors']['primary']

        # 2. Title
        box = slide.shapes.add_textbox(Inches(0.5), Inches(0.65), Inches(10), Inches(0.8))
        p = box.text_frame.paragraphs[0]
        p.text = title_text.upper() if title_text else "INVESTMENT BRIEF"
        p.font.bold = True
        p.font.size = Pt(24)
        p.font.name = THEME['fonts']['header']
        p.font.color.rgb = THEME['colors']['primary']

        # 3. Accent Line (full slide width, borderless)
        shape = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE, Inches(0), Inches(1.4), Inches(13.333), Inches(0.08)
        )
        shape.fill.solid()
        shape.fill.fore_color.rgb = THEME['colors']['accent_pink']
        shape.line.fill.background()

        # 4. Footer
        box = slide.shapes.add_textbox(
            Inches(0), Inches(7.1), Inches(13.333), Inches(0.4)
        )
        p = box.text_frame.paragraphs[0]
        p.text = "Strictly Private & Confidential – Prepared by Kelp M&A Team"
        p.font.size = Pt(9)
        p.alignment = PP_ALIGN.CENTER
        p.font.color.rgb = THEME['colors']['dark_grey']

    def _create_content_box(self, slide, x, y, w, h, title, bullets):
        """Draw a coloured header bar with a bulleted text block beneath it.

        Args:
            slide: Slide to draw on.
            x, y, w, h: Position and size of the whole box (header included).
            title: Header text.
            bullets: Iterable of items; each becomes one "• ..." paragraph.
        """
        # Header
        shape = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, w, Inches(0.35))
        shape.fill.solid()
        shape.fill.fore_color.rgb = THEME['colors']['primary']
        shape.line.fill.background()

        p = shape.text_frame.paragraphs[0]
        p.text = title
        p.font.bold = True
        p.font.size = Pt(11)
        p.font.color.rgb = THEME['colors']['white']

        # Content
        textbox = slide.shapes.add_textbox(x, y + Inches(0.35), w, h - Inches(0.35))
        tf = textbox.text_frame
        tf.word_wrap = True
        tf.margin_top = Pt(5)

        for index, item in enumerate(bullets):
            # A new text frame already holds one empty paragraph; reuse it for
            # the first bullet so the box does not start with a blank line.
            p = tf.paragraphs[0] if index == 0 else tf.add_paragraph()
            p.text = f"• {item}"
            p.font.size = Pt(10)
            p.font.name = THEME['fonts']['body']
            p.font.color.rgb = THEME['colors']['dark_grey']
            p.space_after = Pt(4)

    def _place_image(self, slide, path, x, y, w, h):
        """Place the image at ``path`` with a cyan border, or a grey placeholder.

        The placeholder is used when ``path`` is empty, does not exist, or the
        picture cannot be inserted (the failure is reported, not hidden).
        """
        if path and os.path.exists(path):
            try:
                slide.shapes.add_picture(path, x, y, w, h)
            except Exception as exc:  # best effort: unreadable image -> placeholder
                print(f"⚠️ Could not place image '{path}': {exc}")
            else:
                # Border
                rect = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, w, h)
                rect.fill.background()
                rect.line.color.rgb = THEME['colors']['accent_cyan']
                rect.line.width = Pt(1.5)
                return

        # Placeholder
        shape = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, w, h)
        shape.fill.solid()
        shape.fill.fore_color.rgb = RGBColor(230, 230, 230)
        shape.text = "[Image Insert]\n(Full Bleed)"
        shape.text_frame.paragraphs[0].font.color.rgb = THEME['colors']['primary']

    # -------------------------------------------------------------------------
    # DYNAMIC CHART & KPI ENGINE
    # -------------------------------------------------------------------------
    def _clean_data_values(self, raw_data):
        """Return the numeric values found in ``raw_data`` as a flat list.

        Accepts a list (numbers are kept as-is, strings are scanned for
        numbers) or a single string, including 'smashed' strings such as
        '4251.810908.2'. Any other input yields an empty list. Numbers
        extracted from strings are returned as floats.
        """
        cleaned = []
        if isinstance(raw_data, list):
            for item in raw_data:
                if isinstance(item, (int, float)):
                    cleaned.append(item)
                elif isinstance(item, str):
                    cleaned.extend(float(n) for n in self._NUMBER_RE.findall(item))
        elif isinstance(raw_data, str):
            cleaned.extend(float(n) for n in self._NUMBER_RE.findall(raw_data))
        return cleaned

    def _render_chart_from_data(self, slide, plot_data, x, y, w, h):
        """Add a line, bar or column chart built from one 'zplotting' object.

        ``plot_data`` is read for "labels" (categories), "datasets" (each with
        "label" and "data"), "type" and "title".
        """
        chart_data = CategoryChartData()

        # 1. Categories
        categories = plot_data.get("labels") or []
        chart_data.categories = categories

        # 2. Series Data
        # Never pass more points than there are categories; shorter series are
        # passed through unchanged (no padding is applied).
        limit = len(categories)

        for ds in plot_data.get("datasets") or []:
            clean_vals = self._clean_data_values(ds.get("data", []))
            chart_data.add_series(ds.get("label", "Series"), clean_vals[:limit])

        # 3. Detect Chart Type (anything that is not line/bar is drawn as columns)
        ptype = str(plot_data.get("type") or "column_chart").lower()
        if "line" in ptype:
            xl_type = XL_CHART_TYPE.LINE
        elif "bar" in ptype:
            xl_type = XL_CHART_TYPE.BAR_CLUSTERED
        else:
            xl_type = XL_CHART_TYPE.COLUMN_CLUSTERED

        # 4. Create Chart
        chart = slide.shapes.add_chart(xl_type, x, y, w, h, chart_data).chart

        # 5. Styling
        chart.has_title = True
        chart.chart_title.text_frame.text = plot_data.get("title", "Growth Trend")
        chart.chart_title.text_frame.paragraphs[0].font.size = Pt(11)
        chart.chart_title.text_frame.paragraphs[0].font.bold = True

        chart.has_legend = True
        chart.legend.position = XL_LEGEND_POSITION.BOTTOM
        chart.legend.include_in_layout = False
        chart.legend.font.size = Pt(9)

    def _render_kpi_card(self, slide, kpi_data, x, y, w, h):
        """Render one KPI card: bold value on top, small label underneath.

        Expected structure: ``"data": [{"label": "...", "value": "..."}]``.
        A missing, empty or malformed "data" entry renders "N/A" / "Metric"
        instead of raising.
        """
        entries = kpi_data.get("data") or [{}]
        inner_data = entries[0] if isinstance(entries, (list, tuple)) else entries
        if not isinstance(inner_data, dict):
            inner_data = {}
        value = str(inner_data.get("value", "N/A"))
        label = str(inner_data.get("label", "Metric"))

        # Card Shape
        shape = slide.shapes.add_shape(MSO_SHAPE.ROUNDED_RECTANGLE, x, y, w, h)
        shape.fill.solid()
        shape.fill.fore_color.rgb = THEME['colors']['light_grey']
        shape.line.color.rgb = THEME['colors']['accent_cyan']
        shape.line.width = Pt(1)

        # Value
        tf = shape.text_frame
        p = tf.paragraphs[0]
        p.text = value
        p.font.bold = True
        p.font.size = Pt(14)
        p.font.color.rgb = THEME['colors']['primary']
        p.alignment = PP_ALIGN.CENTER

        # Label
        p2 = tf.add_paragraph()
        p2.text = label
        p2.font.size = Pt(9)
        p2.font.color.rgb = THEME['colors']['dark_grey']
        p2.alignment = PP_ALIGN.CENTER

    # -------------------------------------------------------------------------
    # SLIDE 1: BUSINESS PROFILE
    # -------------------------------------------------------------------------
    def create_slide_1(self):
        """Add the business-profile slide: summary, strategy and two images.

        Image paths are built from the module-level ``sector`` at call time.
        """
        slide = self.prs.slides.add_slide(self.prs.slide_layouts[6])
        name = self.data.get("company_pseudonym", "Target Company")
        self._add_master_elements(slide, f"Business Profile: {name}")

        # 1. Thesis / Overview: one bullet per sentence, at most six.
        thesis = self.data.get("investment_thesis_summary") or ""
        sentences = (s.strip() for s in re.split(r'(?<=[.])\s', thesis))
        bullets = [s for s in sentences if s][:6]

        self._create_content_box(
            slide, Inches(0.5), Inches(1.6), Inches(7.5), Inches(3.0),
            "EXECUTIVE SUMMARY", bullets
        )

        # 2. Strategy
        strat = self.data.get("strategic_assessment") or {}
        # key_risks may be missing or an empty list; show only the first risk.
        risks = strat.get('key_risks') or ['N/A']
        first_risk = risks[0] if isinstance(risks, (list, tuple)) else risks
        strat_points = [
            f"Moat: {strat.get('economic_moat', 'N/A')}",
            f"Position: {strat.get('market_position', 'N/A')}",
            f"Risk: {first_risk}"
        ]
        self._create_content_box(
            slide, Inches(0.5), Inches(4.8), Inches(7.5), Inches(2.2),
            "STRATEGIC OVERVIEW", strat_points
        )

        # 3. Visuals (Right); a placeholder is drawn when a file is missing.
        self._place_image(slide, f"/workspaces/codespaces-blank/PROJECT KELP/sector_visuals/{sector}/image_1.jpg", Inches(8.2), Inches(1.6), Inches(4.6), Inches(2.5))
        self._place_image(slide, f"/workspaces/codespaces-blank/PROJECT KELP/sector_visuals/{sector}/image_2.jpg", Inches(8.2), Inches(4.3), Inches(4.6), Inches(2.7))

    # -------------------------------------------------------------------------
    # SLIDE 2: FINANCIALS (UPDATED FOR ZPLOTTING)
    # -------------------------------------------------------------------------
    def _collect_plot_items(self):
        """Split the 'zplotting' entries into (kpi_list, chart_list).

        An entry is a KPI when its "type" contains "kpi", and a chart when it
        contains "line_chart", "bar_chart" or "column_chart". Entries that are
        not dicts, or match neither, are ignored.
        """
        zplots = self.data.get("zplotting") or {}
        kpi_list = []
        chart_list = []
        if not isinstance(zplots, dict):
            return kpi_list, chart_list

        for item in zplots.values():
            if not isinstance(item, dict):
                continue
            itype = str(item.get("type") or "").lower()
            if "kpi" in itype:
                kpi_list.append(item)
            elif any(tag in itype for tag in self._CHART_TYPE_TAGS):
                chart_list.append(item)
        return kpi_list, chart_list

    @staticmethod
    def _select_charts(chart_list, chart_indices):
        """Return at most two charts at the requested, in-range positions."""
        picked = [chart_list[i] for i in chart_indices if 0 <= i < len(chart_list)]
        return picked[:2]

    def create_slide_2(self, chart_indices=(0, 1)):
        """Add the financials slide: KPI row, up to two charts and commentary.

        Args:
            chart_indices: Positions (in discovery order) of the charts shown
                in the left and right slots. Out-of-range positions are
                skipped, so the slide no longer fails when fewer charts are
                available than requested. Pass ``(2, 3)`` to show the third
                and fourth discovered charts.
        """
        slide = self.prs.slides.add_slide(self.prs.slide_layouts[6])
        self._add_master_elements(slide, "Financial Performance & Scale")

        # --- 1. DYNAMIC DISCOVERY ---
        kpi_list, chart_list = self._collect_plot_items()

        # --- 2. RENDER KPIs (Top Row) ---
        start_x = Inches(0.5)
        card_w = Inches(2.4)
        gap = Inches(0.2)

        for i, kpi_item in enumerate(kpi_list[:5]):  # Limit to 5
            x = start_x + (i * (card_w + gap))
            self._render_kpi_card(slide, kpi_item, x, Inches(1.6), card_w, Inches(0.9))

        # --- 3. RENDER CHARTS (Middle Row) ---
        # Layout: Split width 50/50 for up to 2 charts
        chart_y = Inches(2.8)
        chart_h = Inches(3.2)
        slot_xs = (Inches(0.5), Inches(6.8))  # left slot, right slot

        selected = self._select_charts(chart_list, chart_indices)
        if selected:
            for slot_x, plot in zip(slot_xs, selected):
                self._render_chart_from_data(
                    slide, plot, slot_x, chart_y, Inches(6.0), chart_h
                )
        else:
            # Fallback if no charts found
            tb = slide.shapes.add_textbox(Inches(0.5), chart_y, Inches(6.0), chart_h)
            tb.text = "No Chart Data Available"

        # --- 4. FINANCIAL COMMENTARY (Bottom) ---
        fin_health = self.data.get("financial_health") or {}
        comm_points = [
            fin_health.get("revenue_quality", ""),
            fin_health.get("margin_trajectory_analysis", ""),
            fin_health.get("solvency_risk", "")
        ]
        # Filter out empty strings
        comm_points = [c for c in comm_points if c]

        # Only the first two non-empty comments are shown in the bottom box.
        self._create_content_box(
            slide, Inches(0.5), Inches(6.1), Inches(12.3), Inches(1.0),
            "FINANCIAL ANALYSIS", comm_points[:2]
        )

    # -------------------------------------------------------------------------
    # SLIDE 3: HIGHLIGHTS
    # -------------------------------------------------------------------------
    def create_slide_3(self):
        """Add the investment-highlights slide as a numbered 2-column grid.

        Highlights come from the first 'slide_blocks' entry whose title
        contains "highlight"; otherwise from
        ``strategic_assessment.growth_catalysts``. At most six are shown.
        """
        slide = self.prs.slides.add_slide(self.prs.slide_layouts[6])
        self._add_master_elements(slide, "Investment Highlights")

        # Dynamic Text Sourcing
        highlights = []

        # 1. Try 'slide_blocks' from JSON first
        blocks = self.data.get("slide_blocks") or []
        for block in blocks:
            if "highlight" in (block.get("title") or "").lower():
                highlights = block.get("bullets") or []
                break

        # 2. Fallback to 'growth_catalysts' if slide_blocks missing
        if not highlights:
            strat = self.data.get("strategic_assessment") or {}
            highlights = strat.get("growth_catalysts") or []

        # Grid Layout for Highlights
        col_w = Inches(6.0)
        row_h = Inches(1.3)
        start_x = Inches(0.5)
        start_y = Inches(1.8)

        for i, text in enumerate(highlights[:6]):
            col = i % 2
            row = i // 2
            x = start_x + (col * (col_w + Inches(0.4)))
            y = start_y + (row * (row_h + Inches(0.2)))

            # Number Box
            icon = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, x, y, Inches(0.5), Inches(0.5))
            icon.fill.solid()
            icon.fill.fore_color.rgb = THEME['colors']['accent_cyan']
            icon.line.fill.background()

            p = icon.text_frame.paragraphs[0]
            p.text = str(i + 1)
            p.font.bold = True
            p.font.color.rgb = THEME['colors']['primary']
            p.alignment = PP_ALIGN.CENTER

            # Text Box
            tb = slide.shapes.add_textbox(x + Inches(0.6), y, col_w - Inches(0.6), row_h)
            p = tb.text_frame.paragraphs[0]
            p.text = str(text)  # tolerate non-string bullets in the JSON
            p.font.size = Pt(12)
            p.font.color.rgb = THEME['colors']['dark_grey']
            tb.text_frame.word_wrap = True

    def save(self):
        """Write the presentation to ``self.output_file`` and report it."""
        self.prs.save(self.output_file)
        print(f"✅ Generated Presentation: {self.output_file}")

# =============================================================================
# RUNNER
# =============================================================================
if __name__ == "__main__":
    # Input produced by the earlier pipeline stage; the file name embeds the
    # current `company` global, so it must match the actual JSON on disk.
    json_path = f"/workspaces/codespaces-blank/PROJECT KELP/layer2_withplot{company}.json"

    if not os.path.exists(json_path):
        print(f"File not found: {json_path}")
    else:
        # Build the three slides in order, then write the deck to disk.
        gen = KelpDynamicGenerator(json_path)
        for build_slide in (gen.create_slide_1, gen.create_slide_2, gen.create_slide_3):
            build_slide()
        gen.save()

✅ Generated Presentation: Kelp_Auto_TeaserKSolves.pptx


In [20]:
company

'centum'