# Install Requirements

In [None]:
# Install the third-party packages used by this notebook (Colab shell commands).
# None of these installs pin a version, so the resolved versions can differ between runs.
!pip install "codeinterpreterapi[all]"
# chromadb: persistent vector store used for the startup collection.
!pip install chromadb
#!conda install -c conda-forge faiss-cpu
# sentence-transformers: local embedding models. PyPDF2 is not imported in the visible cells.
!pip install sentence-transformers PyPDF2
!pip install -U langchain-community
# langchain + faiss-cpu: PDF loader, text splitter and FAISS vector store.
!pip install langchain faiss-cpu
# NOTE(review): pypdf is not imported in the visible cells either - confirm it is still needed.
!pip install pypdf
# tavily-python: provides TavilyClient for internet search.
!pip install tavily-python
# PyMuPDF: presumably the backend required by PyMuPDFLoader - verify.
!pip install PyMuPDF

Collecting chromadb
  Using cached chromadb-1.0.11-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Using cached fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-4.2.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.33.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.33.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from ch

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [None]:
# Mount Google Drive at /content/drive. Later cells read the report PDF and the
# Excel sheet from /content/drive/MyDrive and write the FAISS index and the
# ChromaDB folder back under the same location.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Stanford AI Index PDF Parser

In [None]:
import os
import torch
from langchain.document_loaders import PyMuPDFLoader  # 더 나은 PDF 처리
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

In [None]:
class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a sentence-transformers model.

    The model is loaded once in the constructor and moved to CUDA when
    ``torch.cuda.is_available()`` reports a GPU, otherwise it stays on the CPU.
    """

    def __init__(self, model_name: str = "BAAI/bge-m3"):
        """Load ``model_name`` and place it on the selected device."""
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"
        encoder = SentenceTransformer(model_name)
        self.model = encoder.to(target_device)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode a batch of texts (with a progress bar) into lists of floats."""
        vectors = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Encode a single query string into a list of floats."""
        vector = self.model.encode(text, convert_to_numpy=True)
        return vector.tolist()


def ingest_pdf_to_faiss(
    pdf_path: str,
    index_dir: str = "faiss_index",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
):
    """Build a FAISS index from a PDF and save it under ``index_dir``.

    Steps, each announced with a progress message on stdout:
      1. Load the PDF with ``PyMuPDFLoader``.
      2. Split the loaded documents with ``RecursiveCharacterTextSplitter``.
      3. Create the ``BAAI/bge-m3`` embedding wrapper.
      4. Build the FAISS vector store from the chunks.
      5. Create ``index_dir`` if necessary and save the store there.

    Args:
        pdf_path: Path of the PDF file to ingest.
        index_dir: Directory that receives the saved index.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.
    """
    print(f"[1] PDF 로드 중: {pdf_path}")
    pages = PyMuPDFLoader(pdf_path).load()

    print(f"[2] 텍스트 청크 분할 중 (chunk_size={chunk_size}, overlap={chunk_overlap})")
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(pages)

    print("[3] 임베딩 모델 준비 중...")
    embedder = SentenceTransformerEmbeddings("BAAI/bge-m3")

    print("[4] FAISS 인덱스 생성 중...")
    store = FAISS.from_documents(pieces, embedder)

    print(f"[5] 로컬 디렉토리에 인덱스 저장: {index_dir}")
    os.makedirs(index_dir, exist_ok=True)
    store.save_local(index_dir)
    print(f"\n✅ 완료: '{index_dir}'에 FAISS 인덱스 저장됨.")

In [None]:
# Build the FAISS index for the AI Index report PDF and save it into the same
# Google Drive folder that holds the PDF. The output captured for this cell
# stops with a KeyboardInterrupt after step [3] (embedding model preparation).
if __name__ == "__main__":
    ingest_pdf_to_faiss(
        pdf_path="/content/drive/MyDrive/P4DS/hai_ai_index_report_2025.pdf",
        index_dir="/content/drive/MyDrive/P4DS",
        chunk_size=1200,
        chunk_overlap=200
    )


[1] PDF 로드 중: /content/drive/MyDrive/P4DS/hai_ai_index_report_2025.pdf
[2] 텍스트 청크 분할 중 (chunk_size=1200, overlap=200)
[3] 임베딩 모델 준비 중...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


KeyboardInterrupt: 

In [None]:
def load_faiss_and_query(index_dir: str, query: str, top_k: int = 5):
    """Load a saved FAISS index, run ``query`` against it and print the hits (test helper).

    Args:
        index_dir: Directory containing the saved FAISS index.
        query: Text to search for.
        top_k: Number of results requested from the similarity search.
    """
    # allow_dangerous_deserialization=True lets LangChain unpickle the stored
    # docstore, so only point this at index directories you created yourself.
    store = FAISS.load_local(
        index_dir,
        SentenceTransformerEmbeddings("BAAI/bge-m3"),
        allow_dangerous_deserialization=True,
    )
    hits = store.similarity_search_with_score(query, k=top_k)

    divider = "=" * 80
    print(f"\n[검색 결과] \"{query}\" (top-{top_k})")
    print(divider)
    for position, hit in enumerate(hits, start=1):
        document, score = hit
        # Flatten newlines and keep only the first 200 characters as a preview.
        preview = document.page_content.replace("\n", " ")[:200] + "..."
        print(f"{position:>2}. score={score:.4f} | {preview}")
    print(divider)


if __name__ == "__main__":
    # NOTE(review): the ingest cell reads the PDF from
    # /content/drive/MyDrive/P4DS/ while this path points at /content/ -
    # confirm which location actually holds the file.
    PDF_PATH = "/content/hai_ai_index_report_2025.pdf"
    INDEX_DIR = "/content/drive/MyDrive/P4DS"

    # Build the index only when it is missing. INDEX_DIR is a Drive folder that
    # also holds other files, so the directory existing says nothing about the
    # index; check for the "index.faiss" file that FAISS.save_local writes.
    if not os.path.isfile(os.path.join(INDEX_DIR, "index.faiss")):
        ingest_pdf_to_faiss(
            pdf_path=PDF_PATH,
            index_dir=INDEX_DIR,
            chunk_size=1200,
            chunk_overlap=200,
        )

    # Quick smoke-test query against the saved index.
    for q in [
        "What is trend of AI Research in 2025?"
    ]:
        load_faiss_and_query(INDEX_DIR, q, top_k=5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]


[검색 결과] "What is trend of AI Research in 2025?" (top-5)
 1. score=0.5729 | Artificial Intelligence Index Report 2025...
 2. score=0.6445 | Table of Contents 38 Artificial Intelligence Index Report 2025 Chapter 1 Preview By Topic Machine learning was the most prevalent research topic in  AI publications in 2023, comprising 75.7% of public...
 3. score=0.6808 | Table of Contents 30 Artificial Intelligence Index Report 2025 Chapter 1 Preview publications more than doubled, rising from approximately  102,000 in 2013 to more than 242,000 in 2023. The increase  ...
 4. score=0.6824 | 225 Artificial Intelligence Index Report 2025 Table of Contents Chapter 4 Preview 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 0.00% 0.20% 0.40% 0.60% 0.80% 1.00% AI job ...
 5. score=0.6842 | Artificial Intelligence Index Report 2025 1 Welcome to the eighth edition of the AI Index report. The 2025 Index is our most comprehensive to date and arrives at an  important moment, as AI’s

In [None]:
def get_faiss_results(index_dir: str, query: str, top_k: int = 5) -> dict:
    """Load the FAISS index in ``index_dir`` and return the hits for ``query``.

    Args:
        index_dir: Directory containing the saved FAISS index.
        query: Text to search for.
        top_k: Number of results requested from the similarity search.

    Returns:
        A dict with two parallel lists: ``"documents"`` (the ``page_content`` of
        each hit) and ``"scores"`` (the score returned with each hit).
    """
    # The embedding model and the index are loaded again on every call.
    store = FAISS.load_local(
        index_dir,
        SentenceTransformerEmbeddings("BAAI/bge-m3"),
        allow_dangerous_deserialization=True,
    )
    hits = store.similarity_search_with_score(query, k=top_k)

    texts = []
    scores = []
    for document, score in hits:
        texts.append(document.page_content)
        scores.append(score)

    return {"documents": texts, "scores": scores}


# Initialize

아래 셀의 `from codeinterpreterapi import CodeInterpreterSession, settings`에서 오류가 발생할 경우에만 이 셀을 실행하세요.

In [None]:
# Remove the conflicting packages
!pip uninstall -y pydantic pydantic-settings pydantic-core langchain langchain-core langchain-community

# Reinstall pydantic v1 and a pinned langchain release
!pip install "pydantic<2.0"
# NOTE(review): in the captured output this line resolves to pydantic-settings 1.99,
# which pulls in pydantic==v2.0a3 and so replaces the pydantic<2.0 install above - confirm this is intended.
!pip install "pydantic-settings<2.0"
!pip install "langchain==0.0.350"
!pip install codeinterpreterapi==0.1.20


Found existing installation: pydantic 1.10.22
Uninstalling pydantic-1.10.22:
  Successfully uninstalled pydantic-1.10.22
Found existing installation: pydantic-settings 2.9.1
Uninstalling pydantic-settings-2.9.1:
  Successfully uninstalled pydantic-settings-2.9.1
Found existing installation: pydantic_core 2.33.2
Uninstalling pydantic_core-2.33.2:
  Successfully uninstalled pydantic_core-2.33.2
Found existing installation: langchain 0.3.25
Uninstalling langchain-0.3.25:
  Successfully uninstalled langchain-0.3.25
Found existing installation: langchain-core 0.3.62
Uninstalling langchain-core-0.3.62:
  Successfully uninstalled langchain-core-0.3.62
Found existing installation: langchain-community 0.3.24
Uninstalling langchain-community-0.3.24:
  Successfully uninstalled langchain-community-0.3.24
Collecting pydantic<2.0
  Using cached pydantic-1.10.22-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (154 kB)
Using cached pydantic-1.10.22-cp311-cp311-manylinux_2_17_x86_64

Collecting pydantic-settings<2.0
  Downloading pydantic_settings-1.99-py3-none-any.whl.metadata (3.8 kB)
Collecting pydantic==v2.0a3 (from pydantic-settings<2.0)
  Downloading pydantic-2.0a3-py3-none-any.whl.metadata (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic-core==0.25.0 (from pydantic==v2.0a3->pydantic-settings<2.0)
  Downloading pydantic_core-0.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Downloading pydantic_settings-1.99-py3-none-any.whl (9.4 kB)
Downloading pydantic-2.0a3-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.8/193.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_core-0.25.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:
# === OpenAI & Tavily setup ===
from openai import OpenAI
from tavily import TavilyClient

import os

# ChromaDB (RAG) example
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# ------------------------------
# Read OPENAI_API_KEY from the environment (set it for your local/Colab session).
# The "sk-proj-" fallback looks like a placeholder prefix, not a usable key.
openai_api_key = os.environ.get("OPENAI_API_KEY", "sk-proj-")
client = OpenAI(api_key=openai_api_key)

# Tavily API key ("tvly-dev-" likewise looks like a placeholder prefix).
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "tvly-dev-")
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

# === ChromaDB initialisation (example) ===
DB_PATH = "my_chromadb_folder"  # e.g. "./db" or a Google Drive path
client_chroma = chromadb.PersistentClient(path=DB_PATH)

# Embedding function (OpenAI embeddings example).
# The model id previously contained a stray backslash ("text-embed\ding-ada-002"),
# which is an invalid escape sequence and not a model name.
openai_embedding = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-ada-002"
)

# Prepare the 'startup_collection' collection
collection = client_chroma.get_or_create_collection(
    name="startup_collection",
    embedding_function=openai_embedding
)

# === Visualization initialisation (example) ===
import os
from codeinterpreterapi import CodeInterpreterSession, settings # if a validation error appears, restart the kernel, wait a moment and retry

#openai_api_key = os.environ.get("OPENAI_API_KEY", "sk-proj-...")
settings.OPENAI_API_KEY=openai_api_key
settings.MODEL = "gpt-4.1-mini"

# Excel 파일을 ChromaDB에 저장


In [None]:
import os
import pandas as pd
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from sentence_transformers import SentenceTransformer

# Read the OpenAI key from the environment. Nothing else in this cell uses it:
# the collection below is embedded with a local sentence-transformers model.
openai_api_key = os.environ.get("OPENAI_API_KEY", "sk-proj-...")

# Location of the persistent Chroma database (Google Drive); the commented
# line is the local alternative.
DB_PATH = "/content/drive/MyDrive/my_chromadb_folder"
#DB_PATH="./my_chromadb_folder"

client_chroma = chromadb.PersistentClient(path=DB_PATH)

# Local embedding function used for both indexing and querying this collection.
local_embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

collection = client_chroma.get_or_create_collection(
    name="startup_collection",
    embedding_function=local_embedding_fn
)

# Source spreadsheet with one startup per row (Drive path; local alternative commented out).
db_path = "/content/drive/MyDrive/P4DS/new_DB.xlsx"
#db_path = "./new_DB.xlsx"
df = pd.read_excel(db_path)

# Parallel lists: documents[i] is the text to embed, metadatas[i] is the full row.
documents = []
metadatas = []

for idx, row in df.iterrows():
    # Missing cells become empty strings so the concatenation below never contains "nan".
    description = str(row['설명']) if pd.notna(row['설명']) else ""
    # NOTE(review): `proposal` is computed but not included in document_text -
    # confirm whether it was meant to be embedded as well.
    proposal = str(row['제안서']) if pd.notna(row['제안서']) else ""
    summary = str(row['요약']) if pd.notna(row['요약']) else ""
    document_text = description + " " + summary

    # Every column of the row (including the text columns) is stored as metadata.
    metadata = row.to_dict()

    documents.append(document_text)
    metadatas.append(metadata)

# IDs are the row positions "0".."n-1" as strings.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=[str(i) for i in range(len(documents))]
)

print("✅ Saved to ChromaDB (using local embedding)")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Saved to ChromaDB (using local embedding)


In [None]:
from IPython.display import display

# Truncate descriptions and summaries to a fixed length
def shorten(text, max_len=80):
    """Return ``text`` cut to ``max_len`` characters, with "..." appended when cut.

    Args:
        text: Value to shorten. Strings are used as-is. Missing values
            (``None``, NaN, ``pd.NA``) yield an empty string. Any other value
            is converted with ``str()`` first.
        max_len: Maximum number of characters kept before the ellipsis.

    Returns:
        The possibly truncated string, or "" for a missing value.
    """
    if not isinstance(text, str):
        # The missing-value check must run before str() conversion, otherwise
        # NaN would become the literal text "nan".
        if pd.isna(text):
            return ""
        text = str(text)
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text

df_view = df.copy()
# Pass missing values through untouched so shorten() can map them to "".
# Casting everything with str() first turned NaN into the literal text "nan".
df_view['설명'] = df_view['설명'].apply(lambda x: shorten(x if pd.isna(x) else str(x), 100))
df_view['제안서'] = df_view['제안서'].apply(lambda x: shorten(x if pd.isna(x) else str(x), 100))
df_view['요약'] = df_view['요약'].apply(lambda x: shorten(x if pd.isna(x) else str(x), 60))

# Columns to show, in display order
selected_columns = [
    '회사명', '설명', '제안서', '요약',
    '교육', 'AI/딥테크/블록체인', '콘텐츠/예술', '음악'
]

# Render numeric tags (0, 1) as "✅" or ""
def bool_icon(val):
    """Return a check mark when ``val`` equals 1, otherwise an empty string."""
    if val == 1:
        return "✅"
    return ""

# Replace the 0/1 tag columns with icons, skipping any column the sheet lacks.
for col in ('교육', 'AI/딥테크/블록체인', '콘텐츠/예술', '음악'):
    if col not in df_view.columns:
        continue
    df_view[col] = df_view[col].apply(bool_icon)

# Left-align cells and headers and add a caption for readability.
styled = (
    df_view[selected_columns]
    .style
    .set_properties(**{'text-align': 'left'})
    .set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])
    .set_caption("스타트업 DB (요약 보기)")
)

display(styled)

Unnamed: 0,회사명,설명,제안서,요약,교육,AI/딥테크/블록체인,콘텐츠/예술,음악
0,퀄슨,기업명: 퀄슨 설명: 모바일 기반 영어회화 콘텐츠 플랫폼 '리얼클래스' '슈퍼팬' 등을 운영하는 기업 상장 여부: 비상장 창립일: 2012-06-18 웹사이트: qualson.co...,"회사명: 퀄슨 (QUALSON) 설립연도: 2012년 6월, 서울 강남구 비전: “모바일에서 누구나 실생활 영어회화를 자유롭게 익히도록.” 퀄슨은 모바일 기반 영어회화 콘텐츠 ...",퀄슨은 2012년 6월에 설립된 모바일 기반 영어회화 콘텐츠 플랫폼 '리얼클래스' '슈퍼팬' 등을 운영하는 ...,✅,✅,✅,✅
1,에이치투케이,기업명: 에이치투케이 설명: AI 기반 아동 대상 한글 교육 플랫폼 '소중한글'을 운영하는 기업 상장 여부: 비상장 창립일: 2017-07-14 웹사이트: sojunghangeul...,"회사명: 에이치투케이 (H2K) 설립연도: 2017년 7월, 대전 유성구 비전: “AI로 유아의 한글 교육을 혁신한다.” 에이치투케이는 인공지능 기반 유아 한글 교육 플랫폼 ‘...",에이치투케이는 2017년 7월에 설립된 AI 기반 아동 대상 한글 교육 플랫폼 '소중한글'을 운영하는 기업 ...,✅,✅,✅,
2,플랭,기업명: 플랭 설명: AI 기반 영어회화 트레이닝 플랫폼 '플랭'을 운영하는 기업 상장 여부: 비상장 창립일: 2019-06-21 웹사이트: plang.ai 주소: 서울 서초구 방...,"회사명: 플랭 (PLANG) 설립연도: 2019년 6월, 서울 서초구 비전: “AI로 영어 말하기 학습을 일상화한다.” 플랭은 AI 기반 영어회화 트레이닝 플랫폼으로, 사용자의...",플랭은 2019년 6월에 설립된 AI 기반 영어회화 트레이닝 플랫폼 '플랭'을 운영하는 기업 비즈니스다. 비...,✅,✅,✅,
3,비브리지,기업명: 비브리지 설명: AI 기반 미디어/콘텐츠 기업 대상 AI 더빙 제작 대행 서비스 '비브리지AI' 및 AI 기반 동영상 강의용 필기 소프트웨어 '슬리드'를 제공하는 기업 상...,"회사명: 비브리지 (BeBridge) 설립연도: 2020년 8월, 서울 강남구 비전: “AI로 콘텐츠 제작과 학습 기록을 자동화한다.” 비브리지는 미디어 및 교육 콘텐츠 제작을...",비브리지는 2020년 8월에 설립된 AI 기반 미디어/콘텐츠 기업 대상 AI 더빙 제작 대행 서비스 '비브리...,✅,✅,✅,
4,베스핀글로벌,기업명: 베스핀글로벌 설명: 머신러닝·AI 기반 기업대상으로 클라우드 서비스를 제공하는 기업 상장 여부: 비상장 창립일: 2015-10-19 웹사이트: www.bespingloba...,"회사명: 베스핀글로벌 (Bespin Global) 설립연도: 2015년 10월, 서울 서초구 비전: “AI로 기업의 클라우드 운영을 지능화한다.” 베스핀글로벌은 머신러닝과 AI...",베스핀글로벌은 2015년 10월에 설립된 머신러닝·AI 기반 기업대상으로 클라우드 서비스를 제공하는 기업 비...,,✅,,
5,메가존클라우드,기업명: 메가존클라우드 설명: 클라우드 관리 서비스를 운영하는 기업 상장 여부: 비상장 창립일: 2018-07-03 웹사이트: megazone.com 주소: 서울 강남구 논현로85...,"회사명: 메가존클라우드 (Megazone Cloud) 설립연도: 2018년 7월, 서울 강남구 비전: “클라우드 인프라의 표준을 이끄는 MSP 선도 기업.” 메가존클라우드는 클...",메가존클라우드는 2018년 7월에 설립된 클라우드 관리 서비스를 운영하는 기업 비즈니스다. 비상장 상태이다....,,,,
6,클루커스,기업명: 클루커스 설명: 클라우드 서비스 기반 고객 컨설팅 및 관리서비스를 제공하는 기업 상장 여부: 비상장 창립일: 2018-12-21 웹사이트: www.cloocus.com 주...,"회사명: 클루커스 (Cloocus) 설립연도: 2018년 12월, 서울 강남구 비전: “클라우드 전환의 복잡함을 고객 중심 컨설팅으로 단순화하다.” 클루커스는 클라우드 인프라 ...",클루커스는 2018년 12월에 설립된 클라우드 서비스 기반 고객 컨설팅 및 관리서비스를 제공하는 기업 비즈니...,,,,
7,마음에이아이,기업명: 마음에이아이 설명: AI모델 생성 플랫폼 '마음AI'를 운영하는 기업 상장 여부: 상장 창립일: 2014-01-08 웹사이트: maum.ai 주소: 경기 성남시 분당구 대...,"회사명: 마음에이아이 (Maum AI) 설립연도: 2014년 1월, 경기 성남시 분당구 비전: “누구나 AI를 직접 만들 수 있도록.” 마음에이아이는 사용자가 직접 AI 모델을...",마음에이아이는 2014년 1월에 설립된 AI모델 생성 플랫폼 '마음AI'를 운영하는 기업 비즈니스다. 상장 ...,,✅,,
8,씨유박스,"기업명: 씨유박스 설명: AI 기반 생체 보안 시스템 및 솔루션을 제공하는 기업 유사 기업: 컴퓨터메이트, 씨앤알테크, 에어콕, 에이엘아이, 엘텍코리아, 이노비드 소비자 성별 및 ...","회사명: 씨유박스 (CUBOX) 비전: “AI 기반 생체 보안으로 사람과 공간의 안전을 혁신하다.” 씨유박스는 안면인식, 지문인식, 복합 생체 인증 기술을 활용해 비접촉 보안 시...",씨유박스는 2010년 6월에 설립된 AI 기반 생체 보안 시스템 및 솔루션을 제공하는 기업 비즈니스다. 상장...,,✅,,
9,딥핑소스,기업명: 딥핑소스 설명: 머신러닝 기반 데이터 수집 및 보안 기술을 개발하는 기업 상장 여부: 비상장 창립일: 2018-06-01 웹사이트: www.deepingsource.io ...,"회사명: 딥핑소스 (Deeping Source) 설립연도: 2018년 6월, 서울 강남구 비전: “데이터 보호와 활용을 동시에 실현하는 AI 보안 기술의 표준.” 딥핑소스는 머...",딥핑소스는 2018년 6월에 설립된 머신러닝 기반 데이터 수집 및 보안 기술을 개발하는 기업 비즈니스다. 비...,,✅,,


# Main Code

In [None]:
import json
import os

# === Initial score/confidence (None) for each of the 10 criteria ===
_CRITERIA = (
    "Clarity of Vision",
    "Product-Market Fit",
    "Competitive Advantage",
    "Team Competency",
    "Go-to-Market Strategy",
    "Customer Understanding",
    "Financial Readiness",
    "Scalability Potential",
    "Traction & KPIs",
    "Fundraising Preparedness",
)

# Each criterion gets its own independent {"score", "confidence"} dict.
report_card = {
    criterion_name: {"score": None, "confidence": None}
    for criterion_name in _CRITERIA
}

CONFIDENCE_THRESHOLD = 80

def collection_query(query_texts, n_results, db_type="startup",
                     chroma_path=None, faiss_index_dir=None):
    """Search either the startup ChromaDB collection or the Stanford FAISS index.

    Args:
        query_texts: List of query strings. The "stanford" branch only uses
            the first element.
        n_results: Number of hits requested per query.
        db_type: "startup" (ChromaDB) or "stanford" (FAISS).
        chroma_path: Folder of the persistent Chroma database. When omitted,
            the notebook-level ``DB_PATH`` is used if it is defined, otherwise
            the previous hard-coded "./my_chromadb_folder".
        faiss_index_dir: Folder of the saved FAISS index. When omitted, the
            notebook-level ``INDEX_DIR`` is used if it is defined, otherwise
            the previous hard-coded "./faiss_index".

    Returns:
        "startup": dict with "documents", "metadatas" and "distances" as
        returned by ``collection.query``.
        "stanford": dict with "documents" (a one-element list wrapping the hit
        texts, so ``result["documents"][0]`` works for both back ends) and
        "scores" (flat list).

    Raises:
        ValueError: If ``db_type`` is neither "startup" nor "stanford".
    """
    if db_type not in ("startup", "stanford"):
        raise ValueError(f"Unknown db_type: {db_type}")

    if db_type == "startup":
        # ChromaDB search
        import chromadb
        import chromadb.utils.embedding_functions as embedding_functions

        if chroma_path is None:
            # Follow the folder the ingestion cell wrote to, so the query does
            # not hit a different (possibly empty) database.
            chroma_path = globals().get("DB_PATH", "./my_chromadb_folder")

        client_chroma = chromadb.PersistentClient(path=chroma_path)
        # Must match the embedding model used when the collection was filled.
        embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

        collection = client_chroma.get_collection(
            name="startup_collection",
            embedding_function=embedding_fn
        )

        results = collection.query(
            query_texts=query_texts,
            n_results=n_results
        )

        return {
            "documents": results["documents"],
            "metadatas": results["metadatas"],
            "distances": results.get("distances", [])
        }

    # db_type == "stanford"
    if not query_texts:
        # Nothing to search for: return the empty shape instead of failing on
        # query_texts[0].
        return {"documents": [[]], "scores": []}

    if faiss_index_dir is None:
        faiss_index_dir = globals().get("INDEX_DIR", "./faiss_index")

    faiss_result = get_faiss_results(
        index_dir=faiss_index_dir,
        query=query_texts[0],
        top_k=n_results
    )
    return {
        "documents": [faiss_result["documents"]],  # same nested-list shape as Chroma
        "scores": faiss_result["scores"]
    }


def all_criteria_above_threshold(report: dict, threshold: int) -> bool:
    """Tell whether every criterion has reached the confidence threshold.

    A criterion fails when its confidence is ``None`` (not evaluated yet) or
    is lower than ``threshold``. An empty report passes.
    """
    return not any(
        entry['confidence'] is None or entry['confidence'] < threshold
        for entry in report.values()
    )

def print_report(report: dict):
    """Print the report card in a human-readable numbered list.

    Scores or confidences that are still ``None`` are shown as 'N/A'.
    """
    print("\n===== 현재 스타트업 진단 보고서 =====")
    for position, (criteria, data) in enumerate(report.items(), start=1):
        score_str = "N/A" if data['score'] is None else data['score']
        conf_str = "N/A" if data['confidence'] is None else data['confidence']
        print(f"{position}. {criteria}: {score_str} / 5점 (Confidence: {conf_str}%)")
    print("==================================\n")


def llm_call(system_prompt: str, user_prompt: str, temperature: float = 0.7,
             model: str = "gpt-4.1-mini") -> str:
    """Send a system + user prompt to the OpenAI chat API and return the reply text.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model name. Defaults to the previously hard-coded
            "gpt-4.1-mini".

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or "" when the API returns no text content.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature
    )
    # message.content is optional in the API response; guard against None so
    # .strip() cannot raise AttributeError.
    content = completion.choices[0].message.content
    return (content or "").strip()


def parse_report_card_json(json_str: str) -> dict:
    """Parse and validate the report-card JSON produced by the LLM.

    The reply is accepted when it is a JSON object whose keys are exactly the
    criteria of ``report_card`` and whose values are objects containing both
    "score" and "confidence". A reply wrapped in a Markdown code fence
    (``` or ```json) is unwrapped before parsing.

    Args:
        json_str: Raw text returned by the LLM.

    Returns:
        The parsed dict, or ``None`` when the text is not valid JSON or does
        not have the expected structure.
    """
    if isinstance(json_str, str):
        text = json_str.strip()
        if text.startswith("```"):
            # Drop the opening fence line (with its optional language tag)...
            newline_at = text.find("\n")
            text = text[newline_at + 1:] if newline_at != -1 else ""
            # ...and the closing fence, if present.
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        json_str = text

    try:
        data = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-text input such as None.
        return None

    if not isinstance(data, dict):
        return None

    required_keys = list(report_card.keys())
    # Same number of keys plus every required key present means no extras.
    if len(data) != len(required_keys):
        return None
    for k in required_keys:
        if k not in data:
            return None
        if not isinstance(data[k], dict):
            return None
        if "score" not in data[k] or "confidence" not in data[k]:
            return None

    return data


def update_report_card(report: dict, new_data: dict):
    """Copy score and confidence for every criterion of ``report`` from ``new_data``.

    ``report`` is modified in place. Keys that exist only in ``new_data`` are
    ignored, and a criterion missing from ``new_data`` raises ``KeyError``.
    """
    for criterion, slot in report.items():
        incoming = new_data[criterion]
        slot["score"] = incoming["score"]
        slot["confidence"] = incoming["confidence"]


def search_internet(query: str) -> str:
    """Run ``query`` through the Tavily client.

    Returns whatever ``tavily_client.search`` returned when that value is
    truthy, otherwise a "no results" message string.
    """
    found = tavily_client.search(query)
    if not found:
        return f"[Internet] '{query}'에 대한 검색 결과가 없습니다."
    return found

def search_db(query: str, db_type: str) -> str:
    """Search the database selected by ``db_type`` and format the hits as text.

    Calls ``collection_query`` with ``query`` as the only query text and asks
    for three results. The documents of the first (only) query are rendered
    as a bullet list, or a "no documents" message is returned when empty.
    """
    hits = collection_query(query_texts=[query], n_results=3, db_type=db_type)
    first_batch = hits.get("documents", [[]])[0]
    if not first_batch:
        return f"[DB 검색 결과 - {db_type}] '{query}' 관련 문서가 없습니다."
    bullet_lines = "\n".join(f"- {doc}" for doc in first_batch)
    return f"[DB 검색 결과 - {db_type}]\n{bullet_lines}"

def refine_criterion_output(criterion: str, all_context: str) -> str:
    """Produce a refined write-up for ``criterion`` in two LLM passes.

    The first pass asks the model to list weaknesses or gaps for the criterion
    given ``all_context``. The second pass feeds that list back together with
    the same context and asks for a revised, more detailed text, which is
    returned.
    """
    # Both system prompts are static, so define them up front.
    critique_system = (
        "You are an AI assistant analyzing a specific section of a startup business report.\n"
        "Your task is to identify any weaknesses or areas for improvement in the text related to this criterion.\n"
        "List them clearly so we can address them in the next step.\n\n"
        "Return these suggested improvements in plain text (e.g., bullet points)."
    )
    rewrite_system = (
        "You are an AI assistant refining a specific section of a startup business report.\n"
        "You have a list of improvements to address.\n"
        "Use them to produce a revised, more detailed discussion for this criterion, "
        "providing clarity, depth, and actionable insights.\n\n"
        "Return the refined explanation in plain text (no JSON)."
    )

    # Pass 1: collect the improvement points.
    critique_request = (
        f"Criterion to refine: '{criterion}'\n\n"
        f"Here is all the context collected so far:\n{all_context}\n\n"
        "Please list the points or areas that should be improved, clarified, or expanded upon for this criterion."
    )
    improvement_points = llm_call(critique_system, critique_request, temperature=0.7)

    # Pass 2: rewrite the section with those points applied.
    rewrite_request = (
        f"Criterion to refine: '{criterion}'\n\n"
        f"Improvement points:\n{improvement_points}\n\n"
        f"Here is the context again:\n{all_context}\n\n"
        "Incorporate the listed improvements into the final refined text."
    )
    return llm_call(rewrite_system, rewrite_request, temperature=0.7)

def analyze_and_visualize(all_context: str) -> dict:
    """Ask CodeInterpreter for a 3C (Company, Customer, Competitor) market analysis with a chart.

    Flow:
      1. An LLM call turns ``all_context`` into a visualization brief
         ([Task] / [Data] / [Chart Specs]).
      2. The brief is embedded in a data-analyst prompt and sent to a
         ``CodeInterpreterSession``; the response is displayed with ``show()``.
      3. The first file of the response, if any, is saved as
         ``./figures/figure.png`` (an existing file of that name is overwritten).

    Args:
        all_context: Context gathered so far about the startup. It is used
            as given (earlier it was overwritten by a hard-coded sample
            string, so the caller's context was ignored).

    Returns:
        ``{"analysis": <response text>, "figure_path": <path or None>}``.
        ``figure_path`` is ``None`` when the session produced no file.
    """

    # 1) Get the visualization brief from the LLM.
    system_prompt_1 = (
    "You are an AI assistant tasked with supporting the creation of a business report.\n"
    "Your focus is on evaluating the market perspective within the 3C framework (Customer, Competitor, Company).\n"
    "Your goal is to identify relevant data points that should be explored and recommend prompts for further analysis and data visualization.\n\n"
    "Structure your response using the following format:\n"
    "[Task] A brief description of the specific analysis or insight to be developed.\n\n"
    "[Data] A bullet-point list of the key data elements required to support the analysis.\n\n"
    "[Chart Specs] A bullet-point list of suggested visualizations, including the type of chart and what variables to compare."
    )
    user_prompt_1 = (
    f"Here is the current context and information gathered about a startup company:\n{all_context}\n\n"
    "Based on this context, please identify areas that need to be improved, clarified, or expanded in order to strengthen the market analysis under the 3C framework."
    )
    user_input = llm_call(system_prompt_1, user_prompt_1, temperature=0.7)
    user_input += "\n[Output] A high‑resolution PNG (≈ 1920×1080) suitable for presentations."

    PROMPT_1 = (
        "You are a highly skilled data analyst assigned to address the user's query using reliable, authoritative, and verifiable data sources.\n\n"
        "Your objectives:\n"
        "- Acquire accurate, relevant, and reputable datasets that directly support the user’s inquiry.\n"
        "- Structure the data in a clean, well-formatted pandas DataFrame with appropriate data types and labels.\n"
        "- Conduct a meaningful, insight-driven analysis that provides direct, data-backed answers to the user’s question.\n"
        "- Create compelling, well-labeled visualizations using matplotlib to effectively communicate your findings.\n\n"
        "Visualization Requirements:\n"
        "- Every chart must include:\n"
        "  • A clear and descriptive title.\n"
        "  • Properly labeled axes and legends, where applicable.\n"
        "  • Exact names of companies, institutions, or reports referenced in the data.\n"
        "  • Visible citation of the data source with URL.(e.g., in a corner of the plot).\n\n"
        "Analysis Explanation:\n"
        "- Write a concise and insightful narrative summarizing:\n"
        "  • The methodology and key findings.\n"
        "  • Any observed trends, comparisons, or outliers.\n"
        "  • The broader implications of the analysis.\n"
        "- Provide full URLs to all data sources used in your analysis to ensure traceability and transparency.\n\n"
        "Important Notes:\n"
        "- Only use data from trustworthy, recognized sources (e.g., official government portals, academic institutions, reputable organizations).\n"
        "- Ensure the analysis is self-contained, reproducible, and directly aligned with the user’s request.\n\n"
        "User Query: {user_input}\n"
    )

    # Alternative kept for reference (not implemented here): retrieve data with
    # search_internet() first and pass both the query and the retrieved data to
    # a second prompt that asks CodeInterpreter to visualize that dataset,
    # including matplotlib font settings for Korean text.

    # 2) Let CodeInterpreter run the analysis; user_input is passed as a format
    #    argument, so braces inside it are not interpreted.
    with CodeInterpreterSession() as session:
        response = session.generate_response(
            PROMPT_1.format(user_input=user_input)
        )
        # Display the response in the notebook.
        response.show()

    response_txt = response.content

    # 3) Save the first generated file, if the session produced one. Indexing
    #    files[0] unconditionally would raise IndexError on a text-only reply.
    figure_path = None
    if response.files:
        save_dir = "./figures"
        os.makedirs(save_dir, exist_ok=True)
        figure_path = os.path.join(save_dir, "figure.png")
        response.files[0].save(figure_path)

    # Return the figure path rather than the figure itself so the final report
    # can embed it later.
    return {
        "analysis": response_txt,
        "figure_path": figure_path
    }

def generate_db_query(criterion: str, all_context: str, db_type: str) -> str:
    """
    Ask the LLM for a search query to run against one of the document databases.

    Args:
        criterion: The report criterion the search should support.
        all_context: Everything collected so far, already joined into one string.
        db_type: "startup" selects the startup-examples description; any other
            value falls back to the AI-trends/statistics description.

    Returns:
        The LLM reply with surrounding whitespace removed.
    """
    # Only the description of the database differs between the two DB types.
    if db_type == "startup":
        db_focus = "real-world Korean startup company examples"
    else:
        db_focus = "AI technology trends and statistics"

    instructions = "".join((
        "You are an AI assistant generating database search queries for a business report.\n",
        f"The database is focused on {db_focus}.\n",
        "Given the criterion and the context so far, generate a specific search query that will help retrieve relevant documents.\n",
        "Only return the search query. No explanations, no JSON.",
    ))
    request = "".join((
        f"Criterion: {criterion}\n\n",
        f"Context:\n{all_context}\n\n",
        "Search query:",
    ))

    return llm_call(instructions, request, temperature=0.3).strip()

def generate_user_question_for_criterion(criterion: str, all_context: str) -> str:
    """
    Ask the LLM which details the user should be asked so that the given
    criterion can be improved.

    Args:
        criterion: The report criterion currently being worked on.
        all_context: Everything collected so far, already joined into one string.

    Returns:
        The LLM reply exactly as ``llm_call`` returned it (it is not stripped).
    """
    role_text = "".join((
        "You are an AI assistant helping gather specific user input. ",
        "Given the context and the chosen criterion, generate a short list of specific questions ",
        "the user should answer in detail. The output should be plain text (no JSON).",
    ))
    request_text = "".join((
        f"Criterion of focus: '{criterion}'\n\n",
        f"Context so far:\n{all_context}\n\n",
        "Based on what is missing or uncertain for this criterion, ",
        "create a short set of bullet-point questions for the user to answer. ",
        "Be as concrete as possible.",
    ))
    return llm_call(role_text, request_text, temperature=0.7)


def generate_internet_search_query(criterion: str, all_context: str) -> str:
    """
    Ask the LLM which internet search query would best fill the gaps for the
    given criterion.

    Args:
        criterion: The report criterion currently being worked on.
        all_context: Everything collected so far, already joined into one string.

    Returns:
        The suggested query with surrounding whitespace removed.
    """
    role_text = "".join((
        "You are an AI assistant that decides the best internet search query ",
        "to gather more information about a certain criterion in a startup business report.\n",
        "Return ONLY the recommended search query in plain text (no JSON).",
    ))
    request_text = "".join((
        f"Criterion of focus: '{criterion}'\n\n",
        f"Context so far:\n{all_context}\n\n",
        "Based on what's missing or uncertain for this criterion, propose a concise search query ",
        "that would help gather the most relevant insights or data from the internet.",
    ))
    reply = llm_call(role_text, request_text, temperature=0.7)
    return reply.strip()

def perform_action(action: str, target_criteria: str, collected_contexts: list) -> str:
    """
    Execute the action chosen by the LLM and return its result as text.

    The returned string starts with an identifier (USER_INPUT:, DB_SUMMARY:,
    INTERNET_SUMMARY:, REFINED_OUTPUT:, AnalyzeAndVisualize:) so that
    generate_business_report can later sort the collected contexts by origin.

    Args:
        action: One of AskUser, SearchDB_startup, SearchDB_report,
            SearchInternet, RefineOutput, AnalyzeAndVisualize, NoActionNeeded.
        target_criteria: The criterion the action should focus on.
        collected_contexts: Context strings gathered so far.

    Returns:
        The tagged result text, or a fixed placeholder string for
        NoActionNeeded and for any unrecognised action.
    """
    if action == "NoActionNeeded":
        return "(No further actions required.)"

    if action == "AnalyzeAndVisualize":
        # NOTE(review): the raw list (not the newline-joined string used by every
        # other action) is handed to analyze_and_visualize. The original computed
        # the joined string here and never used it; confirm which form the callee
        # expects before changing the argument.
        analysis_with_figure = analyze_and_visualize(collected_contexts)
        # The figure path is returned alongside the analysis text.
        return (
            "AnalyzeAndVisualize:\n"
            f"3C_ANALYSIS: {analysis_with_figure['analysis']}\n"
            f"FIGURE_PATH: {analysis_with_figure['figure_path']}"
        )

    # Maps each DB action to the db_type passed to both the query generator and search_db.
    db_types = {"SearchDB_startup": "startup", "SearchDB_report": "stanford"}
    context_actions = ("AskUser", "SearchInternet", "RefineOutput") + tuple(db_types)
    if action not in context_actions:
        return "(알 수 없는 액션)"

    # Every remaining action works on the same flattened context.
    full_context_str = "\n".join(collected_contexts)

    if action == "AskUser":
        # 1) Let the LLM decide what exactly to ask the user.
        question_prompt = generate_user_question_for_criterion(target_criteria, full_context_str)

        # 2) Show the questions.
        print("\n[LLM Generated Questions]")
        print(question_prompt)
        print("\n아래 질문에 대한 답을 입력해주세요.")

        # 3) Read the answer interactively.
        user_answer = input("Your Answer: ")

        # 4) Tag the result as USER_INPUT.
        return f"USER_INPUT: (User Provided Info about {target_criteria})\nQuestions:\n{question_prompt}\nUser's Answer:\n{user_answer}"

    if action in db_types:
        db_type = db_types[action]
        query = generate_db_query(target_criteria, full_context_str, db_type=db_type)
        db_result = search_db(query, db_type=db_type)
        return f"DB_SUMMARY: {db_result}"

    if action == "SearchInternet":
        # 1) Let the LLM propose the query, 2) run it, 3) return query + result.
        suggested_query = generate_internet_search_query(target_criteria, full_context_str)
        net_result = search_internet(suggested_query)
        return f"INTERNET_SUMMARY: (Internet Search Query: '{suggested_query}')\n{net_result}"

    # Only RefineOutput is left: one more LLM call to polish the criterion text.
    refined_text = refine_criterion_output(target_criteria, full_context_str)
    return f"REFINED_OUTPUT: (Refined Output about {target_criteria})\n{refined_text}"


def ask_llm_for_next_action(
    report: dict,
    collected_texts: list,
    action_history: list
) -> dict:
    """
    Ask the LLM which criterion to focus on next, which action to take, and why.

    Args:
        report: Report card mapping each criterion name to a dict with
            "score" and "confidence" entries (either may be None).
        collected_texts: Context strings gathered so far.
        action_history: Log lines describing the actions already taken.

    Returns:
        A dict with at least the keys "criterion", "action" and "rationale",
        where "action" is one of the seven known actions. Returns None when no
        usable reply was obtained after three attempts.
    """
    # Single source of truth: the prompt text and the validation below are both
    # derived from this tuple, so an action can never be accepted by the
    # validator while being missing from the list shown to the model.
    valid_actions = (
        "AskUser",
        "SearchDB_startup",
        "SearchDB_report",
        "SearchInternet",
        "RefineOutput",
        "AnalyzeAndVisualize",
        "NoActionNeeded",
    )
    action_list_text = ", ".join(valid_actions)
    action_template_text = "_or_".join(valid_actions)

    system_prompt = (
        "You are an AI assistant finalizing a startup's business report.\n\n"
        "You have 7 possible actions:\n"
        " 1) AskUser       : Need more specific details from user\n"
        " 2) SearchDB_startup     : Need real-world Korean startup company examples\n"
        " 3) SearchDB_report    : Need AI technology trend, statistics information\n"
        " 4) SearchInternet: Need external info from the web\n"
        " 5) RefineOutput  : Have enough info, want to refine/improve writing\n"
        " 6) AnalyzeAndVisualize : Perform data analysis and generate visualizations using retrieved user and market information.\n"
        " 7) NoActionNeeded: Everything is sufficiently addressed\n\n"
        "When deciding, consider any info gaps or low confidence in the 10 criteria.\n\n"
        "Return your decision in JSON with EXACTLY these three keys:\n"
        "  \"criterion\"  -> one of the 10 criteria, or \"None\" if no focus\n"
        f"  \"action\"     -> one of [{action_list_text}]\n"
        "  \"rationale\"  -> a short sentence explaining why you chose this action.\n\n"
        "No extra keys, no disclaimers, no additional text. ONLY JSON."
    )

    def sc_str(d):
        score = d['score'] if d['score'] is not None else 'N/A'
        confidence = d['confidence'] if d['confidence'] is not None else 'N/A'
        return f"Score={score}, Confidence={confidence}%"

    report_summary = "\n".join(f"{k}: {sc_str(v)}" for k, v in report.items())

    accumulated_context = "\n---\n".join(collected_texts)

    # Number the action history so the model can refer to individual steps.
    action_history_text = "\n".join(
        f"[{i}] {entry}" for i, entry in enumerate(action_history, 1)
    )

    user_prompt = (
        f"Current report state:\n{report_summary}\n\n"
        f"Action history so far:\n{action_history_text}\n\n"
        f"Collected contexts:\n{accumulated_context}\n\n"
        "Which single criterion is the biggest priority now, and which action is most appropriate?\n"
        "Also provide a short rationale explaining your choice.\n"
        "Important: Output EXACTLY and ONLY JSON in the following format:\n\n"
        "{\n"
        "  \"criterion\": \"<one_of_the_10_criteria_or_None>\",\n"
        f"  \"action\": \"<{action_template_text}>\",\n"
        "  \"rationale\": \"<short_reason>\"\n"
        "}\n"
    )

    required_keys = ("criterion", "action", "rationale")
    max_tries = 3
    for attempt in range(max_tries):
        raw = llm_call(system_prompt, user_prompt, temperature=0.0)
        print("[ask_llm_for_next_action] Raw LLM Output:\n", raw)  # debug output

        try:
            candidate = raw.strip()
            # Models frequently wrap JSON in a Markdown code fence or add a short
            # preamble; keep only the outermost {...} span before parsing.
            first, last = candidate.find("{"), candidate.rfind("}")
            if first != -1 and last > first:
                candidate = candidate[first:last + 1]
            action_data = json.loads(candidate)
        except (AttributeError, TypeError, ValueError):
            # Not a string, or not valid JSON (JSONDecodeError is a ValueError).
            action_data = None

        if (
            isinstance(action_data, dict)
            and all(key in action_data for key in required_keys)
            and action_data["action"] in valid_actions
        ):
            return action_data

        print(f"⚠️ 액션 JSON 형식 오류(시도 {attempt+1}/{max_tries}), 재시도합니다...")

    return None


def _split_tagged_contexts(collected_texts: list) -> dict:
    """
    Sort collected context strings into buckets by their leading identifier.

    Returns a dict of lists with the keys user_inputs, db_summaries,
    net_summaries, refined_outputs, analysis_outputs, figure_paths and
    general_contexts (anything without a recognised identifier).
    """
    buckets = {
        "user_inputs": [],
        "db_summaries": [],
        "net_summaries": [],
        "refined_outputs": [],
        "analysis_outputs": [],
        "figure_paths": [],
        "general_contexts": [],
    }
    prefixed = (
        ("USER_INPUT:", "user_inputs"),
        ("DB_SUMMARY:", "db_summaries"),
        ("INTERNET_SUMMARY:", "net_summaries"),
        ("REFINED_OUTPUT:", "refined_outputs"),
    )
    analysis_tag = "3C_ANALYSIS:"
    figure_tag = "FIGURE_PATH:"

    for text in collected_texts:
        for tag, bucket_name in prefixed:
            if text.startswith(tag):
                buckets[bucket_name].append(text[len(tag):].strip())
                break
        else:
            if text.startswith("AnalyzeAndVisualize"):
                # Layout: header line, "3C_ANALYSIS: ..." (possibly spanning
                # several lines), then "FIGURE_PATH: ...". Continuation lines of
                # the analysis carry no tag, so track whether we are inside it.
                analysis_lines = []
                inside_analysis = False
                for line in text.split("\n"):
                    if line.startswith(figure_tag):
                        inside_analysis = False
                        path = line[len(figure_tag):].strip()
                        if path:  # an empty path would render a broken image link
                            buckets["figure_paths"].append(path)
                    elif line.startswith(analysis_tag):
                        inside_analysis = True
                        analysis_lines.append(line[len(analysis_tag):].strip())
                    elif inside_analysis:
                        analysis_lines.append(line)
                buckets["analysis_outputs"].append("\n".join(analysis_lines).strip())
            else:
                buckets["general_contexts"].append(text)
    return buckets


def generate_business_report(report: dict, collected_texts: list) -> str:
    """
    Build the final Markdown business report from everything collected so far.

    Steps:
      1) Use the whole context (DB, internet, user input, ...) to generate the
         explanatory text of the report.
      2) Compute Score/Confidence for the 10 criteria from user input ONLY and
         pass the parsed result to update_report_card together with `report`.
      3) Ask the LLM to keep only the relevant references.
      4) Ask the LLM for the final Markdown report, then append the figure
         links and the filtered references.

    Args:
        report: Report card mapping each criterion name to a dict with
            "score" and "confidence" entries (either may be None).
        collected_texts: Context strings; the leading identifier of each one
            (USER_INPUT:, DB_SUMMARY:, ...) decides how it is used.

    Returns:
        The final report as a Markdown string.
    """

    # -- Classify collected_texts by identifier --
    groups = _split_tagged_contexts(collected_texts)
    user_inputs = groups["user_inputs"]
    db_summaries = groups["db_summaries"]
    net_summaries = groups["net_summaries"]
    refined_outputs = groups["refined_outputs"]
    analysis_outputs = groups["analysis_outputs"]
    general_contexts = groups["general_contexts"]
    figure_paths = groups["figure_paths"]

    def joined(entries):
        # Keep individual entries visually separated inside the prompt.
        return "\n\n".join(entries)

    # Structured view of the whole context (used for the explanatory text).
    structured_context = (
        f"**User Input**:\n{joined(user_inputs)}\n\n"
        f"**DB Summaries**:\n{joined(db_summaries)}\n\n"
        f"**Internet Summaries**:\n{joined(net_summaries)}\n\n"
        f"**Refined Outputs**:\n{joined(refined_outputs)}\n\n"
        f"**3C Analysis Outputs**:\n{joined(analysis_outputs)}\n\n"
        f"**Other Contexts**:\n{joined(general_contexts)}"
    )

    # One-line summary per criterion (display only).
    def sc_str(d):
        s = d['score'] if d['score'] is not None else 'N/A'
        c = d['confidence'] if d['confidence'] is not None else 'N/A'
        return f"Score={s}, Confidence={c}%"

    def summarize(report_card):
        return "\n".join(f"{k}: {sc_str(v)}" for k, v in report_card.items())

    report_summary = summarize(report)

    # ------------------------------------------------------
    # (1) Explanatory text (DB / internet context is allowed here)
    # ------------------------------------------------------
    system_prompt_1 = (
        "You are an AI assistant that creates business reports for startups.\n"
        "You have access to user input, as well as references from DB and the internet.\n"
        "Use ALL of that context to refine or improve the textual explanation of the report.\n"
        "However, do NOT provide any new scores or confidences here.\n"
        "Just generate the improved discussion/explanation in plain text."
    )
    user_prompt_1 = (
        f"Current report state:\n{report_summary}\n\n"
        "Below is the structured context collected so far:\n"
        f"{structured_context}\n\n"
        "Please provide an updated, more detailed explanation of the business report, "
        "incorporating any relevant insights from the references."
    )
    refined_report_text = llm_call(system_prompt_1, user_prompt_1, temperature=0.7)

    # ------------------------------------------------------
    # (2) Score / Confidence (user input is the only source)
    # ------------------------------------------------------
    user_only_input_text = "\n".join(user_inputs).strip()
    if not user_only_input_text:
        user_only_input_text = "(No user input provided.)"

    system_prompt_2 = (
        "You are an AI assistant that updates the score and confidence of EXACTLY these 10 criteria:\n"
        "1) \"Clarity of Vision\"\n"
        "2) \"Product-Market Fit\"\n"
        "3) \"Competitive Advantage\"\n"
        "4) \"Team Competency\"\n"
        "5) \"Go-to-Market Strategy\"\n"
        "6) \"Customer Understanding\"\n"
        "7) \"Financial Readiness\"\n"
        "8) \"Scalability Potential\"\n"
        "9) \"Traction & KPIs\"\n"
        "10) \"Fundraising Preparedness\"\n\n"
        "IMPORTANT: For scoring and confidence, you must rely ONLY on the user's input below.\n"
        "Ignore any DB or internet data for the actual scoring.\n\n"
        "You MUST ONLY output valid JSON with these EXACT 10 keys. No more, no less, no renaming.\n"
        "Each key => {\"score\": (1~5), \"confidence\": (0~100)}. No extra text."
    )
    user_prompt_2 = (
        "Below is the user's input (the only source for your scoring):\n"
        f"{user_only_input_text}\n\n"
        "Now output ONLY JSON for the updated score/confidence. "
        "Use exactly the 10 keys listed. No extra keys or text."
    )

    new_report_data = None
    max_tries = 3
    for attempt in range(max_tries):
        raw_json_output = llm_call(system_prompt_2, user_prompt_2, temperature=0.0)
        print("[generate_business_report] Raw JSON from LLM:\n", raw_json_output)  # debug log

        parsed = parse_report_card_json(raw_json_output)
        if parsed is not None:
            new_report_data = parsed
            break
        print(f"⚠️ JSON 형식 오류(시도 {attempt+1}/{max_tries}), 재요청합니다...")

    if new_report_data:
        update_report_card(report, new_report_data)
    else:
        print("❌ 3회 시도 후에도 JSON 파싱 실패. report_card 업데이트를 건너뜁니다.")

    # ------------------------------------------------------
    # (3) Final report in Markdown
    # ------------------------------------------------------
    system_prompt_3 = (
        "You are an AI assistant creating a final business report in Markdown format.\n"
        "We have 10 criteria, each with an updated Score and Confidence.\n\n"
        "The final report structure should be:\n"
        "# Startup Diagnostic Report\n"
        "## Introduction\n"
        "(A short overview of the startup's current status)\n\n"
        "## 3C Analysis\n"
        "### Company\n"
        "(Team, resources, culture, etc.)\n\n"
        "### Competitors\n"
        "(Competitive landscape)\n\n"
        "### Customers\n"
        "(Target segments, user needs, insights)\n\n"
        "## Criteria Evaluation\n"
        "For each of the 10 criteria, create a subsection:\n"
        "### {Criterion Name}\n"
        "- Score: X/5\n"
        "- Confidence: Y%\n"
        "- Rationale:\n"
        "  (Short explanation)\n\n"
        "## Conclusion\n"
        "(Summarize key findings and next steps)\n\n"
        "Only output valid Markdown."
    )

    # Summarize again: update_report_card was given `report` above.
    updated_report_summary = summarize(report)

    # Preliminary reference list: one section per non-empty source.
    reference_sections = (
        ("\n**User Input**\n", "User Input", user_inputs),
        ("\n**DB Summaries (RAG)**\n", "DB Ref", db_summaries),
        ("\n**Internet Summaries**\n", "Net Ref", net_summaries),
        ("\n**Refined Outputs**\n", "Refined", refined_outputs),
        ("\n**3C Analysis Texts**\n", "Analysis", analysis_outputs),
    )
    references_md = "## References\n"
    for heading, label, entries in reference_sections:
        if entries:
            references_md += heading
            for i, entry in enumerate(entries, 1):
                references_md += f"- {label} #{i}: {entry}\n"

    system_prompt_4 = (
        "You are an AI assistant. You have a preliminary references list from user, DB, internet, etc.\n"
        "You also have the final business report context.\n"
        "Your task: read all references, decide which are most relevant or supportive for the final report, and return them in bullet format.\n"
        "If certain references are not directly relevant or redundant, you may omit them.\n\n"
        "Output only the references you consider relevant for the final report.\n"
        "For each item, provide a concise explanation (1~2 lines) of why it is relevant.\n"
    )

    user_prompt_4 = (
        f"Final Report (draft):\n{refined_report_text}\n\n"
        f"Full references:\n{references_md}\n\n"
        "Please filter out any references that are not relevant or are repetitive.\n"
        "Return only the references you think are important for understanding or supporting this business report.\n"
    )

    filtered_references = llm_call(system_prompt_4, user_prompt_4, temperature=0.7)

    user_prompt_3 = (
        f"Updated report card:\n{updated_report_summary}\n\n"
        "Refined text:\n"
        f"{refined_report_text}\n\n"
        "Please produce a comprehensive markdown report with the structure above. "
        "Make sure to include the 3C Analysis and the 10 criteria."
    )

    final_markdown_report = llm_call(system_prompt_3, user_prompt_3, temperature=0.7)

    # Figures are linked by path rather than embedded.
    if figure_paths:
        final_markdown_report += "\n\n## Generated Figures\n"
        for fp in figure_paths:
            final_markdown_report += f"![Analysis Figure]({fp})\n"

    final_markdown_report += "\n\n## Relevant References\n"
    final_markdown_report += filtered_references

    return final_markdown_report

def discussion_loop(final_report: str, all_user_inputs: list):
    """
    Interactive question-and-answer session about the finished report.

    Reads questions with input() until the user types 'quit' (case-insensitive,
    surrounding whitespace ignored). Each question is sent to the LLM together
    with the final report and all user inputs, and the reply is printed.

    Args:
        final_report: The final Markdown report.
        all_user_inputs: The user-provided texts; interpolated into the prompt
            as-is.
    """
    for banner_line in (
        "\n=== [Discussion Mode] ===",
        "최종 보고서 및 유저 입력을 바탕으로 자유롭게 대화가 가능합니다.",
        "종료하려면 'quit'를 입력하세요.\n",
    ):
        print(banner_line)

    # The assistant's role does not change between turns, so build it once.
    discussion_role = (
        "You are an AI assistant discussing the final startup report.\n"
        "You have the final markdown report and all user inputs.\n"
        "Answer any user questions or discuss further improvements.\n"
    )

    user_ask = input("User: ")
    while user_ask.strip().lower() != "quit":
        turn_prompt = (
            f"Final Report:\n{final_report}\n\n"
            f"All User Inputs:\n{all_user_inputs}\n\n"
            f"User's question:\n{user_ask}"
        )
        answer = llm_call(discussion_role, turn_prompt, temperature=0.7)
        print(f"Assistant: {answer}\n")
        user_ask = input("User: ")

    print("Discussion 종료.")

def main_business_report_loop(max_iterations: int = 10):
    """
    Run the interactive report-building session from start to finish.

    Flow:
      1) Read the initial startup description from the user.
      2) Generate a first report from it.
      3) Repeat up to `max_iterations` times: stop if every criterion meets
         CONFIDENCE_THRESHOLD; otherwise ask the LLM for the next action,
         perform it, add the result to the collected contexts and regenerate
         the report. The loop also stops when no valid action could be
         obtained or when the LLM chooses NoActionNeeded.
      4) Print the final report card and enter discussion_loop.

    Args:
        max_iterations: Upper bound on improvement rounds (default 10, the
            value that used to be hard-coded).
    """
    # Everything gathered so far, and a log of the actions taken.
    collected_contexts = []
    action_history = []

    # 1) Initial user information, tagged as USER_INPUT.
    initial_input = input("초기 스타트업 정보를 간략히 입력하세요: ")
    collected_contexts.append(f"USER_INPUT: {initial_input}")

    # 2) First report.
    final_markdown = generate_business_report(report_card, collected_contexts)
    print("[초기 보고서]\n", final_markdown)

    for iteration_count in range(1, max_iterations + 1):
        print(f"\n=== [Iteration {iteration_count}] ===")
        print_report(report_card)

        # (A) Done once every criterion reaches the threshold.
        if all_criteria_above_threshold(report_card, CONFIDENCE_THRESHOLD):
            print("✅ 모든 항목이 threshold를 달성했습니다. 최종 보고서를 출력합니다.")
            break

        # (B) Ask the LLM what to do next.
        action_data = ask_llm_for_next_action(report_card, collected_contexts, action_history)
        if action_data is None:
            print("❌ 3회 시도 후에도 액션 JSON 파싱 실패. 루프를 종료합니다.")
            break

        chosen_criterion = action_data["criterion"]
        chosen_action = action_data["action"]
        rationale = action_data["rationale"]

        print(f"[LLM 결정] 다음에 집중할 항목: {chosen_criterion}")
        print(f"[LLM 결정] 선택된 액션: {chosen_action}")
        print(f"[LLM 결정] 사유(rationale): {rationale}")

        if chosen_action == "NoActionNeeded":
            print("LLM이 NoActionNeeded를 선택했습니다. 보고서를 종료합니다.")
            break

        # (C) Perform the action to obtain new information.
        action_result = perform_action(chosen_action, chosen_criterion, collected_contexts)
        print(f"[Action 결과] {action_result}")

        # Record the step in the action history ...
        action_history.append(
            f"Iteration={iteration_count}, "
            f"Action={chosen_action}, "
            f"Criterion={chosen_criterion}, "
            f"Rationale={rationale}, "
            f"Result={action_result}"
        )
        # ... and make the result available as context.
        collected_contexts.append(action_result)

        # (D) Regenerate the report with the enlarged context.
        final_markdown = generate_business_report(report_card, collected_contexts)
        print("[LLM 최종 보고서(마크다운)]\n", final_markdown)

    # Final output once the loop has ended.
    print("\n=== 최종 스타트업 보고서 ===")
    print_report(report_card)
    print("보고서 생성이 완료되었습니다. ✅")

    # (F) Hand the report and the raw user inputs to the discussion mode.
    tag = "USER_INPUT:"
    all_user_inputs = [
        ctx[len(tag):].strip() for ctx in collected_contexts if ctx.startswith(tag)
    ]
    discussion_loop(final_markdown, all_user_inputs)


# Entry point: start the report-building session (it prompts for input via
# input(), so it needs an interactive front end) when this code is run as the
# main module.
if __name__ == "__main__":
    main_business_report_loop()


초기 스타트업 정보를 간략히 입력하세요: > Company: MediTalk AI >  >  > **Founded:** 2022, Seoul, Korea >  > **Vision:** *"We want to make hospital appointments easier."* >  > **Problem & Product-Market Fit:** >  > Making a hospital appointment is inconvenient, so we built an app that makes it easier. >  > **Competitive Advantage:** >  > Nicely designed app with a large list of hospitals. >  > **Team Competency:** >  > CEO studied computer science. 5 total team members. >  > **Go-to-Market Strategy:** >  > Launch on app stores and hope users download it. >  > **Customer Understanding:** >  > Targeting people who find going to the hospital annoying. >  > **Financial Readiness:** >  > No funding received yet. >  > **Scalability Potential:** >  > Plan to add hospitals nationwide. >  > **Traction & KPIs:** >  > 3,000 downloads, 100 daily active users. >  > **Fundraising Preparedness:** >  > Looking to raise investment. >
[generate_business_report] Raw JSON from LLM:
 {
  "Clarity of Vision": {"score": 3, "c