In [1]:
from dotenv import load_dotenv

In [2]:
# Run before any os.environ / os.getenv lookup below so the keys are available.
# NOTE(review): presumably reads a local .env file (python-dotenv) — confirm its location.
load_dotenv()

True

In [3]:
import os

In [4]:
from supabase import create_client, Client

In [8]:
# Supabase project URL; None when SUPABASE_URL is not set in the environment.
supabase_url = os.getenv("SUPABASE_URL")

In [6]:
# Supabase API key. The original lookup used "SUPBASE_KEY", which looks like a
# typo; prefer the correctly spelled SUPABASE_KEY and fall back to the legacy
# name so existing .env files keep working. None when neither is set.
supabase_api_key = os.environ.get("SUPABASE_KEY") or os.environ.get("SUPBASE_KEY")

In [7]:
# Shared Supabase client, built from the URL and key read from the environment above.
# NOTE(review): the recorded execution counts show this cell (In [7]) ran before the
# SUPABASE_URL cell (In [8]); re-verify with Restart Kernel -> Run All.
supabase: Client = create_client(supabase_url, supabase_api_key)

In [35]:
import anthropic

In [36]:
# Shared Anthropic client; the key is read from the CLAUDE_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so api_key may be None here.
client_anthropic = anthropic.Anthropic(api_key = os.getenv('CLAUDE_API_KEY'))

In [37]:
import json

In [92]:
import re


In [70]:
def fetch_result(date, next_date):
    """Fetch rows from the ``webhooks`` table created in ``[date, next_date)``, de-duplicated.

    Args:
        date: Inclusive lower bound compared against ``created_at``
            (e.g. ``'2026-02-21'``).
        next_date: Exclusive upper bound compared against ``created_at``.

    Returns:
        list[dict]: Rows ordered by ``created_at`` ascending, keeping only the
        first row seen for each distinct ``news_output`` value.
    """
    response = (
        supabase.table('webhooks')
        .select('id', 'news_output', 'source_urls', 'news_date', 'monitor_type', 'created_at')
        .gte('created_at', date)
        .lt('created_at', next_date)
        .order('created_at', desc=False)
        .execute()
    )

    seen_outputs = set()     # hashable news_output values already kept (O(1) lookup)
    seen_unhashable = []     # fallback for values that cannot be hashed (e.g. list/dict)
    unique_results = []

    for row in response.data or []:
        news_output = row['news_output']
        try:
            if news_output in seen_outputs:
                continue
            seen_outputs.add(news_output)
        except TypeError:
            # Unhashable value: fall back to the equality scan the original code used.
            if news_output in seen_unhashable:
                continue
            seen_unhashable.append(news_output)
        unique_results.append(row)

    return unique_results

In [72]:
# Fetch the de-duplicated rows for one day: created_at >= 2026-02-21 and < 2026-02-22.
# NOTE(review): the dates are hard-coded even though the variable is named "today";
# update them (or derive them from the current date) before re-running.
today_result = fetch_result('2026-02-21', '2026-02-22')

# Bare last expression so the notebook displays the fetched rows.
today_result

[{'id': 1297,
  'news_output': 'The Guardian (citing the Financial Times) reports on large proposed investments into OpenAI (coverage of Nvidia and other potential investors) and notes earlier reporting of a larger $100bn figure that was later reduced in subsequent coverage — a major funding/ownership development in the AI sector.',
  'source_urls': ['https://www.theguardian.com/technology/2026/feb/20/nvidia-investment-openai-chatgpt-funding-round-ai-artificial-intelligence'],
  'news_date': '2026-02-21',
  'monitor_type': 'general',
  'created_at': '2026-02-21T03:09:01.779963+00:00'},
 {'id': 1298,
  'news_output': 'Coverage of the India AI Impact Summit reports large infrastructure-linked investment commitments (over $250 billion cited by Indian ministers) and an AI Impact Summit declaration signed by major countries — significant policy and investment-level commitments affecting AI deployment and international collaboration.',
  'source_urls': ['https://m.economictimes.com/news/news

In [93]:
def _extract_json_block(text: str) -> str:
    """Strip surrounding whitespace and, if present, one enclosing Markdown code fence.

    Only a fence at the very start of the text is handled. The opening fence line
    (```` ``` ```` or ```` ```json ````) is dropped and everything from the last
    closing fence onward is discarded. A fence with no newline after it yields an
    empty string instead of raising ``IndexError``.
    """
    text = text.strip()
    if text.startswith("```"):
        _, _, body = text.partition("\n")
        text = body.rsplit("```", 1)[0].strip()
    return text


def _escape_quotes_inside_output(json_text: str) -> str:
    """Escape unescaped double quotes inside the value of every ``"output": "..."`` field.

    Targets ONLY ``output`` values so the rest of the JSON structure is untouched.

    Heuristic: inside an output value, a ``"`` is treated as the end of the string
    when the next non-whitespace character is ``,``, ``}`` or ``]``; any other
    ``"`` is escaped. An inner quote that happens to be followed by one of those
    characters is therefore misread as the end of the string.
    """
    value_start = re.compile(r'\s*:\s*"')
    key = '"output"'
    i = 0
    n = len(json_text)
    out = []

    while i < n:
        j = json_text.find(key, i)
        if j == -1:
            out.append(json_text[i:])
            break

        # Copy everything up to and including the key itself.
        out.append(json_text[i:j])
        out.append(key)
        i = j + len(key)

        # Match `: "` (optional whitespace) at position i without slicing the string.
        m = value_start.match(json_text, i)
        if not m:
            # Unexpected structure (e.g. the key text inside another string); keep scanning.
            continue

        out.append(m.group(0))
        i = m.end()

        # Now inside the output string content.
        escaped = False
        while i < n:
            ch = json_text[i]

            if escaped:
                # Character following a backslash is copied verbatim.
                out.append(ch)
                escaped = False
                i += 1
                continue

            if ch == '\\':
                out.append(ch)
                escaped = True
                i += 1
                continue

            if ch == '"':
                # Either the end of the string or an unescaped inner quote.
                k = i + 1
                while k < n and json_text[k].isspace():
                    k += 1

                if k < n and json_text[k] in (',', '}', ']'):
                    out.append('"')  # end of output string
                    i += 1
                    break

                out.append('\\"')  # escape inner quote
                i += 1
                continue

            out.append(ch)
            i += 1

    return ''.join(out)


def _safe_load_llm_json(text: str):
    """Parse JSON from a model reply, repairing unescaped quotes in ``output`` values.

    Parsing is attempted first; the brace/quote balance heuristic is consulted only
    after both the direct parse and the repaired parse fail. Counting raw characters
    up front rejected valid JSON whose strings contain braces or an odd number of
    escaped quotes, and blocked the repair path.

    Raises:
        ValueError: if parsing fails and the text has unbalanced braces/quotes
            (likely truncated output).
        json.JSONDecodeError: if parsing fails for any other reason.
    """
    cleaned = _extract_json_block(text)

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Repair the most common failure: unescaped quotes inside "output" values.
    repaired = _escape_quotes_inside_output(cleaned)
    try:
        return json.loads(repaired)
    except json.JSONDecodeError as exc:
        if cleaned.count('{') != cleaned.count('}') or (cleaned.count('"') % 2 != 0):
            raise ValueError(
                "Model output looks truncated (unbalanced braces/quotes). "
                "Increase max_tokens or reduce output size."
            ) from exc
        raise


def claude_article_builder(researched_data, model="claude-sonnet-4-5", max_tokens=10000):
    """Ask Claude to select and categorise news items, and return the parsed JSON reply.

    Args:
        researched_data: Raw news rows (e.g. the list returned by ``fetch_result``).
            Non-string values are serialised with ``json.dumps`` so the model sees
            real JSON rather than a Python repr; a string is inserted as-is.
        model: Model name passed to ``client_anthropic.messages.create``.
        max_tokens: Output token limit passed to ``client_anthropic.messages.create``.

    Returns:
        The decoded JSON value from the model's reply.

    Raises:
        ValueError: if the reply was cut off (``stop_reason == "max_tokens"``) or
            looks truncated after a failed parse.
        json.JSONDecodeError: if the reply is not valid JSON even after repair.
    """
    if isinstance(researched_data, str):
        raw_articles = researched_data
    else:
        # default=str keeps non-JSON-native values (e.g. datetimes) from raising.
        raw_articles = json.dumps(researched_data, ensure_ascii=False, indent=2, default=str)

    # NOTE: inside an f-string `{{` renders as a single `{`; the original `{{{{`
    # rendered as `{{`, which made the "STRICT JSON" example invalid JSON.
    prompt = f"""
        You are the editor of Krux, a news platform that helps GenZ stay informed about AI & tech in under 60 seconds.
        The readers of your platform are:
        1. AI & tech enthusiasts, who want to read and stay updated about AI
        2. Product Managers, Marketing managers, designers, software engineers who want to stay updated about the trends about AI
        and how they can use AI in their lives. 
        3. Investors and founders who want to know about major market shifts and competitive dynamics in AI — 
        not every funding round, only the ones that signal something bigger.
        
        This is your job:
        1. Out of all the articles that our news aggregator has collected you need to select 15-16 interesting topics to cover,
        these will then be sent to a research assistant to do a deeper analysis on.
        2. Your job is JUST AGGREGATION, you are NOT supposed to change anything or any source URLs.
        3. In case there are the EXACT same topic you should combine both of them and merge the source + the brief.
        4. And you need to categorise each of the news pieces into:
        a. Funding 
        b. Model announcements/enhancements
        c. Workflow improvement
        d. Report
        e. Others

        NON-NEGOTIABLE AGGREGATION RULES:
          1. DO NOT paraphrase, rewrite, summarize, shorten, or expand any selected item's `output`.
          2. For each selected item, `output` must be copied EXACTLY from one raw `news_output` entry.
          3. If combining duplicate stories, you may only:
             - keep one `output` exactly as-is from one chosen base item
             - append additional `sources` URLs from exact duplicates
             - never synthesize or merge text across multiple outputs
          4. If no exact duplicate exists, keep the item as a 1:1 copy.
        
       SELECTION CRITERIA (in priority order):
        1. "Would a working professional (PM, engineer, designer, marketer) share this in their team Slack?" — 
        if yes, this story is HIGH priority regardless of category.
        2. "Does this change how someone works, builds, or makes decisions this week?" — if yes, HIGH priority.
        3. "Is this just a funding announcement with no product or strategic insight?" — 
        if yes, LOW priority. Only include if the round signals a major market shift (e.g., $1B+ rounds that reshape competitive dynamics).
        4. Funding stories about companies most readers haven't heard of should almost never make the cut 
        unless the technology itself is breakthrough.
        
        BALANCE CHECK (soft):
        After selecting your 15-16 stories, count the funding stories. 
        If more than 5 are pure funding with no product angle, drop the weakest ones and replace with stories from other categories.
        
          Then you need to put it in a STRICT JSON which is as follows (not per-item only)

          
          ANTI-HALLUCINATION OUTPUT RULES (MANDATORY):
          1. Every selected item must include:
             - "id" (integer) copied from input row
             - "news_date" (YYYY-MM-DD) copied from input row
             - "output" copied EXACTLY from that same row's `news_output`
          2. Do not invent or transform ids, dates, outputs, or URLs.
          3. "sources" must be built only from that row’s `source_urls` (or exact-duplicate rows if merged).
          4. If any item cannot satisfy exact match, do not select it.
          
          EXAMPLE:
        
          {{
        "selected_total": 15,
        "mix_summary": {{
          "funding": 5,
          "model_announcements_enhancements": 3,
          "workflow_improvement": 4,
          "report": 2,
          "others": 1
        }},
        "selection_notes": "Used 9/6 split because only 6 high-confidence practical stories were available.",
        "items": [
          {{
        "output": "...",
        "sources": [{{"name": "...", "url": "..."}}],
        "topic": "Funding",
        "id": 1309,
        "news_date": "2026-02-21"
          }}
        ]
          }}

          

          Here is the JSON file with all the raw news articles collected:

          {raw_articles}
          
          """

    response = client_anthropic.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )

    # Assumes the first content block of the reply is a text block.
    output = response.content[0].text
    print(output.strip())

    # Authoritative truncation signal from the API, checked before any parsing.
    if getattr(response, "stop_reason", None) == "max_tokens":
        raise ValueError(
            "Model output was truncated (stop_reason='max_tokens'). "
            "Increase max_tokens or reduce output size."
        )

    article_json = _safe_load_llm_json(output)
    return article_json

In [None]:
# Send the day's de-duplicated rows to Claude; the function prints the raw reply
# and, as the cell's last expression, the parsed JSON is displayed.
# NOTE(review): the result is not assigned to a variable, so later cells cannot reuse it.
claude_article_builder(today_result)