In [1]:
import os
import json
import google.generativeai as genai
from tavily import TavilyClient
from dotenv import load_dotenv
from IPython.display import display, Markdown, JSON
import google.generativeai as genai
import urllib.parse
import json
import os
import requests
import base64
from dotenv import load_dotenv
import os
import sys
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables
load_dotenv()

# --- CONFIGURATION ---
# Replace these with your actual keys or ensure they are in your .env file
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

if not GEMINI_API_KEY or not TAVILY_API_KEY:
    print("‚ö†Ô∏è WARNING: Please set GEMINI_API_KEY and TAVILY_API_KEY in your environment or this cell.")

# Configure Clients
genai.configure(api_key=GEMINI_API_KEY)
#TODO experiment with different models
model = genai.GenerativeModel('gemini-2.5-pro') 
tavily = TavilyClient(api_key=TAVILY_API_KEY)

In [3]:
### --- BOLAGSVERKET AUTHENTICATION & API CAllS ---
## Not needed atm 

# Load environment variables from a .env file
load_dotenv()

CLIENT_ID = os.getenv("BOLAGSVERKET_CLIENT_ID", "YOUR_CLIENT_ID")
CLIENT_SECRET = os.getenv("BOLAGSVERKET_CLIENT_SECRET", "YOUR_CLIENT_SECRET")

# Endpoints documented by Bolagsverket
TOKEN_URL = "https://portal.api.bolagsverket.se/oauth2/token"
# Base URL found in documentation for V√§rdefulla datam√§ngder
API_BASE_URL = "https://gw.api.bolagsverket.se/vardefulla-datamangder/v1"

def get_access_token():
    """Authenticates with Bolagsverket and returns an access token."""
    
    # Encode client_id:client_secret in base64 for Basic Auth header
    creds = f"{CLIENT_ID}:{CLIENT_SECRET}"
    creds_b64 = base64.b64encode(creds.encode("utf-8")).decode("utf-8")

    headers = {
        "Authorization": f"Basic {creds_b64}",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    data = {
        "grant_type": "client_credentials",
        # needed for read / ping access 
        "scope": "vardefulla-datamangder:read vardefulla-datamangder:ping" 
    }

    try:
        response = requests.post(TOKEN_URL, headers=headers, data=data)
        response.raise_for_status()
        token_data = response.json()
        return token_data["access_token"]
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching token: {e}")
        if response.content:
            print(f"Details: {response.content}")
        return None


def search_company(org_number, token):
    """
    Fetches company information using the provided organization number and access token.
    Args:
        org_number (str or int): The 10-digit tax number of the organization. If the input 
            is not 10 digits, it will be zero-padded to ensure the correct format.
        token (str): The access token required for authentication.
    Returns:
        dict: A dictionary containing the company information retrieved from the API, 
            if the request is successful.
        None: If the request fails, returns None and logs the error details.
    Notes:
        - The organization number should be provided without any dashes (e.g., "5560160680").
        - Ensure that the `API_BASE_URL` variable is defined and points to the correct API endpoint.
        - The function prints detailed error messages to help debug issues with the API response.
    """
    
    url = f"{API_BASE_URL}/organisationer"
    
    # ---------------------------------------------------------
    # Use tax id of comapny -> without
    # ---------------------------------------------------------
    payload = {
        "identitetsbeteckning": org_number
    }

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching data: {e}")
        # Print detailed error to debug schema issues
        print(f"Response: {response.text}")
        return None
    


In [4]:
def generate_queries(company_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries based on the entity name.
    """
    input_data = f"""
    <user>
        <input>
            <company_info>
                <name>{company_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
            </company_info>
        </input>
    </user>
    """
    
    with open("../app/services/prompts/query_internet.xml", "r") as file:
        QUERY_INTERNET_PROMPT = file.read()
    full_prompt = QUERY_INTERNET_PROMPT + input_data
    
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        data = json.loads(response.text)
        # Extract just the query strings from the structured objects
        queries = [q["query"] for q in data.get("queries", [])]
        # Deduplicate
        return list(set(queries))
    except Exception as e:
        print(f"Error parsing Gemini response: {e}")
        return [f"{company_name} official website", f"{company_name} Sweden annual report"]

def perform_search(queries: list[str], query_number: int = 20):
    """
    Step 2: Execute search using Tavily API.
    """
    aggregated_context = []
    
    print(f"üîé Executing {query_number} out of {len(queries)} search queries...")
    
    # To save tokens/API credits, we might limit queries here
    for query in queries[:query_number]: # Limit to top k queries for this demo
        try:
            print(f"   -> Searching: '{query}'")
            # Tavily 'search' returns structured results with content
            response = tavily.search(query=query, search_depth="advanced", max_results=5)
            
            for result in response.get("results", []):
                aggregated_context.append(f"Source: {result['url']}\nContent: {result['content']}\n---")
        except Exception as e:
            print(f"   x Error searching '{query}': {e}")
            
    return "\n".join(aggregated_context)

def structure_data(company_name: str, search_context: str):
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.
    """
    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context}
                </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
            </entity_context>
        </input>
    </user>
    """
    
    with open("../app/services/prompts/structure_data.xml", "r") as file:
        STRUCTURE_DATA_PROMPT = file.read()
    full_prompt = STRUCTURE_DATA_PROMPT + input_data
    #print(full_prompt)
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        return json.loads(response.text)
    except Exception as e:
        print(f"Error structuring data: {e}")
        return {"error": response.text}
    


def generate_queries_funds(fund_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries for a Fund/Investor.
    Uses the 'query_investor_web.xml' prompt.
    """
    # 1. Construct the Input XML strictly matching the prompt's expected structure
    input_data = f"""
    <user>
        <input>
            <investor_info>
                <name>{fund_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
                <country_code>SE</country_code>
            </investor_info>
            
            <missing_fields>
                <field>description</field>
                <field>investment_thesis</field>
                <field>sectors</field>
                <field>website</field>
                <field>key_people</field>
            </missing_fields>
        </input>
    </user>
    """
    
    # 2. Load the specific Investor Prompt
    # Note: Adjust path if running from a different directory (e.g. '../src/app/...' from notebooks)
    prompt_path = "../app/services/prompts/query_internet_funds.xml"
    
    try:
        with open(prompt_path, "r") as file:
            QUERY_INVESTOR_PROMPT = file.read()
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Warning: Prompt file not found at {prompt_path}. Check your path.")
        return [f"{fund_name} investment thesis", f"{fund_name} portfolio sectors"]

    full_prompt = QUERY_INVESTOR_PROMPT + input_data
    
    # 3. Call Gemini
    try:
        response = model.generate_content(
            full_prompt, 
            generation_config={"response_mime_type": "application/json"}
        )
        
        # 4. Parse Response
        data = json.loads(response.text)
        
        # Extract just the query strings
        queries = [q["query"] for q in data.get("queries", [])]
        
        # Deduplicate and return
        return list(set(queries))
        
    except Exception as e:
        print(f"Error generating/parsing fund queries: {e}")
        # Fallback queries if LLM fails
        return [
            f"{fund_name} investment thesis", 
            f"{fund_name} official website", 
            f"{fund_name} portfolio"
        ]
        
    

In [5]:
def structure_data_fund(company_name: str, search_context: str, org_id: str = None) -> dict:
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.
    Uses 'structure_data.xml' to extract fields like description, mission, key_people, etc.
    """
    # 1. Define path to the generic company prompt
    prompt_path = "../app/services/prompts/structure_data_fund.xml"
    
    # 2. Construct Input XML (matching the prompt's expected <user><input>... structure)
    # We provide basic Bolagsverket info if we have it, otherwise just the name/ID context.
    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Organization Number: {org_id or "Unknown"}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context[:20000]} </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
                <known_org_number>{org_id or ""}</known_org_number>
            </entity_context>
        </input>
    </user>
    """
    
    try:
        # 3. Load the Prompt File
        with open(prompt_path, "r") as file:
            STRUCTURE_DATA_PROMPT = file.read()
            
        # 4. Combine and Generate
        full_prompt = STRUCTURE_DATA_PROMPT + input_data
        
        response = model.generate_content(
            full_prompt, 
            generation_config={"response_mime_type": "application/json"}
        )
        
        return json.loads(response.text)

    except FileNotFoundError:
        print(f"‚ùå Error: Prompt file not found at {prompt_path}")
        return {}
    except Exception as e:
        print(f"‚ùå Error structuring data for {company_name}: {e}")
        return {}

In [6]:
def run_pipeline(company_name: str):
    """
    Runs the data pipeline for the given company name.
    """
    print(f"Starting Scraper for: {company_name}\n")

    # Generate Queries
    queries = generate_queries(company_name)
    print("‚úÖ Generated Queries:")
    print(json.dumps(queries[:2], indent=2))

    # Scrape Web
    if queries:
        search_context = perform_search(queries, query_number=20)
        print(f"\n‚úÖ Retrieved {len(search_context)} characters of context.")
    else:
        search_context = ""
        print("‚ùå No queries generated.")

    # Structure Data
    if search_context:
        print("\nüß† Structuring data with Gemini...")
        structured_data = structure_data(company_name, search_context)
        
        print("\n‚ú® FINAL JSON OUTPUT:")
        display(JSON(structured_data))
    else:
        print("‚ùå Skipping structuring due to lack of context.")
    return structured_data


In [7]:
# RTODO delete cell
import re
from typing import Any, Dict
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.db.neo4j_client import get_driver
# Import the query functions
from app.db.queries.company_queries import upsert_company
from app.db.queries.investor_queries import upsert_investor
from app.db.queries.relationship_queries import add_ownership

# =============================================================================
# Helper: ID Formatting (Safe for UUIDs)
# =============================================================================
def format_org_id(org_id: str) -> str:
    """
    Ensures Swedish organization numbers are formatted as xxxxxx-xxxx.
    SAFEGUARD: If the ID is a UUID (longer than 20 chars) or doesn't match 
    the 10-digit pattern, it returns the original value unchanged.
    """
    if not org_id:
        return org_id
    
    s_id = str(org_id)
    
    # 1. Safety Check: UUIDs are 36 chars. Swedish Org IDs are max 13 chars.
    if len(s_id) > 20:
        return s_id
    
    # 2. Clean: Remove non-digits to check raw length
    clean_id = re.sub(r'\D', '', s_id)
    
    # 3. Format: Only applying if strictly 10 digits
    if len(clean_id) == 10:
        return f"{clean_id[:6]}-{clean_id[6:]}"
    
    # Return original otherwise
    return s_id

# =============================================================================
# Main Ingestion Logic
# =============================================================================

def ingest_company_full(data: Dict[str, Any]) -> None:
    """
    Ingests a company and its shareholders (as Investors/Funds), 
    and links them using the add_ownership method.
    """
    
    # -------------------------------------------------------
    # 1. Prepare Main Company Data
    # -------------------------------------------------------
    raw_id = data.get("organization_id")
    if not raw_id:
        print(f"Skipping {data.get('name')}: No organization_id")
        return
        
    # Apply standard formatting (Safe for UUIDs)
    company_id = format_org_id(raw_id)

    company_data = {
        "company_id": company_id,
        "name": data.get("name"),
        "country_code": data.get("country_code", "SE"),
        "description": data.get("description", ""),
        "mission": data.get("mission", ""),
        "year_founded": data.get("year_founded"),
        "num_employees": data.get("num_employees"),
        "website": data.get("website", ""),
        "sectors": data.get("sectors", []),
        "aliases": data.get("aliases", []),
        "key_people": data.get("key_people", []),
    }

    # -------------------------------------------------------
    # 2. Upsert Main Company
    # -------------------------------------------------------
    try:
        upsert_company(company_data)
        print(f"‚úÖ Upserted Company: {company_data['name']} ({company_id})")
    except Exception as e:
        print(f"‚ùå Error upserting company: {e}")
        return

    # -------------------------------------------------------
    # 3. Process Shareholders (Shareholder OWNS Company)
    # -------------------------------------------------------
    shareholder_items = data.get("shareholders", [])
    if shareholder_items:
        print(f"   Processing {len(shareholder_items)} shareholders...")
        for item in shareholder_items:
            process_related_entity(item, main_company_id=company_id, relationship="shareholder") 


def process_related_entity(item: Dict[str, Any], main_company_id: str, relationship: str):
    """
    Helper to upsert a related entity (Fund/Investor) and link it.
    Updates the entity_id if an official Organization ID is found during the search.
    """
    name = item.get("name")
    if not name:
        return

    # A. Initial Setup
    # Start with provided ID (formatted if valid) or None
    initial_id = item.get("entity_id")
    if initial_id:
        initial_id = format_org_id(initial_id)
    
    investor_data = {
        "company_id": initial_id,
        "name": name,
        "country_code": "SE",
        "description": f"Ingested as {relationship} of {main_company_id}",
        "sectors": [],
        "website": "",
        "key_people": [],
        "investment_thesis": ""
    }

    # B. Agentic Enrichment Loop
    try:
        # 1. Generate Queries
        queries = generate_queries_funds(name)
        
        # 2. Perform Search
        if queries:
            raw_results = perform_search(queries, 20)
            
            # 3. Structure Data using the NEW method (structure_data_fund)
            if raw_results:
                enriched_info = structure_data_fund(raw_results, name, initial_id)
                
                if enriched_info:
                    # Filter empty values
                    clean_enriched = {k: v for k, v in enriched_info.items() if v}
                    investor_data.update(clean_enriched)
                    
                    # CRITICAL: If search found an organization_id, use it as the definitive ID
                    found_org_id = clean_enriched.get("organization_id")
                    
                    if found_org_id:
                        # Enforce format on the found ID (safe for UUIDs)
                        formatted_found_id = format_org_id(found_org_id)
                        investor_data["company_id"] = formatted_found_id
                        
                        # Preserve original name as alias if different
                        if "aliases" not in investor_data:
                            investor_data["aliases"] = []
                        if name != investor_data["name"] and name not in investor_data["aliases"]:
                            investor_data["aliases"].append(name)
                            
                        print(f"      üîç Found official ID for {name}: {formatted_found_id}")
                        
                    elif not investor_data.get("company_id"):
                        # If no official ID found and no initial ID, generate a UUID based on name
                        investor_data["company_id"] = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
                        
                print(f"      ‚ú® Enriched '{investor_data.get('name')}': {len(investor_data.get('sectors', []))} sectors")

    except Exception as e:
        print(f"      ‚ö†Ô∏è Enrichment pipeline error for {name}: {e}")

    # C. Upsert & Link using the FINAL ID
    try:
        # Ensure we have a fall back ID if everything else failed
        if not investor_data.get("company_id"):
             investor_data["company_id"] = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))

        # 1. Upsert using the standard imported method
        # Note: This will only save fields supported by the standard upsert_investor query
        upsert_investor(investor_data)
        
        # 2. Link based on relationship direction
        if relationship == "shareholder":
            # (Fund)-[:OWNS]->(Company)
            add_ownership(
                owner_id=investor_data["company_id"], 
                company_id=main_company_id,
                properties={"source": "llm_ingest"}
            )
            print(f"      ‚Ü≥ Linked Shareholder: {name} ({investor_data['company_id']})")

    except Exception as e:
        print(f"      ‚ùå Failed to DB sync '{name}': {e}")

In [8]:
import re
import logging

# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.services.portfolio_ingestion import ingest_company_with_portfolio
from app.services.portfolio_ingestion import lookup_org_number_from_web
from app.db.queries.relationship_queries import add_ownership

import re
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.db.neo4j_client import get_driver
# Import the query functions
from app.db.queries.company_queries import upsert_company
from app.db.queries.investor_queries import upsert_investor
from app.db.queries.relationship_queries import add_ownership

# =============================================================================
# Helper: ID Formatting (Safe for UUIDs)
# =============================================================================
def format_org_id(org_id: str) -> str:
    """
    Ensures Swedish organization numbers are formatted as xxxxxx-xxxx.
    SAFEGUARD: If the ID is a UUID (longer than 20 chars) or doesn't match 
    the 10-digit pattern, it returns the original value unchanged.
    """
    if not org_id:
        return org_id
    
    s_id = str(org_id)
    
    # 1. Safety Check: UUIDs are 36 chars. Swedish Org IDs are max 13 chars.
    if len(s_id) > 20:
        return s_id
    
    # 2. Clean: Remove non-digits to check raw length
    clean_id = re.sub(r'\D', '', s_id)
    
    # 3. Format: Only applying if strictly 10 digits
    if len(clean_id) == 10:
        return f"{clean_id[:6]}-{clean_id[6:]}"
    
    # Return original otherwise
    return s_id

# =============================================================================
# Main Ingestion Logic
# =============================================================================

def ingest_company_full(data: Dict[str, Any]) -> None:
    """
    Ingests a company and its shareholders (as Investors/Funds), 
    and links them using the add_ownership method.
    """
    
    # -------------------------------------------------------
    # 1. Prepare Main Company Data
    # -------------------------------------------------------
    raw_id = data.get("organization_id")
    if not raw_id:
        print(f"Skipping {data.get('name')}: No organization_id")
        return
        
    # Apply standard formatting (Safe for UUIDs)
    company_id = format_org_id(raw_id)

    company_data = {
        "company_id": company_id,
        "name": data.get("name"),
        "country_code": data.get("country_code", "SE"),
        "description": data.get("description", ""),
        "mission": data.get("mission", ""),
        "year_founded": data.get("year_founded"),
        "num_employees": data.get("num_employees"),
        "website": data.get("website", ""),
        "sectors": data.get("sectors", []),
        "aliases": data.get("aliases", []),
        "key_people": data.get("key_people", []),
    }

    # -------------------------------------------------------
    # 2. Upsert Main Company
    # -------------------------------------------------------
    try:
        upsert_company(company_data)
        print(f"‚úÖ Upserted Company: {company_data['name']} ({company_id})")
    except Exception as e:
        print(f"‚ùå Error upserting company: {e}")
        return

    # -------------------------------------------------------
    # 3. Process Shareholders (Shareholder OWNS Company)
    # -------------------------------------------------------
    shareholder_items = data.get("shareholders", [])
    if shareholder_items:
        print(f"   Processing {len(shareholder_items)} shareholders...")
        for item in shareholder_items:
            process_related_entity(item, target_company_id=company_id) 


def process_related_entity(
    shareholder_name: str, 
    target_company_id: str, 
    ownership_pct: float = None
):
    """
    Process a single shareholder:
    1. Looks up their Org Number from the web.
    2. Triggers full portfolio ingestion for that shareholder (Recursive/Viral growth).
    3. Connects them to the target company (Shareholder -[OWNS]-> Target).
    """
    logger = logging.getLogger(__name__)
    # 1. Lookup Org Number using the helper function
    logger.info(f"Looking up organization number for shareholder: {shareholder_name}")
    shareholder_org_id = lookup_org_number_from_web(shareholder_name)
    
    if not shareholder_org_id:
        logger.warning(f"Could not find valid org number for shareholder '{shareholder_name}'. Skipping.")
        return None
        
    # 2. Ingest the shareholder fully
    # This calls the main entry point to download THEIR FI documents and build THEIR portfolio
    logger.info(f"Triggering full ingestion for shareholder: {shareholder_name} ({shareholder_org_id})")
    try:
        # We reuse the main ingestion function here
        ingest_company_with_portfolio(shareholder_org_id, shareholder_name)
    except Exception as e:
        logger.error(f"Error during ingestion of shareholder {shareholder_name}: {e}")
        # We continue even if ingestion fails, to at least try linking the node
        
    # 3. Create the relationship: Shareholder OWNS Target Company
    # (The ingest function above created the Node, now we link it 'down' to the current company)
    logger.info(f"Linking {shareholder_name} -> OWNS -> {target_company_id}")
    try:
        properties = {}
        if ownership_pct is not None:
            properties["share_percentage"] = float(ownership_pct)
            
        add_ownership(
            owner_id=shareholder_org_id,
            company_id=target_company_id,
            properties=properties
        )
        return shareholder_org_id
        
    except Exception as e:
        logger.error(f"Error creating ownership relationship for {shareholder_name}: {e}")
        return None

BeautifulSoup4 not available - Allabolag scraping will be limited


In [9]:
import re
import logging
import uuid
from typing import Any, Dict
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.services.portfolio_ingestion import ingest_company_with_portfolio
from app.services.portfolio_ingestion import lookup_org_number_from_web
from app.db.queries.relationship_queries import add_ownership

import re
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.db.neo4j_client import get_driver
# Import the query functions
from app.db.queries.company_queries import upsert_company
from app.db.queries.investor_queries import upsert_investor
from app.db.queries.relationship_queries import add_ownership

# =============================================================================
# Helper: ID Formatting (Safe for UUIDs)
# =============================================================================
def format_org_id(org_id: str) -> str:
    """
    Ensures Swedish organization numbers are formatted as xxxxxx-xxxx.
    SAFEGUARD: If the ID is a UUID (longer than 20 chars) or doesn't match 
    the 10-digit pattern, it returns the original value unchanged.
    """
    if not org_id:
        return org_id
    
    s_id = str(org_id)
    
    # 1. Safety Check: UUIDs are 36 chars. Swedish Org IDs are max 13 chars.
    if len(s_id) > 20:
        return s_id
    
    # 2. Clean: Remove non-digits to check raw length
    clean_id = re.sub(r'\D', '', s_id)
    
    # 3. Format: Only applying if strictly 10 digits
    if len(clean_id) == 10:
        return f"{clean_id[:6]}-{clean_id[6:]}"
    
    # Return original otherwise
    return s_id

# =============================================================================
# Main Ingestion Logic
# =============================================================================

def ingest_company_full(data: Dict[str, Any]) -> None:
    """
    Ingests a company and its shareholders (as Investors/Funds), 
    and links them using the add_ownership method.
    """
    
    # -------------------------------------------------------
    # 1. Prepare Main Company Data
    # -------------------------------------------------------
    raw_id = data.get("organization_id")
    if not raw_id:
        print(f"Skipping {data.get('name')}: No organization_id")
        return
        
    # Apply standard formatting (Safe for UUIDs)
    company_id = format_org_id(raw_id)

    company_data = {
        "company_id": company_id,
        "name": data.get("name"),
        "country_code": data.get("country_code", "SE"),
        "description": data.get("description", ""),
        "mission": data.get("mission", ""),
        "year_founded": data.get("year_founded"),
        "num_employees": data.get("num_employees"),
        "website": data.get("website", ""),
        "sectors": data.get("sectors", []),
        "aliases": data.get("aliases", []),
        "key_people": data.get("key_people", []),
    }

    # -------------------------------------------------------
    # 2. Upsert Main Company
    # -------------------------------------------------------
    try:
        upsert_company(company_data)
        print(f"‚úÖ Upserted Company: {company_data['name']} ({company_id})")
    except Exception as e:
        print(f"‚ùå Error upserting company: {e}")
        return

    # -------------------------------------------------------
    # 3. Process Shareholders (Shareholder OWNS Company)
    # -------------------------------------------------------
    shareholder_items = data.get("shareholders", [])
    if shareholder_items:
        print(f"   Processing {len(shareholder_items)} shareholders...")
        for item in shareholder_items:
            process_related_entity(item, target_company_id=company_id) 

def process_related_entity(
    item: Dict[str, Any], 
    target_company_id: str, 
    relationship: str = "shareholder"
):
    """
    Merged Logic Processor for Related Entities (Shareholders/Investors).
    """
    name = item.get("name")
    if not name:
        return None

    # Use print or logger consistently; referencing logger here for safety
    logger = logging.getLogger(__name__)
    
    # --- Step A: Initial Setup & Agentic Enrichment ---
    # (Assuming generate_queries_funds, perform_search, structure_data_fund are defined globally or imported)
    
    initial_id = item.get("entity_id")
    if initial_id:
        initial_id = format_org_id(initial_id)
        
    investor_data = {
        "company_id": initial_id,
        "name": name,
        "country_code": "SE",
        "description": f"Ingested as {relationship} of {target_company_id}",
        "sectors": [],
        "website": "",
        "key_people": [],
        "investment_thesis": ""
    }

    found_org_id = None

    # ... [Enrichment Logic remains the same] ...
    try:
        # Placeholder for your enrichment functions (ensure these are imported!)
        if 'generate_queries_funds' in globals():
            queries = generate_queries_funds(name)
            if queries:
                raw_results = perform_search(queries, 20)
                if raw_results:
                    enriched_info = structure_data_fund(raw_results, name, initial_id)
                    if enriched_info:
                        clean_enriched = {k: v for k, v in enriched_info.items() if v}
                        investor_data.update(clean_enriched)
                        detected_id = clean_enriched.get("organization_id")
                        if detected_id:
                            formatted_id = format_org_id(detected_id)
                            # Basic check for Swedish ID length (digits only)
                            clean_digits = re.sub(r'\D', '', formatted_id)
                            if len(clean_digits) == 10:
                                found_org_id = formatted_id
                                print(f"      üîç Agents found official Swedish ID for {name}: {found_org_id}")
    except Exception as e:
        print(f"      ‚ö†Ô∏è Enrichment pipeline error for {name}: {e}")

    # --- Step B: Ingestion Branching ---

    final_entity_id = None

    # PATH 1: Valid Swedish Org ID Found -> Full Viral Ingestion
    if found_org_id:
        # Set the final ID immediately, so we don't lose it if viral ingestion fails
        final_entity_id = found_org_id
        investor_data["company_id"] = found_org_id

        # 1. Try Viral Ingestion (BONUS STEP)
        try:
            print(f"      üöÄ Valid ID found ({found_org_id}). Attempting viral ingestion...")
            # We assume this function is imported correctly
            ingest_company_with_portfolio(found_org_id, investor_data["name"])
        except Exception as e:
            # FIX: If viral ingestion fails, LOG IT but DO NOT RETURN. Continue to link!
            logger.warning(f"      ‚ö†Ô∏è Viral ingestion failed for {name} (Non-fatal): {e}")

        # 2. Upsert the Investor Node (ESSENTIAL STEP)
        try:
            # Ensure enriched data (sectors, etc.) is saved, overwriting/merging with whatever ingest_company did
            upsert_investor(investor_data)
        except Exception as e:
            logger.error(f"      ‚ùå Critical: Failed to upsert investor node {name}: {e}")
            return None

    # PATH 2: No Official ID -> Simple Insert with UUID
    else:
        print(f"      üìâ No official Swedish ID found for {name}. Falling back to simple upsert.")
        
        if not investor_data.get("company_id"):
            investor_data["company_id"] = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
        
        final_entity_id = investor_data["company_id"]
        
        try:
            upsert_investor(investor_data)
        except Exception as e:
            logger.error(f"      ‚ùå Failed to simple upsert '{name}': {e}")
            return None

    # --- Step C: Linking ---
    # This will now run even if Viral Ingestion crashed
    if final_entity_id:
        try:
            props = {"source": "llm_ingest"}
            if "ownership_pct" in item and item["ownership_pct"]:
                 props["share_percentage"] = float(item["ownership_pct"])

            if relationship == "shareholder":
                add_ownership(
                    owner_id=final_entity_id, 
                    company_id=target_company_id,
                    properties=props
                )
                print(f"      ‚Ü≥ Linked Shareholder: {name} ({final_entity_id}) -> {target_company_id}")
            
            return final_entity_id

        except Exception as e:
            print(f"      ‚ùå Failed to link '{name}': {e}")
            return None

In [None]:
### Run 
if __name__ == "__main__":
    famous_swedish_companies = [
        #"Spotify AB",
        #"IKEA",
        #"Volvo Group",
        #"H&M (Hennes & Mauritz)",
        "Ericsson",
        "Scania",
        #"Electrolux"
    ]
    
    for company in famous_swedish_companies:
        ingest_company_full(run_pipeline(company))

Starting Scraper for: Ericsson

‚úÖ Generated Queries:
[
  "Ericsson AB officiell hemsida",
  "Ericsson major shareholders investor relations"
]
üîé Executing 20 out of 27 search queries...
   -> Searching: 'Ericsson AB officiell hemsida'
   -> Searching: 'Ericsson major shareholders investor relations'
   -> Searching: 'Ericsson AB syfte och v√§rderingar'
   -> Searching: 'what is Ericsson company description'
   -> Searching: 'Ericsson antal anst√§llda'
   -> Searching: 'Ericsson purpose and values site:ericsson.com'
   -> Searching: 'Ericsson number of employees'
   -> Searching: 'Vad √§r Ericsson AB'
   -> Searching: 'Ericsson AB bransch'
   -> Searching: 'Ericsson business areas and solutions'
   -> Searching: 'Ericsson leadership team site:ericsson.com'
   -> Searching: 'Ericsson annual report number of employees'
   -> Searching: 'Ericsson major customers OR partners press release'
   -> Searching: 'Ericsson Wikipedia'
   -> Searching: 'Ericsson "about us" company profile'
   -

<IPython.core.display.JSON object>

‚úÖ Upserted Company: Telefonaktiebolaget LM Ericsson (556016-0680)
   Processing 7 shareholders...
üîé Executing 20 out of 16 search queries...
   -> Searching: 'Investor AB "our companies" approach'
   -> Searching: 'Investor AB investment strategy and criteria'
   -> Searching: 'Investor AB portfolio sectors'
   -> Searching: 'Investor AB about us description'
   -> Searching: 'Investor AB contact information'
   -> Searching: 'Investor AB'
   -> Searching: 'Investor AB management team'
   -> Searching: 'Investor AB investment thesis'
   -> Searching: 'Investor AB official website'
   -> Searching: 'Investor AB Crunchbase profile'
   -> Searching: 'Investor AB key executives linkedin'
   -> Searching: 'Investor AB board of directors'
   -> Searching: 'what industries does Investor AB invest in'
   -> Searching: 'Investor AB investment focus areas'
   -> Searching: 'Investor AB company overview'
   -> Searching: 'Investor AB homepage'
      üìâ No official Swedish ID found for Inve

In [11]:
org_id = structured_data.get('organization_id')
print(f"Extracted Organization ID: {org_id}")
print("1Ô∏è‚É£  Authenticating...")
# Call the Bolagsverket API to get more informaiton on the company 
token = get_access_token()
if token:
    print("‚úÖ Access Token received!")
    
    # Example: Search for Bolagsverket's own org number (202100-5489)
    # Remove hyphen for the API: 2021005489
    test_org_number = 9697802230
    
    print(f"2Ô∏è‚É£  Searching for company: {test_org_number}...")
    boglagsverket_api_data = search_company(test_org_number, token)
    
    if boglagsverket_api_data:
        print("‚úÖ Data received:")
        print(boglagsverket_api_data)
else:
    print("üõë Could not proceed without token.")

NameError: name 'structured_data' is not defined