In [1]:
import os
import json
import google.generativeai as genai
from tavily import TavilyClient
from dotenv import load_dotenv
from IPython.display import display, Markdown, JSON
import google.generativeai as genai
import urllib.parse
import json
import os
import requests
import base64
from dotenv import load_dotenv
import os
import sys
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables
load_dotenv()

# --- CONFIGURATION ---
# Replace these with your actual keys or ensure they are in your .env file
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

if not GEMINI_API_KEY or not TAVILY_API_KEY:
    print("‚ö†Ô∏è WARNING: Please set GEMINI_API_KEY and TAVILY_API_KEY in your environment or this cell.")

# Configure Clients
genai.configure(api_key=GEMINI_API_KEY)
#TODO experiment with different models
model = genai.GenerativeModel('gemini-2.5-pro') 
tavily = TavilyClient(api_key=TAVILY_API_KEY)

In [3]:
### --- BOLAGSVERKET AUTHENTICATION & API CAllS ---
## Not needed atm 

# Load environment variables from a .env file
load_dotenv()

CLIENT_ID = os.getenv("BOLAGSVERKET_CLIENT_ID", "YOUR_CLIENT_ID")
CLIENT_SECRET = os.getenv("BOLAGSVERKET_CLIENT_SECRET", "YOUR_CLIENT_SECRET")

# Endpoints documented by Bolagsverket
TOKEN_URL = "https://portal.api.bolagsverket.se/oauth2/token"
# Base URL found in documentation for V√§rdefulla datam√§ngder
API_BASE_URL = "https://gw.api.bolagsverket.se/vardefulla-datamangder/v1"

def get_access_token():
    """Authenticates with Bolagsverket and returns an access token."""
    
    # Encode client_id:client_secret in base64 for Basic Auth header
    creds = f"{CLIENT_ID}:{CLIENT_SECRET}"
    creds_b64 = base64.b64encode(creds.encode("utf-8")).decode("utf-8")

    headers = {
        "Authorization": f"Basic {creds_b64}",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    data = {
        "grant_type": "client_credentials",
        # needed for read / ping access 
        "scope": "vardefulla-datamangder:read vardefulla-datamangder:ping" 
    }

    try:
        response = requests.post(TOKEN_URL, headers=headers, data=data)
        response.raise_for_status()
        token_data = response.json()
        return token_data["access_token"]
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching token: {e}")
        if response.content:
            print(f"Details: {response.content}")
        return None


def search_company(org_number, token):
    """
    Fetches company information using the provided organization number and access token.
    Args:
        org_number (str or int): The 10-digit tax number of the organization. If the input 
            is not 10 digits, it will be zero-padded to ensure the correct format.
        token (str): The access token required for authentication.
    Returns:
        dict: A dictionary containing the company information retrieved from the API, 
            if the request is successful.
        None: If the request fails, returns None and logs the error details.
    Notes:
        - The organization number should be provided without any dashes (e.g., "5560160680").
        - Ensure that the `API_BASE_URL` variable is defined and points to the correct API endpoint.
        - The function prints detailed error messages to help debug issues with the API response.
    """
    
    url = f"{API_BASE_URL}/organisationer"
    
    # ---------------------------------------------------------
    # Use tax id of comapny -> without
    # ---------------------------------------------------------
    payload = {
        "identitetsbeteckning": org_number
    }

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching data: {e}")
        # Print detailed error to debug schema issues
        print(f"Response: {response.text}")
        return None
    


In [4]:
def generate_queries(company_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries based on the entity name.
    """
    input_data = f"""
    <user>
        <input>
            <company_info>
                <name>{company_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
            </company_info>
        </input>
    </user>
    """
    
    with open("../app/services/prompts/query_internet.xml", "r") as file:
        QUERY_INTERNET_PROMPT = file.read()
    full_prompt = QUERY_INTERNET_PROMPT + input_data
    
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        data = json.loads(response.text)
        # Extract just the query strings from the structured objects
        queries = [q["query"] for q in data.get("queries", [])]
        # Deduplicate
        return list(set(queries))
    except Exception as e:
        print(f"Error parsing Gemini response: {e}")
        return [f"{company_name} official website", f"{company_name} Sweden annual report"]

def perform_search(queries: list[str], query_number: int = 20):
    """
    Step 2: Execute search using Tavily API.
    """
    aggregated_context = []
    
    print(f"üîé Executing {query_number} out of {len(queries)} search queries...")
    
    # To save tokens/API credits, we might limit queries here
    for query in queries[:query_number]: # Limit to top k queries for this demo
        try:
            print(f"   -> Searching: '{query}'")
            # Tavily 'search' returns structured results with content
            response = tavily.search(query=query, search_depth="advanced", max_results=5)
            
            for result in response.get("results", []):
                aggregated_context.append(f"Source: {result['url']}\nContent: {result['content']}\n---")
        except Exception as e:
            print(f"   x Error searching '{query}': {e}")
            
    return "\n".join(aggregated_context)

def structure_data(company_name: str, search_context: str):
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.
    """
    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context}
                </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
            </entity_context>
        </input>
    </user>
    """
    
    with open("../app/services/prompts/structure_data.xml", "r") as file:
        STRUCTURE_DATA_PROMPT = file.read()
    full_prompt = STRUCTURE_DATA_PROMPT + input_data
    #print(full_prompt)
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        return json.loads(response.text)
    except Exception as e:
        print(f"Error structuring data: {e}")
        return {"error": response.text}
    


def generate_queries_funds(fund_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries for a Fund/Investor.
    Uses the 'query_investor_web.xml' prompt.
    """
    # 1. Construct the Input XML strictly matching the prompt's expected structure
    input_data = f"""
    <user>
        <input>
            <investor_info>
                <name>{fund_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
                <country_code>SE</country_code>
            </investor_info>
            
            <missing_fields>
                <field>description</field>
                <field>investment_thesis</field>
                <field>sectors</field>
                <field>website</field>
                <field>key_people</field>
            </missing_fields>
        </input>
    </user>
    """
    
    # 2. Load the specific Investor Prompt
    # Note: Adjust path if running from a different directory (e.g. '../src/app/...' from notebooks)
    prompt_path = "../app/services/prompts/query_internet_funds.xml"
    
    try:
        with open(prompt_path, "r") as file:
            QUERY_INVESTOR_PROMPT = file.read()
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Warning: Prompt file not found at {prompt_path}. Check your path.")
        return [f"{fund_name} investment thesis", f"{fund_name} portfolio sectors"]

    full_prompt = QUERY_INVESTOR_PROMPT + input_data
    
    # 3. Call Gemini
    try:
        response = model.generate_content(
            full_prompt, 
            generation_config={"response_mime_type": "application/json"}
        )
        
        # 4. Parse Response
        data = json.loads(response.text)
        
        # Extract just the query strings
        queries = [q["query"] for q in data.get("queries", [])]
        
        # Deduplicate and return
        return list(set(queries))
        
    except Exception as e:
        print(f"Error generating/parsing fund queries: {e}")
        # Fallback queries if LLM fails
        return [
            f"{fund_name} investment thesis", 
            f"{fund_name} official website", 
            f"{fund_name} portfolio"
        ]
        
    

In [5]:
def structure_data_fund(company_name: str, search_context: str, org_id: str = None) -> dict:
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.
    Uses 'structure_data.xml' to extract fields like description, mission, key_people, etc.
    """
    # 1. Define path to the generic company prompt
    prompt_path = "../app/services/prompts/structure_data_fund.xml"
    
    # 2. Construct Input XML (matching the prompt's expected <user><input>... structure)
    # We provide basic Bolagsverket info if we have it, otherwise just the name/ID context.
    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Organization Number: {org_id or "Unknown"}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context[:20000]} </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
                <known_org_number>{org_id or ""}</known_org_number>
            </entity_context>
        </input>
    </user>
    """
    
    try:
        # 3. Load the Prompt File
        with open(prompt_path, "r") as file:
            STRUCTURE_DATA_PROMPT = file.read()
            
        # 4. Combine and Generate
        full_prompt = STRUCTURE_DATA_PROMPT + input_data
        
        response = model.generate_content(
            full_prompt, 
            generation_config={"response_mime_type": "application/json"}
        )
        
        return json.loads(response.text)

    except FileNotFoundError:
        print(f"‚ùå Error: Prompt file not found at {prompt_path}")
        return {}
    except Exception as e:
        print(f"‚ùå Error structuring data for {company_name}: {e}")
        return {}

In [6]:
def run_pipeline(company_name: str):
    """
    Runs the data pipeline for the given company name.
    """
    print(f"Starting Scraper for: {company_name}\n")

    # Generate Queries
    queries = generate_queries(company_name)
    print("‚úÖ Generated Queries:")
    print(json.dumps(queries[:2], indent=2))

    # Scrape Web
    if queries:
        search_context = perform_search(queries, query_number=20)
        print(f"\n‚úÖ Retrieved {len(search_context)} characters of context.")
    else:
        search_context = ""
        print("‚ùå No queries generated.")

    # Structure Data
    if search_context:
        print("\nüß† Structuring data with Gemini...")
        structured_data = structure_data(company_name, search_context)
        
        print("\n‚ú® FINAL JSON OUTPUT:")
        display(JSON(structured_data))
    else:
        print("‚ùå Skipping structuring due to lack of context.")
    return structured_data


In [7]:
import re
import logging
import uuid
from typing import Any, Dict
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.services.portfolio_ingestion import ingest_company_with_portfolio
from app.services.portfolio_ingestion import lookup_org_number_from_web
from app.db.queries.relationship_queries import add_ownership

import re
# 1. Setup path to allow imports from 'src'
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from app.db.neo4j_client import get_driver
# Import the query functions
from app.db.queries.company_queries import upsert_company
from app.db.queries.investor_queries import upsert_investor
from app.db.queries.relationship_queries import add_ownership

# =============================================================================
# Helper: ID Formatting (Safe for UUIDs)
# =============================================================================
def format_org_id(org_id: str) -> str:
    """
    Ensures Swedish organization numbers are formatted as xxxxxx-xxxx.
    SAFEGUARD: If the ID is a UUID (longer than 20 chars) or doesn't match 
    the 10-digit pattern, it returns the original value unchanged.
    """
    if not org_id:
        return org_id
    
    s_id = str(org_id)
    
    # 1. Safety Check: UUIDs are 36 chars. Swedish Org IDs are max 13 chars.
    if len(s_id) > 20:
        return s_id
    
    # 2. Clean: Remove non-digits to check raw length
    clean_id = re.sub(r'\D', '', s_id)
    
    # 3. Format: Only applying if strictly 10 digits
    if len(clean_id) == 10:
        return f"{clean_id[:6]}-{clean_id[6:]}"
    
    # Return original otherwise
    return s_id

# =============================================================================
# Main Ingestion Logic
# =============================================================================

def ingest_company_full(data: Dict[str, Any]) -> None:
    """
    Ingests a company and its shareholders (as Investors/Funds), 
    and links them using the add_ownership method.
    """
    
    # -------------------------------------------------------
    # 1. Prepare Main Company Data
    # -------------------------------------------------------
    raw_id = data.get("organization_id")
    if not raw_id:
        print(f"Skipping {data.get('name')}: No organization_id")
        return
        
    # Apply standard formatting (Safe for UUIDs)
    company_id = format_org_id(raw_id)

    company_data = {
        "company_id": company_id,
        "name": data.get("name"),
        "country_code": data.get("country_code", "SE"),
        "description": data.get("description", ""),
        "mission": data.get("mission", ""),
        "year_founded": data.get("year_founded"),
        "num_employees": data.get("num_employees"),
        "website": data.get("website", ""),
        "sectors": data.get("sectors", []),
        "aliases": data.get("aliases", []),
        "key_people": data.get("key_people", []),
    }

    # -------------------------------------------------------
    # 2. Upsert Main Company
    # -------------------------------------------------------
    try:
        upsert_company(company_data)
        print(f"‚úÖ Upserted Company: {company_data['name']} ({company_id})")
    except Exception as e:
        print(f"‚ùå Error upserting company: {e}")
        return

    # -------------------------------------------------------
    # 3. Process Shareholders (Shareholder OWNS Company)
    # -------------------------------------------------------
    shareholder_items = data.get("shareholders", [])
    if shareholder_items:
        print(f"   Processing {len(shareholder_items)} shareholders...")
        for item in shareholder_items:
            process_related_entity(item, target_company_id=company_id) 

def process_related_entity(
    item: Dict[str, Any], 
    target_company_id: str, 
    relationship: str = "shareholder"
):
    """
    Merged Logic Processor for Related Entities (Shareholders/Investors).
    """
    name = item.get("name")
    if not name:
        return None

    # Use print or logger consistently; referencing logger here for safety
    logger = logging.getLogger(__name__)
    
    # --- Step A: Initial Setup & Agentic Enrichment ---
    # (Assuming generate_queries_funds, perform_search, structure_data_fund are defined globally or imported)
    
    initial_id = item.get("entity_id")
    if initial_id:
        initial_id = format_org_id(initial_id)
        
    investor_data = {
        "company_id": initial_id,
        "name": name,
        "country_code": "SE",
        "description": f"Ingested as {relationship} of {target_company_id}",
        "sectors": [],
        "website": "",
        "key_people": [],
        "investment_thesis": ""
    }

    found_org_id = None

    # ... [Enrichment Logic remains the same] ...
    try:
        # Placeholder for your enrichment functions (ensure these are imported!)
        if 'generate_queries_funds' in globals():
            queries = generate_queries_funds(name)
            if queries:
                raw_results = perform_search(queries, 20)
                if raw_results:
                    enriched_info = structure_data_fund(raw_results, name, initial_id)
                    if enriched_info:
                        clean_enriched = {k: v for k, v in enriched_info.items() if v}
                        investor_data.update(clean_enriched)
                        detected_id = clean_enriched.get("organization_id")
                        if detected_id:
                            formatted_id = format_org_id(detected_id)
                            # Basic check for Swedish ID length (digits only)
                            clean_digits = re.sub(r'\D', '', formatted_id)
                            if len(clean_digits) == 10:
                                found_org_id = formatted_id
                                print(f"      üîç Agents found official Swedish ID for {name}: {found_org_id}")
    except Exception as e:
        print(f"      ‚ö†Ô∏è Enrichment pipeline error for {name}: {e}")

    # --- Step B: Ingestion Branching ---

    final_entity_id = None

    # PATH 1: Valid Swedish Org ID Found -> Full Viral Ingestion
    if found_org_id:
        # Set the final ID immediately, so we don't lose it if viral ingestion fails
        final_entity_id = found_org_id
        investor_data["company_id"] = found_org_id

        # 1. Try Viral Ingestion (BONUS STEP)
        try:
            print(f"      üöÄ Valid ID found ({found_org_id}). Attempting viral ingestion...")
            # We assume this function is imported correctly
            ingest_company_with_portfolio(found_org_id, investor_data["name"])
        except Exception as e:
            # FIX: If viral ingestion fails, LOG IT but DO NOT RETURN. Continue to link!
            logger.warning(f"      ‚ö†Ô∏è Viral ingestion failed for {name} (Non-fatal): {e}")

        # 2. Upsert the Investor Node (ESSENTIAL STEP)
        try:
            # Ensure enriched data (sectors, etc.) is saved, overwriting/merging with whatever ingest_company did
            upsert_investor(investor_data)
        except Exception as e:
            logger.error(f"      ‚ùå Critical: Failed to upsert investor node {name}: {e}")
            return None

    # PATH 2: No Official ID -> Simple Insert with UUID
    else:
        print(f"      üìâ No official Swedish ID found for {name}. Falling back to simple upsert.")
        
        if not investor_data.get("company_id"):
            investor_data["company_id"] = str(uuid.uuid5(uuid.NAMESPACE_DNS, name))
        
        final_entity_id = investor_data["company_id"]
        
        try:
            upsert_investor(investor_data)
        except Exception as e:
            logger.error(f"      ‚ùå Failed to simple upsert '{name}': {e}")
            return None

    # --- Step C: Linking ---
    # This will now run even if Viral Ingestion crashed
    if final_entity_id:
        try:
            props = {"source": "llm_ingest"}
            if "ownership_pct" in item and item["ownership_pct"]:
                 props["share_percentage"] = float(item["ownership_pct"])

            if relationship == "shareholder":
                add_ownership(
                    owner_id=final_entity_id, 
                    company_id=target_company_id,
                    properties=props
                )
                print(f"      ‚Ü≥ Linked Shareholder: {name} ({final_entity_id}) -> {target_company_id}")
            
            return final_entity_id

        except Exception as e:
            print(f"      ‚ùå Failed to link '{name}': {e}")
            return None

BeautifulSoup4 not available - Allabolag scraping will be limited


In [16]:
### Run 
if __name__ == "__main__":
    famous_swedish_companies = [
        #"Spotify AB",
        "IKEA",
        #"Volvo Group",
        "H&M (Hennes & Mauritz)",
        #"Ericsson",
        #"Scania",
        #"Electrolux"
    ]
    
    for company in famous_swedish_companies:
        ingest_company_full(run_pipeline(company))

Starting Scraper for: IKEA

‚úÖ Generated Queries:
[
  "IKEA bransch och verksamhetsomr\u00e5den",
  "IKEA Wikipedia company profile"
]
üîé Executing 20 out of 23 search queries...
   -> Searching: 'IKEA bransch och verksamhetsomr√•den'
   -> Searching: 'IKEA Wikipedia company profile'
   -> Searching: 'IKEA "our vision is to create a better everyday life"'
   -> Searching: 'IKEA CEO board of directors leadership'
   -> Searching: 'IKEA ownership structure Stichting INGKA Foundation'
   -> Searching: '"IKEA of Sweden AB" organisationsnummer'
   -> Searching: 'IKEA mission statement and vision'
   -> Searching: 'IKEA officiell webbplats Sverige'
   -> Searching: '"Inter IKEA Group" management team'
   -> Searching: 'IKEA holding company name "Inter IKEA Group"'
   -> Searching: 'IKEA number of employees worldwide'
   -> Searching: 'IKEA official website corporate information'
   -> Searching: 'about IKEA company description'
   -> Searching: 'vad √§r IKEA f√∂retagsprofil'
   -> Searchi

<IPython.core.display.JSON object>

‚úÖ Upserted Company: IKEA of Sweden AB (556074-7551)
   Processing 2 shareholders...
üîé Executing 20 out of 16 search queries...
   -> Searching: 'Stichting Ingka Foundation'
   -> Searching: 'Ingka Foundation key people LinkedIn'
   -> Searching: 'Stichting Ingka Foundation purpose overview'
   -> Searching: 'Stichting Ingka Foundation leadership team'
   -> Searching: 'Stichting Ingka Foundation portfolio focus areas'
   -> Searching: 'Ingka Foundation areas of interest grants'
   -> Searching: 'Stichting Ingka Foundation about us mission'
   -> Searching: 'Stichting Ingka Foundation investment approach'
   -> Searching: 'Ingka Foundation contact information'
   -> Searching: 'site:ingka.com "Ingka Foundation"'
   -> Searching: 'Ingka Foundation philanthropic funding criteria'
   -> Searching: 'Stichting Ingka Foundation board members'
   -> Searching: 'What is Stichting Ingka Foundation?'
   -> Searching: 'Stichting Ingka Foundation key partnerships sectors'
   -> Searching: 'Sti

<IPython.core.display.JSON object>

‚úÖ Upserted Company: H & M Hennes & Mauritz AB (556042-7220)
   Processing 5 shareholders...
üîé Executing 20 out of 19 search queries...
   -> Searching: '"Ramsbury Invest AB" investeringsstrategi'
   -> Searching: 'contact "Ramsbury Invest AB"'
   -> Searching: 'Ramsbury Invest AB LinkedIn profile overview'
   -> Searching: '"Ramsbury Invest AB" about us'
   -> Searching: 'who are the partners at "Ramsbury Invest AB" LinkedIn'
   -> Searching: '"Ramsbury Invest AB" team partners'
   -> Searching: 'Ramsbury Invest AB'
   -> Searching: '"Ramsbury Invest AB" bolagsinformation'
   -> Searching: 'Ramsbury Invest investment criteria ticket size'
   -> Searching: '"Ramsbury Invest AB" investment thesis'
   -> Searching: '"Ramsbury Invest AB" portfolio companies sectors'
   -> Searching: '"Ramsbury Invest AB" styrelse VD'
   -> Searching: 'Ramsbury Invest AB official website'
   -> Searching: '"Ramsbury Invest AB" allabolag.se'
   -> Searching: 'what sectors does Ramsbury Invest focus on'


Error inside hack_net worker for 556423-5769: unindent does not match any outer indentation level (hack_net.py, line 523)
Traceback (most recent call last):
  File "/Users/davis/VSCode/northern-lights/app/services/portfolio_ingestion.py", line 103, in _hack_net_worker
    import hack_net
  File "/Users/davis/VSCode/northern-lights/data_pipeline/illegal/hack_net.py", line 523
    else:
         ^
IndentationError: unindent does not match any outer indentation level
Error extracting portfolio from FI for 556423-5769: unindent does not match any outer indentation level (hack_net.py, line 523)
No portfolio data extracted for 556423-5769
Gemini returned non-dict JSON: <class 'list'>, converting to dict
  warn(
Skipping self-ownership: Stefan Persson (556423-5769)
Skipping self-ownership: Karl-Johan Persson (556423-5769)


      ‚Ü≥ Linked Shareholder: Ramsbury Invest AB (556423-5769) -> 556042-7220
üîé Executing 20 out of 19 search queries...
   -> Searching: 'Lottie Tham Dagens Industri investments focus'
   -> Searching: 'Lottie Tham investment criteria'
   -> Searching: 'Lottie Tham family office team'
   -> Searching: 'Lottie Tham angel investment strategy'
   -> Searching: 'Lottie Tham'
   -> Searching: 'Lottie Tham holdingbolag namn'
   -> Searching: 'Lottie Tham's investment company name'
   -> Searching: 'Lottie Tham family office'
   -> Searching: 'Lottie Tham H&M investments biography'
   -> Searching: 'Pieter Tham investments'
   -> Searching: 'what industries does Lottie Tham invest in'
   -> Searching: 'Lottie Tham investor profile'
   -> Searching: 'Vem √§r Lottie Tham investerare'
   -> Searching: 'Lottie Tham bolagsengagemang'
   -> Searching: 'Lottie Tham portfolio companies sectors'
   -> Searching: 'Lottie Tham holdingbolag organisationsnummer'
   -> Searching: 'Lottie Tham investmen

Error inside hack_net worker for 556549-2922: unindent does not match any outer indentation level (hack_net.py, line 523)
Traceback (most recent call last):
  File "/Users/davis/VSCode/northern-lights/app/services/portfolio_ingestion.py", line 103, in _hack_net_worker
    import hack_net
  File "/Users/davis/VSCode/northern-lights/data_pipeline/illegal/hack_net.py", line 523
    else:
         ^
IndentationError: unindent does not match any outer indentation level
Error extracting portfolio from FI for 556549-2922: unindent does not match any outer indentation level (hack_net.py, line 523)
No portfolio data extracted for 556549-2922
Skipping self-ownership: AMF Tj√§nstepension AB (556549-2922)
Skipping self-ownership: AMF (556549-2922)


      ‚Ü≥ Linked Shareholder: AMF Fonder AB (556549-2922) -> 556042-7220
üîé Executing 20 out of 16 search queries...
   -> Searching: 'what industries does Vanguard's private equity arm invest in'
   -> Searching: '"Vanguard" investment criteria for private markets'
   -> Searching: 'The Vanguard Group, Inc.'
   -> Searching: '"Vanguard Group" venture capital investment strategy'
   -> Searching: '"The Vanguard Group, Inc." LinkedIn company profile'
   -> Searching: '"The Vanguard Group, Inc." Crunchbase profile overview'
   -> Searching: '"Vanguard" growth equity focus areas'
   -> Searching: 'Vanguard corporate headquarters site'
   -> Searching: '"The Vanguard Group, Inc." leadership team'
   -> Searching: 'Vanguard institutional investor portal'
   -> Searching: '"Vanguard Group" private equity investment thesis'
   -> Searching: '"The Vanguard Group, Inc." official website'
   -> Searching: '"The Vanguard Group, Inc." about us summary'
   -> Searching: '"Vanguard Group" private 

In [None]:
import os
import re
import json
import networkx as nx
from typing import List, Dict, Any
from neo4j import GraphDatabase
import google.generativeai as genai
# ==========================================
# 1. HELPERS
# ==========================================
# TODO fix duplicated node issues 

def clean_name(name: str) -> str:
    """
    STRICT Cleaning (Case Insensitive):
    1. Converts to LOWERCASE.
    2. Removes ALL spaces, underscores, and special characters.
    
    Logic:
    - \W matches any character that is NOT alphanumeric (a-z, A-Z, 0-9).
    - _ matches the underscore (which \W usually includes as a "word" char, so we explicitly remove it).
    
    Examples:
    - "Lannebo Fonder"      -> "lannebofonder"
    - "LANNEBO_FONDER"      -> "lannebofonder"
    - "Lannebo-Fonder, AB"  -> "lannebofonderab"
    """
    if not name:
        return ""
    
    # regex [\W_] removes anything that is NOT a letter or number
    cleaned = re.sub(r'[\W_]+', '', str(name))
    
    return cleaned.lower()

def generate_golden_record(model, node_properties_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Uses Gemini to merge conflicting properties into one JSON object.
    """
    # 1. Sanitize inputs
    sanitized_props = []
    for prop in node_properties_list:
        p_copy = prop.copy()
        # Remove metadata not needed for merging context
        if 'vector' in p_copy: del p_copy['vector']
        if 'elementId' in p_copy: del p_copy['elementId']
        sanitized_props.append(p_copy)

    # 2. Fallback if model is not available
    if not model:
        print("Gemini model not initialized, using first record as fallback.")
        return sanitized_props[0]

    # 3. Prompt
    prompt = f"""
    You are a Data Reconciliation Expert. 
    I have {len(sanitized_props)} records representing the SAME entity.
    Your task: Merge them into one single "Golden Record" JSON object.
    
    Rules:
    1. Name: Pick the most official/capitalized version.
    2. Description: Combine if complementary.
    3. Sectors: Merge into a unique list.
    4. Return ONLY valid raw JSON. No Markdown.
    
    Input Records:
    {json.dumps(sanitized_props, indent=2, default=str)}
    """
    
    try:
        response = model.generate_content(prompt)
        text = response.text.strip()
        
        # Robust Markdown Cleaning
        if text.startswith("```"):
            lines = text.splitlines()
            if len(lines) >= 3:
                # Remove first (```json) and last (```) lines
                text = "\n".join(lines[1:-1])
            else:
                text = text.replace("```json", "").replace("```", "")
        
        return json.loads(text)
    except Exception as e:
        print(f"Warning: Gemini merge failed ({e}). Using first record as fallback.")
        return sanitized_props[0]

# ==========================================
# 2. CORE LOGIC
# ==========================================

def fetch_and_cluster_duplicates(driver) -> List[List[str]]:
    """
    Scans ALL nodes. Matches if:
    1. Organization IDs are identical.
    2. OR Names are identical (after strict, case-insensitive cleaning).
    """
    print("Fetching nodes from Neo4j (Global Search)...")
    
    # Matches ANY node that has a name OR a company_id
    query = """
    MATCH (n) 
    WHERE n.name IS NOT NULL OR n.company_id IS NOT NULL
    RETURN elementId(n) as node_id, n.company_id as org_id, n.name as name
    """
    
    with driver.session() as session:
        result = session.run(query)
        nodes = list(result)

    print(f"Analyzing {len(nodes)} nodes for duplicates...")
    
    G = nx.Graph()
    seen_org_ids = {} # org_id -> node_id
    seen_names = {}   # clean_name -> node_id

    for record in nodes:
        current_node_id = record["node_id"]
        org_id = record["org_id"]
        raw_name = record["name"]
        
        G.add_node(current_node_id)
        
        # A. ID Match
        if org_id and str(org_id).strip(): 
            if org_id in seen_org_ids:
                G.add_edge(current_node_id, seen_org_ids[org_id])
            else:
                seen_org_ids[org_id] = current_node_id

        # B. Strict Case-Insensitive Name Match
        if raw_name:
            c_name = clean_name(raw_name)
            if c_name: # Only process if name didn't reduce to empty string
                if c_name in seen_names:
                    G.add_edge(current_node_id, seen_names[c_name])
                else:
                    seen_names[c_name] = current_node_id

    # Find connected groups larger than 1
    clusters = [list(c) for c in nx.connected_components(G) if len(c) > 1]
    return clusters

def get_node_properties(driver, node_ids: List[str]) -> List[Dict]:
    query = "MATCH (n) WHERE elementId(n) IN $ids RETURN [n IN collect(n) | properties(n)] as props"
    with driver.session() as session:
        record = session.run(query, ids=node_ids).single()
        return record["props"] if record else []

def execute_merge(tx, node_ids: List[str], golden_record: Dict, master_vector: List[float]):
    """
    Merges nodes using APOC and re-points their edges.
    """
    # Note: apoc.refactor.mergeNodes requires a list of nodes, not IDs.
    # We fetch them first inside the transaction query.
    query = """
    MATCH (n) WHERE elementId(n) IN $node_ids
    WITH collect(n) as nodes
    CALL apoc.refactor.mergeNodes(nodes, {
        properties: 'discard', 
        mergeRels: true,
        produceSelfRef: false
    })
    YIELD node
    SET node += $golden_props
    """
    params = {"node_ids": node_ids, "golden_props": golden_record}
    
    # Only set vector if it exists and is not None
    if master_vector:
        query += " SET node.vector = $vector"
        params["vector"] = master_vector
    
    tx.run(query, params)

def force_merge_edges_nuclear(batch_size=1000):
    """
    The 'Nuclear' Edge Deduplication.
    Identifies parallel edges (Same Start, Same End, Same Type).
    Keeps the FIRST edge found, DELETES the rest.
    """
    driver = get_driver()
    print("Starting Nuclear Edge Cleanup...")

    try:
        while True:
            query = """
            MATCH (s)-[r]->(e)
            WITH s, e, type(r) AS t, collect(r) AS rels
            WHERE size(rels) > 1
            WITH rels LIMIT $batch_size
            
            // Keep head (first), delete tail (duplicates)
            UNWIND tail(rels) as duplicate_edge
            DELETE duplicate_edge
            RETURN count(duplicate_edge) as deleted_count
            """
            
            with driver.session() as session:
                result = session.run(query, batch_size=batch_size)
                record = result.single()
                if not record:
                    break
                    
                deleted = record["deleted_count"]
                print(f" -> Deleted {deleted} duplicate edges in this batch...")
                
                if deleted == 0:
                    break
    finally:
        print("Edge cleanup complete.")
        driver.close()

def process_smart_deduplication():
    driver = get_driver()
    
    try:
        # 1. Cluster duplicates based on ID or Cleaned Name
        clusters = fetch_and_cluster_duplicates(driver)
        
        if not clusters:
            print("No node duplicates found.")
        else:
            print(f"Found {len(clusters)} groups of node duplicates.")
            for i, node_ids in enumerate(clusters):
                print(f"Processing Group {i+1} / {len(clusters)} (Size: {len(node_ids)})...")
                try:
                    # A. Get data
                    props_list = get_node_properties(driver, node_ids)
                    
                    # B. Determine best Vector (Most recent updated_at)
                    # Use .get with default 0/empty to avoid errors if updated_at is missing
                    sorted_props = sorted(props_list, key=lambda x: x.get('updated_at', ''))
                    master_vector = sorted_props[-1].get('vector', None)
                    
                    # C. Generate Golden Record (Pass model explicitly)
                    golden_record = generate_golden_record(model, props_list)
                    
                    # D. Merge in DB
                    with driver.session() as session:
                        session.write_transaction(execute_merge, node_ids, golden_record, master_vector)
                except Exception as e:
                    print(f"Error merging group {node_ids}: {e}")
    finally:
        driver.close()

# ==========================================
# 3. EXECUTION
# ==========================================

if __name__ == "__main__":
    # 1. Merge Duplicate Nodes
    process_smart_deduplication()
    
    # 2. Cleanup Duplicate Edges (Nuclear Delete)
    force_merge_edges_nuclear()

  - \W matches any character that is NOT alphanumeric (a-z, A-Z, 0-9).
  with driver.session() as session:


Fetching nodes from Neo4j (Global Search)...
Analyzing 65 nodes for duplicates...
No node duplicates found.
Starting Nuclear Edge Cleanup...


  with driver.session() as session:


 -> Deleted 0 duplicate edges in this batch...
Edge cleanup complete.


In [29]:
import json
from neo4j import GraphDatabase

SEARCH_TERM = "AMF Fonder AB"

def find_ghost_node_safe():
    driver = get_driver()
    
    print(f"Scanning nodes for property value: '{SEARCH_TERM}'...")
    print("(This handles Lists/Arrays safely by searching inside Python)\n")
    
    # 1. Fetch ALL properties for nodes that might be relevant
    # We limit to nodes that actually have some properties to check
    query = """
    MATCH (n)
    WHERE keys(n) IS NOT NULL
    RETURN 
        elementId(n) as id, 
        labels(n) as labels, 
        properties(n) as all_props
    LIMIT 50000 
    """
    
    found_count = 0
    
    with driver.session() as session:
        result = session.run(query)
        
        # 2. Iterate in Python (Type-Safe)
        for record in result:
            props = record["all_props"]
            node_id = record["id"]
            labels = record["labels"]
            
            # Check if search term exists in ANY value of the dictionary
            # We convert values to str() here in Python, which handles Lists/Ints/None perfectly
            match_found = False
            matching_keys = []
            
            for key, value in props.items():
                if SEARCH_TERM in str(value):
                    match_found = True
                    matching_keys.append(key)
            
            if match_found:
                found_count += 1
                print(f"--- MATCH #{found_count} (ID: {node_id}) ---")
                print(f"Labels: {labels}")
                print(f"Match found in key(s): {matching_keys}")
                print("Properties:")
                print(json.dumps(props, indent=4, default=str))
                print("\n")

    print(f"Search complete. Found {found_count} nodes.")
    driver.close()

if __name__ == "__main__":
    find_ghost_node_safe()

Scanning nodes for property value: 'AMF Fonder AB'...
(This handles Lists/Arrays safely by searching inside Python)



  with driver.session() as session:


--- MATCH #1 (ID: 4:b6ec1d34-ccf6-4be1-b199-4893023a567d:44) ---
Labels: ['Fund']
Match found in key(s): ['name']
Properties:
{
    "country_code": "SE",
    "mission": "",
    "website": "",
    "sectors": [],
    "aliases": [
        "AMF Fonder",
        "AMF Pension & Fonder",
        "AMF Pension & Funds"
    ],
    "company_id": "556549-2922",
    "updated_at": "2025-12-06T15:43:47.223000000+00:00",
    "investment_thesis": "",
    "year_founded": "1997",
    "name": "AMF Fonder AB",
    "description": "Ingested as shareholder of 556042-7220",
    "key_people": []
}


Search complete. Found 1 nodes.


In [None]:
SEARCH_ID = "556789-1234" 

def find_node_by_id_safe():
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    
    print(f"Scanning nodes for ID value: '{SEARCH_ID}'...")
    
    # We fetch ALL nodes (limited) to ensure we don't miss any due to label filtering
    query = """
    MATCH (n)
    WHERE keys(n) IS NOT NULL
    RETURN 
        elementId(n) as id, 
        labels(n) as labels, 
        properties(n) as all_props
    LIMIT 50000 
    """
    
    found_count = 0
    target_str = str(SEARCH_ID).strip()
    
    with driver.session() as session:
        result = session.run(query)
        
        for record in result:
            props = record["all_props"]
            node_id = record["id"]
            labels = record["labels"]
            
            match_found = False
            matching_keys = []
            
            # Iterate over every property key to find the ID
            for key, value in props.items():
                # Convert DB value to string and strip whitespace for comparison
                # This handles cases where DB has 12345 (int) vs "12345" (str)
                if str(value).strip() == target_str:
                    match_found = True
                    matching_keys.append(key)
            
            if match_found:
                found_count += 1
                print(f"--- MATCH #{found_count} (Node ID: {node_id}) ---")
                print(f"Labels: {labels}")
                print(f"ID found in property key(s): {matching_keys}")
                print("Properties:")
                print(json.dumps(props, indent=4, default=str))
                print("\n")

    print(f"Search complete. Found {found_count} nodes.")
    driver.close()

if __name__ == "__main__":
    find_node_by_id_safe()

In [10]:
""" org_id = structured_data.get('organization_id')
print(f"Extracted Organization ID: {org_id}")
print("1Ô∏è‚É£  Authenticating...")
# Call the Bolagsverket API to get more informaiton on the company 
token = get_access_token()
if token:
    print("‚úÖ Access Token received!")
    
    # Example: Search for Bolagsverket's own org number (202100-5489)
    # Remove hyphen for the API: 2021005489
    test_org_number = 9697802230
    
    print(f"2Ô∏è‚É£  Searching for company: {test_org_number}...")
    boglagsverket_api_data = search_company(test_org_number, token)
    
    if boglagsverket_api_data:
        print("‚úÖ Data received:")
        print(boglagsverket_api_data)
else:
    print("üõë Could not proceed without token.") """

' org_id = structured_data.get(\'organization_id\')\nprint(f"Extracted Organization ID: {org_id}")\nprint("1Ô∏è‚É£  Authenticating...")\n# Call the Bolagsverket API to get more informaiton on the company \ntoken = get_access_token()\nif token:\n    print("‚úÖ Access Token received!")\n\n    # Example: Search for Bolagsverket\'s own org number (202100-5489)\n    # Remove hyphen for the API: 2021005489\n    test_org_number = 9697802230\n\n    print(f"2Ô∏è‚É£  Searching for company: {test_org_number}...")\n    boglagsverket_api_data = search_company(test_org_number, token)\n\n    if boglagsverket_api_data:\n        print("‚úÖ Data received:")\n        print(boglagsverket_api_data)\nelse:\n    print("üõë Could not proceed without token.") '