In [1]:
import os
import json
import google.generativeai as genai
from tavily import TavilyClient
from dotenv import load_dotenv
from IPython.display import display, Markdown, JSON
import google.generativeai as genai
import urllib.parse
import json
import os
import requests
import base64
from dotenv import load_dotenv
import os
import sys
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Populate os.environ via python-dotenv before any key is read.
load_dotenv()

# --- CONFIGURATION ---
# Both keys are taken from the environment; nothing is hard-coded here.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")

# Only warn (do not abort) when either key is missing or empty.
if not (GEMINI_API_KEY and TAVILY_API_KEY):
    print("‚ö†Ô∏è WARNING: Please set GEMINI_API_KEY and TAVILY_API_KEY in your environment or this cell.")

# --- CLIENTS ---
# `model` and `tavily` are module-level handles used by the helper functions below.
genai.configure(api_key=GEMINI_API_KEY)
# TODO: experiment with different models
model = genai.GenerativeModel("gemini-2.5-pro")
tavily = TavilyClient(api_key=TAVILY_API_KEY)

In [3]:
### --- BOLAGSVERKET AUTHENTICATION & API CALLS ---
## Not needed at the moment

# Load environment variables from a .env file
load_dotenv()

# OAuth2 client credentials. When the variables are unset, the placeholder
# strings below are used as-is.
CLIENT_ID = os.getenv("BOLAGSVERKET_CLIENT_ID", "YOUR_CLIENT_ID")
CLIENT_SECRET = os.getenv("BOLAGSVERKET_CLIENT_SECRET", "YOUR_CLIENT_SECRET")

# Endpoints documented by Bolagsverket
TOKEN_URL = "https://portal.api.bolagsverket.se/oauth2/token"
# Base URL found in the documentation for "Värdefulla datamängder"
API_BASE_URL = "https://gw.api.bolagsverket.se/vardefulla-datamangder/v1"

def get_access_token():
    """Authenticates with Bolagsverket and returns an access token.

    Sends an OAuth2 ``client_credentials`` request to ``TOKEN_URL`` using an
    HTTP Basic ``Authorization`` header built from ``CLIENT_ID`` and
    ``CLIENT_SECRET``.

    Returns:
        str: The ``access_token`` field of the JSON response on success.
        None: If the HTTP request fails, or if the response body is not JSON
            or has no ``access_token`` field. The error is printed.
    """

    # Encode client_id:client_secret in base64 for Basic Auth header
    creds = f"{CLIENT_ID}:{CLIENT_SECRET}"
    creds_b64 = base64.b64encode(creds.encode("utf-8")).decode("utf-8")

    headers = {
        "Authorization": f"Basic {creds_b64}",
        "Content-Type": "application/x-www-form-urlencoded"
    }

    data = {
        "grant_type": "client_credentials",
        # needed for read / ping access
        "scope": "vardefulla-datamangder:read vardefulla-datamangder:ping"
    }

    # Pre-bind so the except branch can tell "no response at all" (connection
    # error, timeout) apart from "got an error response".
    response = None
    try:
        # Explicit timeout: requests waits indefinitely by default.
        response = requests.post(TOKEN_URL, headers=headers, data=data, timeout=30)
        response.raise_for_status()
        token_data = response.json()
        return token_data["access_token"]
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching token: {e}")
        if response is not None and response.content:
            print(f"Details: {response.content}")
        return None
    except (ValueError, KeyError) as e:
        # Body was not valid JSON, or the token field is missing.
        print(f"‚ùå Error fetching token: unexpected response format ({e!r})")
        return None


def search_company(org_number, token):
    """
    Fetches company information using the provided organization number and access token.
    Args:
        org_number (str or int): The 10-digit tax number of the organization. Dashes
            are removed and values shorter than 10 digits are left-padded with
            zeros before the request is sent (e.g. "556016-0680" -> "5560160680").
        token (str): The access token required for authentication.
    Returns:
        dict: The parsed JSON body returned by the API if the request is successful.
        None: If the request fails or the body is not valid JSON; the error is printed.
    Notes:
        - Relies on the module-level `API_BASE_URL`.
        - The identifier is always sent as a string so leading zeros survive.
    """

    url = f"{API_BASE_URL}/organisationer"

    # ---------------------------------------------------------
    # Use the tax id of the company -> without "-", zero-padded to 10 digits
    # ---------------------------------------------------------
    identifier = str(org_number).replace("-", "").strip().zfill(10)
    payload = {
        "identitetsbeteckning": identifier
    }

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    # Pre-bind so the except branch never touches an unassigned name when the
    # request itself fails (connection error, timeout).
    response = None
    try:
        # Explicit timeout: requests waits indefinitely by default.
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching data: {e}")
        # Print detailed error to debug schema issues
        if response is not None:
            print(f"Response: {response.text}")
        return None
    except ValueError as e:
        # Non-JSON body on requests versions whose JSON error is a plain ValueError.
        print(f"‚ùå Error fetching data: {e}")
        return None
    


In [4]:
def generate_queries(company_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries based on the entity name.

    Args:
        company_name: Name of the company to research.
        org_id: Optional organization ID; "Unknown" is sent when it is falsy.

    Returns:
        The de-duplicated query strings in the order the model produced them
        (the caller truncates this list, so order matters). If the prompt file
        is missing, the model call fails, or the response cannot be parsed, two
        generic fallback queries built from `company_name` are returned instead.
    """
    fallback_queries = [f"{company_name} official website", f"{company_name} Sweden annual report"]
    prompt_path = "../app/services/prompts/query_internet.xml"

    input_data = f"""
    <user>
        <input>
            <company_info>
                <name>{company_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
            </company_info>
        </input>
    </user>
    """

    try:
        # Explicit encoding so the read does not depend on the platform default.
        # NOTE(review): assumes the prompt file is UTF-8 — confirm.
        with open(prompt_path, "r", encoding="utf-8") as file:
            query_internet_prompt = file.read()
    except FileNotFoundError:
        print(f"Warning: Prompt file not found at {prompt_path}. Check your path.")
        return fallback_queries

    try:
        response = model.generate_content(
            query_internet_prompt + input_data,
            generation_config={"response_mime_type": "application/json"}
        )
        data = json.loads(response.text)
        # Extract just the query strings from the structured objects
        queries = [q["query"] for q in data.get("queries", [])]
        # Deduplicate while keeping first-seen order (a set would shuffle it).
        return list(dict.fromkeys(queries))
    except Exception as e:
        print(f"Error parsing Gemini response: {e}")
        return fallback_queries

def perform_search(queries: list[str], query_number: int = 20) -> str:
    """
    Step 2: Execute search using Tavily API.

    Args:
        queries: Candidate search queries; only the first `query_number` are run.
        query_number: Maximum number of queries to execute.

    Returns:
        One string made of "Source: <url> / Content: <content>" sections, one
        per search result, joined by newlines. Empty when nothing was retrieved.
        A failing query is reported on stdout and skipped.
    """
    aggregated_context = []

    # To save tokens/API credits, only the first `query_number` queries are run.
    selected_queries = queries[:query_number]

    # Report the number actually executed, which may be fewer than `query_number`.
    print(f"üîé Executing {len(selected_queries)} out of {len(queries)} search queries...")

    for query in selected_queries:
        try:
            print(f"   -> Searching: '{query}'")
            # Tavily 'search' returns structured results with content
            response = tavily.search(query=query, search_depth="advanced", max_results=5)

            for result in response.get("results", []):
                # Skip a malformed result rather than losing the rest of this query's results.
                if "url" not in result or "content" not in result:
                    continue
                aggregated_context.append(f"Source: {result['url']}\nContent: {result['content']}\n---")
        except Exception as e:
            print(f"   x Error searching '{query}': {e}")

    return "\n".join(aggregated_context)

def structure_data(company_name: str, search_context: str):
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.

    The <user><input> payload built from `company_name` and `search_context`
    is appended to the prompt template read from
    ../app/services/prompts/structure_data.xml and sent to the module-level
    Gemini `model`, requesting a JSON response.

    Returns the parsed JSON. If parsing fails, the error is printed and a dict
    of the form {"error": <raw response text>} is returned instead.
    """
    prompt_path = "../app/services/prompts/structure_data.xml"

    user_payload = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context}
                </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
            </entity_context>
        </input>
    </user>
    """

    with open(prompt_path, "r") as prompt_file:
        prompt_template = prompt_file.read()

    response = model.generate_content(
        prompt_template + user_payload,
        generation_config={"response_mime_type": "application/json"},
    )

    try:
        parsed = json.loads(response.text)
    except Exception as e:
        print(f"Error structuring data: {e}")
        return {"error": response.text}
    return parsed
    


def generate_queries_funds(fund_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries for a Fund/Investor.
    Uses the 'query_internet_funds.xml' prompt.

    Args:
        fund_name: Name of the fund/investor to research.
        org_id: Optional organization ID; "Unknown" is sent when it is falsy.

    Returns:
        The de-duplicated query strings in the order the model produced them
        (callers truncate this list, so order matters). If the prompt file is
        missing, or the model call / response parsing fails, a short list of
        generic fallback queries built from `fund_name` is returned.
    """
    # 1. Construct the Input XML strictly matching the prompt's expected structure
    input_data = f"""
    <user>
        <input>
            <investor_info>
                <name>{fund_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
                <country_code>SE</country_code>
            </investor_info>
            
            <missing_fields>
                <field>description</field>
                <field>investment_thesis</field>
                <field>sectors</field>
                <field>website</field>
                <field>key_people</field>
            </missing_fields>
        </input>
    </user>
    """

    # 2. Load the specific Investor Prompt
    # Note: Adjust path if running from a different directory (e.g. '../src/app/...' from notebooks)
    prompt_path = "../app/services/prompts/query_internet_funds.xml"

    try:
        # Explicit encoding so the read does not depend on the platform default.
        # NOTE(review): assumes the prompt file is UTF-8 — confirm.
        with open(prompt_path, "r", encoding="utf-8") as file:
            query_investor_prompt = file.read()
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Warning: Prompt file not found at {prompt_path}. Check your path.")
        return [f"{fund_name} investment thesis", f"{fund_name} portfolio sectors"]

    full_prompt = query_investor_prompt + input_data

    # 3. Call Gemini
    try:
        response = model.generate_content(
            full_prompt,
            generation_config={"response_mime_type": "application/json"}
        )

        # 4. Parse Response
        data = json.loads(response.text)

        # Extract just the query strings
        queries = [q["query"] for q in data.get("queries", [])]

        # Deduplicate while keeping first-seen order (a set would shuffle it
        # before the caller takes the first N).
        return list(dict.fromkeys(queries))

    except Exception as e:
        print(f"Error generating/parsing fund queries: {e}")
        # Fallback queries if LLM fails
        return [
            f"{fund_name} investment thesis",
            f"{fund_name} official website",
            f"{fund_name} portfolio"
        ]
        
    

In [5]:
def structure_data_fund(company_name: str, search_context: str, org_id: str = None) -> dict:
    """
    Step 3 (fund variant): structure raw search text into JSON via Gemini.

    Builds a <user><input> payload from the entity name, the optional
    organization number and the first 20000 characters of `search_context`,
    appends it to the template read from
    ../app/services/prompts/structure_data_fund.xml, and asks the module-level
    `model` for a JSON response.

    Returns the parsed JSON on success. On any failure (missing prompt file,
    model error, unparsable response) the error is printed and {} is returned.
    """
    prompt_path = "../app/services/prompts/structure_data_fund.xml"

    # Cap the web-search text that is embedded in the prompt.
    clipped_context = search_context[:20000]
    org_number_label = org_id or "Unknown"
    known_org_number = org_id or ""

    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Organization Number: {org_number_label}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {clipped_context} </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
                <known_org_number>{known_org_number}</known_org_number>
            </entity_context>
        </input>
    </user>
    """

    try:
        with open(prompt_path, "r") as prompt_file:
            prompt_template = prompt_file.read()

        response = model.generate_content(
            prompt_template + input_data,
            generation_config={"response_mime_type": "application/json"},
        )
        return json.loads(response.text)
    except Exception as e:
        # A missing prompt file gets its own message; everything else is reported generically.
        if isinstance(e, FileNotFoundError):
            print(f"‚ùå Error: Prompt file not found at {prompt_path}")
        else:
            print(f"‚ùå Error structuring data for {company_name}: {e}")
        return {}

In [6]:
def run_pipeline(company_name: str) -> dict:
    """
    Runs the data pipeline for the given company name.

    Steps: generate search queries, run the web search, then structure the
    gathered text with Gemini. Progress is printed along the way and the final
    JSON is displayed.

    Returns:
        The structured data produced by `structure_data`, or an empty dict
        when no search context could be gathered (structuring is skipped).
    """
    print(f"Starting Scraper for: {company_name}\n")

    # Bound up front so the return below is valid even when structuring is skipped.
    structured_data = {}

    # Generate Queries
    queries = generate_queries(company_name)
    print("‚úÖ Generated Queries:")
    # Only a two-item preview is printed; all queries are passed to the search.
    print(json.dumps(queries[:2], indent=2))

    # Scrape Web
    if queries:
        search_context = perform_search(queries, query_number=20)
        print(f"\n‚úÖ Retrieved {len(search_context)} characters of context.")
    else:
        search_context = ""
        print("‚ùå No queries generated.")

    # Structure Data
    if search_context:
        print("\nüß† Structuring data with Gemini...")
        structured_data = structure_data(company_name, search_context)

        print("\n‚ú® FINAL JSON OUTPUT:")
        display(JSON(structured_data))
    else:
        print("‚ùå Skipping structuring due to lack of context.")
    return structured_data


In [None]:
import sys
import os
import uuid
import json
from typing import Dict, Any, List

# 1. Make the parent of the current working directory importable so the
#    `app.*` imports that follow can be resolved.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from app.db.neo4j_client import get_driver
# Import the query functions
from app.db.queries.company_queries import upsert_company
from app.db.queries.investor_queries import upsert_investor
from app.db.queries.relationship_queries import add_ownership

# =============================================================================
# Main Ingestion Logic
# =============================================================================

def ingest_company_full(data: Dict[str, Any]) -> None:
    """
    Ingests a company, its shareholders (as Investors/Funds),
    and links them using the add_ownership method.

    Args:
        data: Structured company record. Keys read here: "organization_id",
            "name", "country_code", "description", "mission", "year_founded",
            "num_employees", "website", "sectors", "aliases", "key_people"
            and "shareholders".

    Returns:
        None. The record is skipped (with a printed message) when `data` is
        empty or has no usable "organization_id", and processing stops when
        the company upsert fails.
    """
    if not data:
        print("Skipping ingestion: no structured data provided")
        return

    # -------------------------------------------------------
    # 1. Prepare Main Company Data
    # -------------------------------------------------------
    # Normalize only after checking for a value: calling .replace() on a
    # missing ID would raise before the guard below could run.
    raw_company_id = data.get("organization_id")
    company_id = str(raw_company_id).replace("-", "").strip() if raw_company_id else ""
    if not company_id:
        print(f"Skipping {data.get('name')}: No organization_id")
        return

    company_data = {
        "company_id": company_id,
        "name": data.get("name"),
        "country_code": data.get("country_code", "SE"),
        "description": data.get("description", ""),
        "mission": data.get("mission", ""),
        "year_founded": data.get("year_founded"),
        "num_employees": data.get("num_employees"),
        "website": data.get("website", ""),
        "sectors": data.get("sectors", []),
        "aliases": data.get("aliases", []),
        "key_people": data.get("key_people", []),
    }

    # -------------------------------------------------------
    # 2. Upsert Main Company
    # -------------------------------------------------------
    try:
        upsert_company(company_data)
        print(f"‚úÖ Upserted Company: {company_data['name']}")
    except Exception as e:
        print(f"‚ùå Error upserting company: {e}")
        return

    # -------------------------------------------------------
    # 3. Process Shareholders (Shareholder OWNS Company)
    # -------------------------------------------------------
    # `or []` also covers an explicit null value for "shareholders".
    shareholder_items = data.get("shareholders") or []
    if shareholder_items:
        print(f"   Processing {len(shareholder_items)} shareholders...")
        for item in shareholder_items:
            if not isinstance(item, dict):
                # Entries are expected to be dicts; anything else cannot be processed.
                continue
            process_related_entity(item, main_company_id=company_id, relationship="shareholder")


def _normalize_org_id(value: Any) -> str:
    """Return `value` as a stripped, dash-free string; "" for None/empty input."""
    if value is None:
        return ""
    return str(value).replace("-", "").strip()


def _name_based_entity_id(name: Any) -> str:
    """Return a deterministic dash-free UUIDv5 hex ID derived from `name`."""
    return uuid.uuid5(uuid.NAMESPACE_DNS, str(name)).hex


def process_related_entity(item: Dict[str, Any], main_company_id: str, relationship: str):
    """
    Helper to upsert a related entity (Fund/Investor) and link it.
    Updates the entity's ID if an official Organization ID is found during the search.

    Args:
        item: Related-entity record; reads "name" and the optional "entity_id".
        main_company_id: ID of the company the entity is related to.
        relationship: Relationship label; only "shareholder" creates an
            ownership link.

    Returns:
        None. Items without a name are ignored. Enrichment and DB errors are
        printed and swallowed so that one entity cannot abort the batch.
    """
    name = item.get("name")
    if not name:
        return

    # A. Initial Setup
    # Start with the provided ID, or a deterministic name-based UUID when none is given.
    known_id = _normalize_org_id(item.get("entity_id"))
    initial_id = known_id or _name_based_entity_id(name)

    investor_data = {
        "company_id": initial_id,
        "name": name,
        "country_code": "SE",
        "description": f"Ingested as {relationship} of {main_company_id}",
        "sectors": [],
        "website": "",
        "key_people": [],
        "investment_thesis": ""
    }

    # B. Agentic Enrichment Loop
    try:
        # 1. Generate Queries
        queries = generate_queries_funds(name)

        # 2. Perform Search
        if queries:
            raw_results = perform_search(queries, query_number=7)

            # 3. Structure Data using structure_data_fund
            if raw_results:
                # Argument order is (company_name, search_context, org_id).
                # Only a genuinely provided ID is passed on, never the
                # generated placeholder.
                enriched_info = structure_data_fund(name, raw_results, known_id or None)

                if enriched_info:
                    # Filter empty values
                    clean_enriched = {k: v for k, v in enriched_info.items() if v}
                    investor_data.update(clean_enriched)

                    # CRITICAL: If search found an organization_id, use it as the definitive ID
                    found_org_id = _normalize_org_id(clean_enriched.get("organization_id"))
                    print(f"      üîç Found organization ID for {name}: {found_org_id or None}")
                    if found_org_id:
                        investor_data["company_id"] = found_org_id

                        # Preserve original name as alias if different
                        if "aliases" not in investor_data:
                            investor_data["aliases"] = []
                        if name != investor_data["name"] and name not in investor_data["aliases"]:
                            investor_data["aliases"].append(name)
                    else:
                        # If no official ID found, generate a UUID based on name
                        investor_data["company_id"] = _name_based_entity_id(name)
                print(f"      ‚ú® Enriched '{investor_data.get('name')}': {len(investor_data.get('sectors', []))} sectors")

    except Exception as e:
        print(f"      ‚ö†Ô∏è Enrichment pipeline error for {name}: {e}")

    # C. Upsert & Link using the FINAL ID
    try:
        # 1. Upsert using the full schema helper
        upsert_investor(investor_data)
        # 2. Link based on relationship direction
        if relationship == "shareholder":
            # (Fund)-[:OWNS]->(Company)
            add_ownership(
                owner_id=_normalize_org_id(investor_data["company_id"]),
                company_id=main_company_id,
                properties={"source": "llm_ingest"}
            )
            print(f"      ‚Ü≥ Linked Shareholder: {name}")

    except Exception as e:
        print(f"      ‚ùå Failed to DB sync '{name}': {e}")


In [9]:
### Run 
# End-to-end run: scrape and structure each company, then ingest the result.
if __name__ == "__main__":
    famous_swedish_companies = [
        "Spotify AB",
        "IKEA",
        "Volvo Group",
        "H&M (Hennes & Mauritz)",
        "Ericsson",
        "Scania",
        "Electrolux",
    ]

    for company in famous_swedish_companies:
        # Structure first, then hand the record to the graph ingestion.
        pipeline_result = run_pipeline(company)
        ingest_company_full(pipeline_result)

Starting Scraper for: Spotify AB

‚úÖ Generated Queries:
[
  "Spotify institutional ownership",
  "Spotify AB f\u00f6retagsbeskrivning"
]
üîé Executing 20 out of 26 search queries...
   -> Searching: 'Spotify institutional ownership'
   -> Searching: 'Spotify AB f√∂retagsbeskrivning'
   -> Searching: 'Spotify AB business model'
   -> Searching: 'Spotify AB aliases brand names'
   -> Searching: 'Spotify official website'
   -> Searching: 'Spotify AB Wikipedia'
   -> Searching: 'Spotify AB bransch kategori'
   -> Searching: 'what industry is Spotify in'
   -> Searching: 'Spotify mission statement'
   -> Searching: 'Spotify AB styrelse och ledning'
   -> Searching: 'Spotify st√∂rsta √§gare'
   -> Searching: 'Spotify AB Crunchbase profile'
   -> Searching: 'Spotify leadership team'
   -> Searching: 'Spotify for Brands advertisers case studies'
   -> Searching: 'Spotify AB organisationsnummer'
   -> Searching: 'Spotify AB about us'
   -> Searching: 'Spotify AB company registration number S

<IPython.core.display.JSON object>

‚úÖ Upserted Company: Spotify AB
   Processing 7 shareholders...
üîé Executing 7 out of 16 search queries...
   -> Searching: 'Spotify Technology S.A. "about us" overview'
   -> Searching: 'what industries does spotify invest in'
   -> Searching: 'Spotify Ventures team partners LinkedIn'
   -> Searching: 'Spotify Technology S.A. investment criteria'
   -> Searching: 'Spotify Ventures investment thesis'
   -> Searching: 'Spotify corporate investments focus areas'
   -> Searching: 'Spotify Technology S.A. corporate M&A team'
      üîç Found organization ID for Spotify Technology S.A.: None
      ‚ú® Enriched 'Spotify Technology S.A.': 0 sectors
      ‚Ü≥ Linked Shareholder: Spotify Technology S.A.
üîé Executing 7 out of 16 search queries...
   -> Searching: 'Prima Materia portfolio focus areas'
   -> Searching: 'what kind of companies does Daniel Ek invest in'
   -> Searching: 'site:linkedin.com "Prima Materia" investment'
   -> Searching: 'Daniel Ek investor contact page'
   -> Searc

<IPython.core.display.JSON object>

‚úÖ Upserted Company: IKEA Svenska F√∂rs√§ljnings Aktiebolag
   Processing 2 shareholders...
üîé Executing 7 out of 16 search queries...
   -> Searching: 'Stichting INGKA Foundation leadership LinkedIn'
   -> Searching: 'Stichting INGKA Foundation IKEA relationship overview'
   -> Searching: 'Stichting INGKA Foundation focus areas funding pillars'
   -> Searching: 'Stichting INGKA Foundation official website'
   -> Searching: 'Stichting INGKA Foundation trustees executive committee'
   -> Searching: 'what is the Stichting INGKA Foundation'
   -> Searching: 'INGKA Foundation homepage'
      üîç Found organization ID for Stichting INGKA Foundation: None
      ‚ú® Enriched 'Stichting INGKA Foundation': 0 sectors
      ‚Ü≥ Linked Shareholder: Stichting INGKA Foundation
üîé Executing 7 out of 16 search queries...
   -> Searching: 'Ingka Investments portfolio sectors'
   -> Searching: 'Ingka Group corporate profile overview'
   -> Searching: 'Ingka Group venture arm industries'
   -> Sear

KeyboardInterrupt: 

In [None]:
# Manual check of the Bolagsverket API: authenticate, then look up one
# organization number and print the raw response.
# NOTE(review): `structured_data` is not assigned at module level in any cell
# visible here (it is a local inside run_pipeline), so this cell appears to
# rely on leftover kernel state — confirm before a Restart & Run All.
org_id = structured_data.get('organization_id')
print(f"Extracted Organization ID: {org_id}")
print("1Ô∏è‚É£  Authenticating...")
# Call the Bolagsverket API to get more information on the company
token = get_access_token()
if token:
    print("‚úÖ Access Token received!")
    
    # Hard-coded organization number used for the lookup (digits only, no hyphen).
    # NOTE(review): the `org_id` extracted above is only printed; it is not
    # used for this lookup.
    test_org_number = 9697802230
    
    print(f"2Ô∏è‚É£  Searching for company: {test_org_number}...")
    boglagsverket_api_data = search_company(test_org_number, token)
    
    # search_company returns None on failure, in which case nothing is printed here.
    if boglagsverket_api_data:
        print("‚úÖ Data received:")
        print(boglagsverket_api_data)
else:
    print("üõë Could not proceed without token.")

Extracted Organization ID: 556016-0680
1Ô∏è‚É£  Authenticating...
‚úÖ Access Token received!
2Ô∏è‚É£  Searching for company: 9697802230...
‚úÖ Data received:
{'organisationer': [{'avregistreradOrganisation': {'avregistreringsdatum': '2018-01-31', 'dataproducent': 'Bolagsverket', 'fel': None}, 'avregistreringsorsak': {'dataproducent': 'Bolagsverket', 'fel': None, 'klartext': 'Anm√§lan om att verksamheten har upph√∂rt', 'kod': 'VERKUPP'}, 'juridiskForm': {'dataproducent': 'SCB', 'fel': {'felBeskrivning': 'Den efterfr√•gade informationen gick inte att hitta.', 'typ': 'ORGANISATION_FINNS_EJ'}, 'klartext': None, 'kod': None}, 'namnskyddslopnummer': None, 'naringsgrenOrganisation': {'dataproducent': 'SCB', 'fel': {'felBeskrivning': 'Den efterfr√•gade informationen gick inte att hitta.', 'typ': 'ORGANISATION_FINNS_EJ'}, 'sni': []}, 'organisationsdatum': {'dataproducent': 'Bolagsverket', 'fel': None, 'infortHosScb': None, 'registreringsdatum': '2016-07-07'}, 'organisationsform': {'dataproducen