In [108]:
import os
import json
import google.generativeai as genai
from tavily import TavilyClient
from dotenv import load_dotenv
from IPython.display import display, Markdown, JSON
import google.generativeai as genai
import urllib.parse
import json
import os
import requests
import base64
from dotenv import load_dotenv



In [None]:
# Load environment variables
load_dotenv()

# --- CONFIGURATION ---
# Replace these with your actual keys or ensure they are in your .env file
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") 
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

if not GEMINI_API_KEY or not TAVILY_API_KEY:
    print("‚ö†Ô∏è WARNING: Please set GEMINI_API_KEY and TAVILY_API_KEY in your environment or this cell.")

# Configure Clients
genai.configure(api_key=GEMINI_API_KEY)
#TODO change model
model = genai.GenerativeModel('gemini-2.5-pro') 
tavily = TavilyClient(api_key=TAVILY_API_KEY)

In [110]:
### --- BOLAGSVERKET AUTHENTICATION & API CAllS ---

# Load environment variables from a .env file
load_dotenv()

CLIENT_ID = os.getenv("BOLAGSVERKET_CLIENT_ID", "YOUR_CLIENT_ID")
CLIENT_SECRET = os.getenv("BOLAGSVERKET_CLIENT_SECRET", "YOUR_CLIENT_SECRET")

# Endpoints documented by Bolagsverket
TOKEN_URL = "https://portal.api.bolagsverket.se/oauth2/token"
# Base URL found in documentation for V√§rdefulla datam√§ngder
API_BASE_URL = "https://gw.api.bolagsverket.se/vardefulla-datamangder/v1"

def get_access_token():
    """Authenticates with Bolagsverket and returns an access token."""
    
    # Encode client_id:client_secret in base64 for Basic Auth header
    creds = f"{CLIENT_ID}:{CLIENT_SECRET}"
    creds_b64 = base64.b64encode(creds.encode("utf-8")).decode("utf-8")

    headers = {
        "Authorization": f"Basic {creds_b64}",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    
    data = {
        "grant_type": "client_credentials",
        # needed for read / ping access 
        "scope": "vardefulla-datamangder:read vardefulla-datamangder:ping" 
    }

    try:
        response = requests.post(TOKEN_URL, headers=headers, data=data)
        response.raise_for_status()
        token_data = response.json()
        return token_data["access_token"]
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching token: {e}")
        if response.content:
            print(f"Details: {response.content}")
        return None


def search_company(org_number, token):
    """
    Fetches company information using the provided organization number and access token.
    Args:
        org_number (str or int): The 10-digit tax number of the organization. If the input 
            is not 10 digits, it will be zero-padded to ensure the correct format.
        token (str): The access token required for authentication.
    Returns:
        dict: A dictionary containing the company information retrieved from the API, 
            if the request is successful.
        None: If the request fails, returns None and logs the error details.
    Notes:
        - The organization number should be provided without any dashes (e.g., "5560160680").
        - Ensure that the `API_BASE_URL` variable is defined and points to the correct API endpoint.
        - The function prints detailed error messages to help debug issues with the API response.
    """
    
    url = f"{API_BASE_URL}/organisationer"
    
    # ---------------------------------------------------------
    # Use tax id of comapny -> without - 
    # ---------------------------------------------------------
    payload = {
        "identitetsbeteckning": org_number
    }

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Error fetching data: {e}")
        # Print detailed error to debug schema issues
        print(f"Response: {response.text}")
        return None
    


In [111]:
def generate_queries(company_name: str, org_id: str = None) -> list[str]:
    """
    Ask Gemini to generate optimized search queries based on the entity name.
    """
    input_data = f"""
    <user>
        <input>
            <company_info>
                <name>{company_name}</name>
                <company_id>{org_id or "Unknown"}</company_id>
            </company_info>
        </input>
    </user>
    """
    
    with open("../src/app/services/prompts/query_internet.xml", "r") as file:
        QUERY_INTERNET_PROMPT = file.read()
    full_prompt = QUERY_INTERNET_PROMPT + input_data
    
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        data = json.loads(response.text)
        # Extract just the query strings from the structured objects
        queries = [q["query"] for q in data.get("queries", [])]
        # Deduplicate
        return list(set(queries))
    except Exception as e:
        print(f"Error parsing Gemini response: {e}")
        return [f"{company_name} official website", f"{company_name} Sweden annual report"]

def perform_search(queries: list[str], max_results: int = 5):
    """
    Step 2: Execute search using Tavily API.
    """
    aggregated_context = []
    
    print(f"üîé Executing {len(queries)} search queries...")
    
    # To save tokens/API credits, we might limit queries here
    # TODO test values 
    for query in queries[:3]: # Limit to top 3 queries for this demo
        try:
            print(f"   -> Searching: '{query}'")
            # Tavily 'search' returns structured results with content
            response = tavily.search(query=query, search_depth="advanced", max_results=5)
            
            for result in response.get("results", []):
                aggregated_context.append(f"Source: {result['url']}\nContent: {result['content']}\n---")
        except Exception as e:
            print(f"   x Error searching '{query}': {e}")
            
    return "\n".join(aggregated_context)

def structure_data(company_name: str, search_context: str):
    """
    Step 3: Structure the gathered raw text into the Northern Lights JSON schema.
    """
    input_data = f"""
    <user>
        <input>
            <source_data>
                <bolagsverket>
                    Legal Name: {company_name}
                    Registered: Sweden
                </bolagsverket>
                <web_search>
                    {search_context}
                </web_search>
            </source_data>
            
            <entity_context>
                <entity_name>{company_name}</entity_name>
                <entity_type>company</entity_type>
            </entity_context>
        </input>
    </user>
    """
    
    with open("../src/app/services/prompts/structure_data.xml", "r") as file:
        STRUCTURE_DATA_PROMPT = file.read()
    full_prompt = STRUCTURE_DATA_PROMPT + input_data
    #print(full_prompt)
    response = model.generate_content(
        full_prompt, 
        generation_config={"response_mime_type": "application/json"}
    )
    
    try:
        return json.loads(response.text)
    except Exception as e:
        print(f"Error structuring data: {e}")
        return {"error": response.text}

In [None]:
# --- RUN THE PIPELINE ---

# 1. Define Target
COMPANY_NAME = "Ericsson"
print(f"Starting Scraper for: {COMPANY_NAME}\n")

# TODO add call to api and get org data 
# 2. Generate Queries
queries = generate_queries(COMPANY_NAME)
print("‚úÖ Generated Queries:")
print(json.dumps(queries[:2], indent=2))

# 3. Scrape Web
if queries:
    search_context = perform_search(queries)
    print(f"\n‚úÖ Retrieved {len(search_context)} characters of context.")
else:
    search_context = ""
    print("‚ùå No queries generated.")


# 4. Structure Data
if search_context:
    print("\nüß† Structuring data with Gemini...")
    structured_data = structure_data(COMPANY_NAME, search_context)
    
    print("\n‚ú® FINAL JSON OUTPUT:")
    display(JSON(structured_data))
else:
    print("‚ùå Skipping structuring due to lack of context.")
    


Starting Scraper for: Ericsson

‚úÖ Generated Queries:
[
  "Ericsson Wikipedia",
  "Ericsson.com"
]
üîé Executing 34 search queries...
   -> Searching: 'Ericsson Wikipedia'
   -> Searching: 'Ericsson.com'
   -> Searching: 'Ericsson alternative names abbreviations'

‚úÖ Retrieved 17678 characters of context.

üß† Structuring data with Gemini...

‚ú® FINAL JSON OUTPUT:


<IPython.core.display.JSON object>

In [113]:
org_id = structured_data.get('organization_id')
print(f"Extracted Organization ID: {org_id}")
print("1Ô∏è‚É£  Authenticating...")
# Call the Bolagsverket API to get more informaiton on the company 
token = get_access_token()
if token:
    print("‚úÖ Access Token received!")
    
    # Example: Search for Bolagsverket's own org number (202100-5489)
    # Remove hyphen for the API: 2021005489
    test_org_number = 9697802230
    
    print(f"2Ô∏è‚É£  Searching for company: {test_org_number}...")
    boglagsverket_api_data = search_company(test_org_number, token)
    
    if boglagsverket_api_data:
        print("‚úÖ Data received:")
        print(boglagsverket_api_data)
else:
    print("üõë Could not proceed without token.")

Extracted Organization ID: 556016-0680
1Ô∏è‚É£  Authenticating...
‚úÖ Access Token received!
2Ô∏è‚É£  Searching for company: 9697802230...
‚úÖ Data received:
{'organisationer': [{'avregistreradOrganisation': {'avregistreringsdatum': '2018-01-31', 'dataproducent': 'Bolagsverket', 'fel': None}, 'avregistreringsorsak': {'dataproducent': 'Bolagsverket', 'fel': None, 'klartext': 'Anm√§lan om att verksamheten har upph√∂rt', 'kod': 'VERKUPP'}, 'juridiskForm': {'dataproducent': 'SCB', 'fel': {'felBeskrivning': 'Den efterfr√•gade informationen gick inte att hitta.', 'typ': 'ORGANISATION_FINNS_EJ'}, 'klartext': None, 'kod': None}, 'namnskyddslopnummer': None, 'naringsgrenOrganisation': {'dataproducent': 'SCB', 'fel': {'felBeskrivning': 'Den efterfr√•gade informationen gick inte att hitta.', 'typ': 'ORGANISATION_FINNS_EJ'}, 'sni': []}, 'organisationsdatum': {'dataproducent': 'Bolagsverket', 'fel': None, 'infortHosScb': None, 'registreringsdatum': '2016-07-07'}, 'organisationsform': {'dataproducen