Get relevant Vainu data from API and save the raw JSON file into lakehouse

In [None]:
from pyspark.sql import SparkSession
import time
import requests
import json
from azure.keyvault.secrets import SecretClient
import pandas as pd

# Initialize Spark session
spark = SparkSession.builder.appName("VainuAPI").getOrCreate()

# Microsoft Fabric Configuration
key_vault_url = 'your-keyvault-url'  # Replace with your Key Vault URL
secret_name = 'your-secret-name'  # Replace with your secret name

API_KEY = notebookutils.credentials.getSecret(key_vault_url, secret_name)

# Vainu API Endpoint
API_URL = "https://api.vainu.io/api/v2/companies/"


headers = {
    "API-Key": API_KEY,
    "Content-Type": "application/json"
}


# Query variables
cities = ["helsinki", "tampere", "vantaa", "espoo", "oulu", "turku", "lahti", "jyväskylä"]
queryfields = "company_name,business_id,city,staff_number,staff_number_estimate,career_page_job_count,description,foundation_date,lat,lng,status,vainu_custom_industry,official_industries,organization_size_indicators"
offset = 0              # beginning point of query
limit = 1000            # max limit
staff_number_gt = 9     # staff number more than
turn_over_gte = 500000  # turnover more than
all_results = []


# Loop to go through all companies
while True:
    params = {
        "country": "FI",
        "city": cities,
        "fields": queryfields,
        "limit": limit,
        "staff_number__gt": staff_number_gt,
        "turn_over__gte": turn_over_gte,
        "offset": offset
    }
    time.sleep(1)   
    response = requests.get(API_URL, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])

        if not results:
            print("No more data to fetch. Stopping...")
            break

        all_results.extend(results)

        print(f"Fetched {len(results)} companies, total so far: {len(all_results)}")

        if len(results) < limit:
            print("Reached the last page. Stopping...")
            break

        offset += limit
    else:
        print("Error:", response.status_code, response.text)
        break

# Save to JSON file
json_string = json.dumps(all_results, indent=4)
json_file_path = "your-json-file.json"  # Replace with your desired file name
mssparkutils.fs.put(f"your lakehouse path{json_file_path}", json_string, True)
print("Data saved to lakehouse")
    

StatementMeta(, 24136ad8-6b10-4a99-9846-8ee78b19a7d5, 3, Finished, Available, Finished)

Fetched 1000 companies, total so far: 1000
Fetched 1000 companies, total so far: 2000
Fetched 1000 companies, total so far: 3000
Fetched 1000 companies, total so far: 4000
Fetched 1000 companies, total so far: 5000
Fetched 1000 companies, total so far: 6000
Fetched 1000 companies, total so far: 7000
Fetched 1000 companies, total so far: 8000
Fetched 563 companies, total so far: 8563
Reached the last page. Stopping...
Data saved to lakehouse


In [2]:
print(response.headers)
response = requests.head("https://api.vainu.io/api/v2/companies/")
print(response.status_code)

StatementMeta(, 24136ad8-6b10-4a99-9846-8ee78b19a7d5, 4, Finished, Available, Finished)

{'Allow': 'GET, HEAD, OPTIONS', 'Content-Encoding': 'gzip', 'Content-Language': 'en', 'Content-Type': 'application/json', 'Date': 'Tue, 25 Mar 2025 08:59:10 GMT', 'Vary': 'Accept-Encoding, Accept, Accept-Language, Origin', 'transfer-encoding': 'chunked', 'Connection': 'keep-alive'}
401


Financial data separately due to response overload

In [None]:
from pyspark.sql import SparkSession
import time
import requests
import json
from azure.keyvault.secrets import SecretClient
import pandas as pd

#spark.conf.set("spark.network.timeout", "10000s")  # Increase Spark network timeout

# Initialize Spark session
spark = SparkSession.builder.appName("VainuAPI").getOrCreate()

# Microsoft Fabric Configuration
key_vault_url = 'your-keyvault-url'  # Replace with your Key Vault URL
secret_name = 'your-secret-name'  # Replace with your secret name

API_KEY = notebookutils.credentials.getSecret(key_vault_url, secret_name)

# Vainu API Endpoint
API_URL = "https://api.vainu.io/api/v2/companies/"


# Query variables for fina
cities = ["helsinki", "tampere", "vantaa", "espoo", "oulu", "turku", "lahti", "jyväskylä"]
queryfields_fina = "business_id,total_funding_usd,financial_statements.year,financial_statements.turn_over_eur,financial_statements.profit,financial_statements.employee_salary_local,financial_statements.net_income_local"
offset = 0              # beginning point of query
limit = 1000            # max limit
staff_number_gt = 9     # staff number more than
turn_over_gte = 500000  # turnover more than
all_results = []
i = 1


headers = {
    "API-Key": API_KEY,
    "Content-Type": "application/json"
}


# Financial data GET loop
while True:

    params_fina = {
    "country": "FI",
    "city": cities,
    "fields": queryfields_fina,
    "limit": limit,
    "staff_number__gt": staff_number_gt,
    "turn_over__gte": turn_over_gte,
    "offset": offset
    }


    time.sleep(1)   
    start_time = time.time()
    response = requests.get(API_URL, headers=headers, params=params_fina)
    end_time = time.time()
    duration = end_time - start_time
    print(f"API request {i} took {duration:.2f} seconds")
    i += 1

    if response.status_code == 200:
        data = response.json()
        results = data.get("results", [])

        if not results:
            print("No more data to fetch. Stopping...")
            break

        all_results.extend(results)

        print(f"Fetched {len(results)} companies, total so far: {len(all_results)}")

        if len(results) < limit:
            print("Reached the last page. Stopping...")
            break

        offset += limit
    else:
        print("Error:", response.status_code, response.text)
        break

# Save to JSON file
json_string = json.dumps(all_results, indent=4)
json_file_path = "your-json-file.json"  # Replace with your desired file name
mssparkutils.fs.put(f"your lakehouse path/{json_file_path}", json_string, True)
print("Data saved to lakehouse")


StatementMeta(, 24136ad8-6b10-4a99-9846-8ee78b19a7d5, 5, Finished, Available, Finished)

API request 1 took 0.87 seconds
Fetched 1000 companies, total so far: 1000
API request 2 took 1.92 seconds
Fetched 1000 companies, total so far: 2000
API request 3 took 4.47 seconds
Fetched 1000 companies, total so far: 3000
API request 4 took 7.23 seconds
Fetched 1000 companies, total so far: 4000
API request 5 took 12.17 seconds
Fetched 1000 companies, total so far: 5000
API request 6 took 15.36 seconds
Fetched 1000 companies, total so far: 6000
API request 7 took 23.65 seconds
Fetched 1000 companies, total so far: 7000
API request 8 took 19.75 seconds
Fetched 1000 companies, total so far: 8000
API request 9 took 24.36 seconds
Fetched 562 companies, total so far: 8562
Reached the last page. Stopping...
Data saved to lakehouse
