## OECD data on venture capital investments 

In [1]:
import requests
import pandas as pd
from io import StringIO



In [2]:
# Query URL for the OECD SDMX REST API: dataflow DSD_VC@DF_VC_INV, observations
# from 2007 onwards, returned as CSV containing both codes and readable labels.
url = "https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_VC@DF_VC_INV,1.0/...USD_EXC.A?startPeriod=2007&dimensionAtObservation=AllDimensions&format=csvfilewithlabels"

# Fetch data; the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()

# Load into pandas DataFrame
df = pd.read_csv(StringIO(response.text))

# Display first few rows
df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,REF_AREA,Reference area,MEASURE,Measure,BUSINESS_DEVELOPMENT_STAGE,Business development stage,...,OBS_VALUE,Observation value,OBS_STATUS,Observation status,UNIT_MULT,Unit multiplier,CURRENCY,Currency,DECIMALS,Decimals
0,DATAFLOW,OECD.SDD.TPS:DSD_VC@DF_VC_INV(1.0),Venture capital investments (market statistics),I,BEL,Belgium,VC_INV_MKT,Venture capital investments (market statistics),SEED,Seed,...,0.0,,A,Normal value,6,Millions,USD,US dollar,2,Two
1,DATAFLOW,OECD.SDD.TPS:DSD_VC@DF_VC_INV(1.0),Venture capital investments (market statistics),I,CZE,Czechia,VC_INV_MKT,Venture capital investments (market statistics),SEED,Seed,...,0.0,,A,Normal value,6,Millions,USD,US dollar,2,Two
2,DATAFLOW,OECD.SDD.TPS:DSD_VC@DF_VC_INV(1.0),Venture capital investments (market statistics),I,CZE,Czechia,VC_INV_MKT,Venture capital investments (market statistics),SEED,Seed,...,0.0,,A,Normal value,6,Millions,USD,US dollar,2,Two
3,DATAFLOW,OECD.SDD.TPS:DSD_VC@DF_VC_INV(1.0),Venture capital investments (market statistics),I,CZE,Czechia,VC_INV_MKT,Venture capital investments (market statistics),SEED,Seed,...,0.0,,A,Normal value,6,Millions,USD,US dollar,2,Two
4,DATAFLOW,OECD.SDD.TPS:DSD_VC@DF_VC_INV(1.0),Venture capital investments (market statistics),I,CZE,Czechia,VC_INV_MKT,Venture capital investments (market statistics),SEED,Seed,...,0.0,,A,Normal value,6,Millions,USD,US dollar,2,Two


In [3]:
# List all column names; coded columns (e.g. REF_AREA) come paired with a
# human-readable label column (e.g. "Reference area").
df.columns

Index(['STRUCTURE', 'STRUCTURE_ID', 'STRUCTURE_NAME', 'ACTION', 'REF_AREA',
       'Reference area', 'MEASURE', 'Measure', 'BUSINESS_DEVELOPMENT_STAGE',
       'Business development stage', 'UNIT_MEASURE', 'Unit of measure', 'FREQ',
       'Frequency of observation', 'TIME_PERIOD', 'Time period', 'OBS_VALUE',
       'Observation value', 'OBS_STATUS', 'Observation status', 'UNIT_MULT',
       'Unit multiplier', 'CURRENCY', 'Currency', 'DECIMALS', 'Decimals'],
      dtype='object')

In [4]:
# Columns kept for the analysis: country code and name, investment stage,
# year, unit/currency metadata and the observed value.
columns_to_retain = [
    "REF_AREA",
    "Reference area",
    "Business development stage",
    "TIME_PERIOD",
    "UNIT_MULT",
    "CURRENCY",
    "OBS_VALUE",
]

# .copy() yields an independent frame, so later modifications are not flagged
# by pandas as writes to a slice of the original (SettingWithCopyWarning).
df = df[columns_to_retain].copy()

In [5]:
# Preview the rows ordered by country code, then by year (df itself is not modified)
df.sort_values(["REF_AREA", "TIME_PERIOD"])

Unnamed: 0,REF_AREA,Reference area,Business development stage,TIME_PERIOD,UNIT_MULT,CURRENCY,OBS_VALUE
184,AUS,Australia,Start-up and other early stage,2007,6,USD,84.513701
185,AUS,Australia,Later stage venture,2007,6,USD,566.492828
186,AUS,Australia,Total,2007,6,USD,680.293455
187,AUS,Australia,Seed,2007,6,USD,29.286926
326,AUS,Australia,Seed,2008,6,USD,36.907230
...,...,...,...,...,...,...,...
1549,ZAF,South Africa,Total,2019,6,USD,484.523132
1550,ZAF,South Africa,Total,2020,6,USD,109.346801
1551,ZAF,South Africa,Total,2021,6,USD,40.586963
1552,ZAF,South Africa,Total,2022,6,USD,67.211825


In [6]:
# Check which unit multipliers occur in the data
df["UNIT_MULT"].unique()

array([6])

In [7]:
# Check which currencies occur in the data
df["CURRENCY"].unique()

array(['USD'], dtype=object)

!!! `UNIT_MULT` is always 6 (i.e. millions) and `CURRENCY` is always USD, so all values are in millions of USD.

In [8]:
# List the distinct reference areas (countries) present in the data
df["Reference area"].unique()

array(['Belgium', 'Czechia', 'Estonia', 'Greece', 'Hungary', 'Ireland',
       'Latvia', 'Lithuania', 'Luxembourg', 'Poland', 'Slovak Republic',
       'Slovenia', 'Bulgaria', 'Croatia', 'Romania', 'Portugal',
       'Australia', 'Israel', 'Canada', 'United States', 'Russia',
       'Austria', 'Denmark', 'Italy', 'Japan', 'Korea', 'New Zealand',
       'United Kingdom', 'Finland', 'Netherlands', 'Norway',
       'Switzerland', 'France', 'Germany', 'Spain', 'Sweden',
       'South Africa'], dtype=object)

!!! 37 different countries: 

- 'Belgium', 'Czechia', 'Estonia', 
- 'Greece', 'Hungary', 'Ireland',
- 'Latvia', 'Lithuania', 'Luxembourg', 
- 'Poland', 'Slovak Republic',
- 'Slovenia', 'Bulgaria', 'Croatia',
- 'Romania', 'Portugal', 'Australia',
- 'Israel', 'Canada', 'United States',
- 'Russia', 'Austria', 'Denmark',
- 'Italy', 'Japan', 'Korea',
- 'New Zealand', 'United Kingdom', 'Finland',
- 'Netherlands', 'Norway', 'Switzerland',
- 'France', 'Germany', 'Spain',
- 'Sweden', 'South Africa'

In [9]:
# List the distinct business development stages present in the data
df["Business development stage"].unique()

array(['Seed', 'Later stage venture', 'Start-up and other early stage',
       'Total'], dtype=object)

The investments are differentiated into:
- Seed
- Start-up and other early stage
- Later stage venture
- Total

In [10]:
# List the distinct years; unique() returns values in order of first
# appearance, so the result is not sorted chronologically.
df["TIME_PERIOD"].unique()

array([2015, 2007, 2008, 2009, 2010, 2011, 2012, 2014, 2013, 2017, 2016,
       2019, 2018, 2020, 2024, 2023, 2021, 2022])

!!! The time period is from 2007 to 2024 (the array above is in order of appearance, not sorted), but not all countries have data for all years.

Drop the unit multiplier and currency columns, as they are constant: all values are in millions of USD.

In [11]:
# Remove the constant metadata columns. Rebinding (instead of inplace=True)
# avoids mutating a frame derived by column selection, and errors="ignore"
# keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=["UNIT_MULT", "CURRENCY"], errors="ignore")

In [12]:
# Order rows by country code, then by year. Rebinding the name instead of
# sorting in place avoids mutating a derived frame and gives the same result
# each time the cell is re-run.
df = df.sort_values(by=["REF_AREA", "TIME_PERIOD"])

In [13]:
# Show the cleaned, sorted frame (pandas truncates the middle rows in the display)
df

Unnamed: 0,REF_AREA,Reference area,Business development stage,TIME_PERIOD,OBS_VALUE
184,AUS,Australia,Start-up and other early stage,2007,84.513701
185,AUS,Australia,Later stage venture,2007,566.492828
186,AUS,Australia,Total,2007,680.293455
187,AUS,Australia,Seed,2007,29.286926
326,AUS,Australia,Seed,2008,36.907230
...,...,...,...,...,...
1549,ZAF,South Africa,Total,2019,484.523132
1550,ZAF,South Africa,Total,2020,109.346801
1551,ZAF,South Africa,Total,2021,40.586963
1552,ZAF,South Africa,Total,2022,67.211825


In [15]:
# Persist the cleaned data to disk; the row index is left out of the file
df.to_csv(path_or_buf="oecd_data.csv", index=False)