# Analytical Part

### Country Analysis

Goal: Compare startup distribution by country, funding by country and potentially industries by country.

In [1]:
import psycopg2
from dotenv import load_dotenv
import os
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Read connection settings from a local .env file into the process
# environment. Without this call the os.getenv lookups below only see
# variables that were already exported before the kernel started.
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Open the PostgreSQL connection used by the pd.read_sql calls below.
# All settings come from the environment so that no credentials are
# stored in the notebook itself.
conn = psycopg2.connect(
    host=os.getenv("host"),
    port=os.getenv("port"),
    dbname=os.getenv("dbname"),
    user=os.getenv("user"),
    password=os.getenv("password"),
)

In [7]:
# Select each company's id and start year together with the country code and
# country name of its city (company -> city via city_id -> country via
# country_code). The INNER JOINs drop companies that have no matching city
# or country row.
query = """
SELECT
    c.id,
    c.start_year,
    t.country_code,
    r.country
FROM
    company c
INNER JOIN
    city t ON c.city_id = t.city_id
INNER JOIN
    country r ON t.country_code = r.country_code;
    """

In [8]:
# Execute the company/country query on the open connection and load the
# result set into a DataFrame.
df_all_by_country = pd.read_sql_query(sql=query, con=conn)

  df_all_by_country = pd.read_sql(query, conn)


In [9]:
# Preview the joined frame (one row per company with its country).
df_all_by_country

Unnamed: 0,id,start_year,country_code,country
0,3972,1941.0,AUT,Austria
1,3706,2025.0,AUT,Austria
2,3707,2022.0,AUT,Austria
3,3708,2017.0,AUT,Austria
4,3709,2001.0,AUT,Austria
...,...,...,...,...
37183,41235,2020.0,GBR,United Kingdom
37184,41236,2019.0,GBR,United Kingdom
37185,41237,2020.0,GBR,United Kingdom
37186,41238,2022.0,GBR,United Kingdom


In [10]:
# Earliest founding year present in the data; a very old value here means
# the early years need to be trimmed before plotting trends.
df_all_by_country.loc[:, "start_year"].min()

np.float64(1680.0)

In [14]:
# Count companies per founding year, ordered chronologically, to find the
# first year with enough data to serve as the start of the analysis window.
year_counts = (
    df_all_by_country["start_year"]
    .value_counts()
    .sort_index()
)

# Show the 20 most recent years.
year_counts.iloc[-20:]


start_year
2006.0     127
2007.0     181
2008.0     207
2009.0     213
2010.0     268
2011.0     324
2012.0     474
2013.0     576
2014.0    1459
2015.0    1708
2016.0    2538
2017.0    3071
2018.0    3649
2019.0    3424
2020.0    4628
2021.0    3395
2022.0    2103
2023.0    1636
2024.0    1387
2025.0     388
Name: count, dtype: int64

Based on the counts above, the analysis window will start in 2014 (the first year with more than 1,000 companies) and end in 2024.

In [15]:
# Keep only companies founded in the 2014-2024 window (both ends inclusive).
# Rows with a missing start_year do not satisfy the condition and are
# dropped. The copy makes the result independent of df_all_by_country.
df_country_year = df_all_by_country.loc[
    df_all_by_country["start_year"].between(2014, 2024)
].copy()

The second visualization will show investments per country and the trends across investment stages.

In [None]:
# Select the investment amount per business stage, country and year, joined
# to the country name via country_code.
# NOTE: this rebinds `query`, which an earlier cell also assigns for the
# company/country data. Run this cell immediately before the read_sql cell
# that follows, otherwise the earlier query is executed instead.
query = """
SELECT
    ci.id,
    ci.business_stage,
    ci.country_code,
    ci.year,
    ci.amount,
    c.country
FROM
    country_investments ci
INNER JOIN
    country c ON ci.country_code = c.country_code;
    """

In [7]:
# Execute the investment query on the open connection and load the result
# set into a DataFrame.
df_investment = pd.read_sql_query(sql=query, con=conn)

  df_investment = pd.read_sql(query, conn)


In [8]:
# Preview the investment frame (one row per business stage, country and year).
df_investment

Unnamed: 0,id,business_stage,country_code,year,amount,country
0,1,Later stage venture,AUT,2007,35.575349,Austria
1,2,Start-up and other early stage,AUT,2007,40.423894,Austria
2,3,Total,AUT,2007,85.421197,Austria
3,4,Seed,AUT,2007,9.421954,Austria
4,5,Seed,AUT,2008,7.758235,Austria
...,...,...,...,...,...,...
2007,2008,Start-up and other early stage,SWE,2023,211.772007,Sweden
2008,2009,Start-up and other early stage,SWE,2024,422.195261,Sweden
2009,2010,Seed,SWE,2024,79.697795,Sweden
2010,2011,Total,SWE,2024,652.608053,Sweden


In [11]:
# Print the first and last year covered by the investment data.
print(*df_investment["year"].agg(["min", "max"]))

2007 2024


In [12]:
# List the distinct business stages in order of first appearance.
# NOTE(review): 'Total' looks like an aggregate of the other stages (in the
# preview above, the AUT 2007 stage amounts add up to its Total), so it
# should presumably be excluded before summing amounts across stages;
# confirm against the data source.
pd.unique(df_investment["business_stage"])

array(['Later stage venture', 'Start-up and other early stage', 'Total',
       'Seed'], dtype=object)