In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import altair as alt

In [3]:
# Download the OECD "Space economy in figures" country pages (report components 10-19).
# There is one report page per country; the raw responses are kept in a list.
OECD_REPORT_URL = 'https://www.oecd.org/en/publications/the-space-economy-in-figures_fa5494aa-en/full-report/component-{}.html'

oecd_pages = []

for component in range(10, 20):
    # requests has no default timeout, so set one to avoid hanging the kernel.
    response = requests.get(OECD_REPORT_URL.format(component), timeout=30)
    # Fail here on an HTTP error instead of parsing an error page further down.
    response.raise_for_status()
    oecd_pages.append(response)

In [4]:
# Parse the body of every downloaded report page into a BeautifulSoup tree
oecd_html = [
    BeautifulSoup(page.content, 'html.parser')
    for page in oecd_pages
]

In [5]:
# For every parsed page, store two strings back to back in one flat list:
# first the text of the title div (contains the country name), then the text
# of the first embedded table div (contains the table content for that page).
oecd_content = []

for soup in oecd_html:
    title_div = soup.find_all('div', class_='cmp-title cmp-title--has-copy-icon')[0]
    oecd_content.append(title_div.get_text())

    table_div = soup.find_all('div', class_='table embed-table')[0]
    oecd_content.append(table_div.get_text())

In [6]:
# Define functions that use regular expressions to snip the particular piece of information I'm looking for from the scraped text

# Country function
def country(html):
    """Return the country name found in a scraped heading, or None.

    The name is whatever follows a digit, one further character and a
    non-breaking space, up to the end of that line.
    """
    heading = re.search(r'[0-9].\xa0(.*)\n', html)
    return heading.group(1) if heading else None

# First Satellite function
def first_sat(html):
    """Return the text listed under 'First satellite in orbit', or None.

    Captures everything on the value line up to the last ' (' on that line.
    """
    hit = re.search(r'First satellite in orbit\n\n(.*) \(', html)
    if hit is None:
        return None
    return hit.group(1)

# First Satellite Launch year function
def first_sat_year(html):
    """Return the first four-digit run after 'First satellite in orbit'.

    The value is returned as a string; None is returned when the label or a
    four-digit run after it cannot be found.
    """
    year_match = re.search(r'First satellite in orbit[\s\S]*?(\d{4})', html)
    return year_match.group(1) if year_match else None

# First Successful Orbital Launch function
def first_orbit(html):
    """Return the parenthesised text after 'First successful orbital launch'.

    The capture runs from the first '(' after the label to the last ')' on
    the same line. Returns None when nothing matches.
    """
    hit = re.search(r'First successful orbital launch[\s\S]*?\((.*)\)', html)
    if hit is None:
        return None
    return hit.group(1)

# First Successful Orbital Launch Year function
def first_orbit_year(html):
    """Return the first four-digit run after 'First successful orbital launch'.

    The value is returned as a string; None is returned when there is no match.
    """
    year_match = re.search(r'First successful orbital launch[\s\S]*?(\d{4})', html)
    if not year_match:
        return None
    return year_match.group(1)

# Number of Satellites in Orbit function
def sats_in_orbit(html):
    """Return the satellite count listed after 'Number of satellites in orbit'.

    The count is the first line after the label that consists only of digits
    (optionally separated by whitespace). Separators are stripped before the
    value is converted with pd.to_numeric. Returns None when nothing matches.
    """
    found = re.search(r'Number of satellites in orbit[\s\S]*?\n(\d(?:\s*\d)*?)\n', html)
    if not found:
        return None
    digits_only = re.sub(r'\s+', '', found.group(1))
    return pd.to_numeric(digits_only)

# Number of Spaceports function
def spaceports(html):
    """Return the number listed under 'Number of spaceports', or None.

    The whole leading run of digits on the value line is captured. The earlier
    pattern captured a single digit, so a multi-digit count such as 19 was
    silently truncated to 1.

    Parameters
    ----------
    html : str
        Text extracted from a report table.

    Returns
    -------
    numpy integer or None
        The count converted with pd.to_numeric, or None when the label is not
        followed by a number.
    """
    sp = re.search(r'Number of spaceports\n\n(\d+)', html)
    if sp:
        return pd.to_numeric(sp.group(1))
    return None

# Workforce number function
def workforce(html):
    """Return the figure listed under 'Space-related workforce (<year>)'.

    The captured value starts and ends with a digit and may contain spaces or
    non-breaking spaces as separators; these are removed before conversion
    with pd.to_numeric. Returns None when nothing matches.
    """
    found = re.search(r'Space-related workforce \(\d{4}\)\n\n(\d[\d\xa0\s]*\d)', html)
    if found is None:
        return None
    compact = re.sub(r'[\s\xa0]', '', found.group(1))
    return pd.to_numeric(compact)

# Commercial Revenues from Space Activities function
def revenue(html):
    """Return the figure after 'Space-related commercial revenues (<year>)'.

    The text between 'USD' and the first letter 'n' is captured. If that text
    contains a 'b' it is scaled by 1,000,000,000, otherwise by 1,000,000.
    Whitespace and the letters of the unit word are stripped before the
    number is converted with pd.to_numeric. Returns None when nothing matches.
    """
    found = re.search(r'Space-related commercial revenues \(\d{4}\)\n\nUSD(.*?)[n]', html)
    if not found:
        return None

    amount_text = found.group(1)
    if re.search('b' , amount_text):
        return pd.to_numeric(re.sub(r'[\s\xa0billion]' , '' , amount_text)) * 1000000000
    return pd.to_numeric(re.sub(r'[\s\xa0million]' , '' , amount_text)) * 1000000

# Space project budget share of GDP function
def budget_gdp(html):
    """Return the value listed as the institutional space budget share of GDP.

    The value line must be followed by four newlines. Whitespace and percent
    signs are removed before conversion with pd.to_numeric. Returns None when
    nothing matches.
    """
    found = re.search(r'Institutional space budget as a share of gross domestic product\n\n(.*)\n\n\n\n', html)
    if found is None:
        return None
    cleaned = re.sub(r'[\s\xa0\%]', '', found.group(1))
    return pd.to_numeric(cleaned)

# Space project budget per capita function
def budget_percapita(html):
    """Return the value listed as the institutional space budget per capita.

    The value line must be followed by six newlines. Whitespace and percent
    signs are removed before conversion with pd.to_numeric. Returns None when
    nothing matches.
    """
    found = re.search(r'Institutional space budget per capita\n\n(.*)\n\n\n\n\n\n', html)
    if not found:
        return None
    cleaned = re.sub(r'[\s\xa0\%]', '', found.group(1))
    return pd.to_numeric(cleaned)

In [7]:
# Run each parsing function on the OECD data.
# oecd_content alternates heading text (even indexes, holds the country) and
# table text (odd indexes, holds all the other figures), so pair each heading
# with the table that follows it instead of relying on a hard-coded length.
oecd_list = []

for heading_text, table_text in zip(oecd_content[0::2], oecd_content[1::2]):
    # The order of the entries defines the column order of the final table.
    oecd_list.append([
        country(heading_text),
        first_sat(table_text),
        first_sat_year(table_text),
        sats_in_orbit(table_text),
        first_orbit(table_text),
        first_orbit_year(table_text),
        spaceports(table_text),
        workforce(table_text),
        revenue(table_text),
        budget_gdp(table_text),
        budget_percapita(table_text),
    ])
    

In [8]:
# Manual corrections applied as (row, column, value).
# Column positions follow the order in which each row was built
# (6 = spaceports, 7 = workforce); in the table displayed below, row 5 is
# The Netherlands and row 9 is the United States.
# NOTE(review): presumably these are values the regex parsers could not read
# correctly - confirm against the source pages.
for row_idx, col_idx, corrected_value in ((5, 7, 6000), (9, 6, 19)):
    oecd_list[row_idx][col_idx] = corrected_value

In [9]:
# Assemble the parsed rows into a table; column names are listed in the same
# order in which the values were appended to each row.
oecd_df = pd.DataFrame(
    oecd_list,
    columns=[
        'Country',
        'First satellite',
        'First satellite year',
        'Number of satellites',
        'First orbital launch',
        'First orbital launch year',
        'Spaceports',
        'Workforce',
        'Revenue',
        'budget % of GDP',
        'budget per capita',
    ],
)
oecd_df

Unnamed: 0,Country,First satellite,First satellite year,Number of satellites,First orbital launch,First orbital launch year,Spaceports,Workforce,Revenue,budget % of GDP,budget per capita
0,Canada,Alouette 1,1962,59,,,1.0,11600,3900000000.0,0.014,7.7
1,France,Astérix,1965,37,Diamant A,1965.0,1.0,32200,12300000000.0,0.097,39.4
2,Germany,Azur,1969,51,,,1.0,9200,2800000000.0,0.045,22.0
3,Italy,San Marco 1,1964,22,,,1.0,7000,2300000000.0,0.069,23.6
4,Korea,KITSAT-1,1992,21,Naro Space Centre,2013.0,1.0,9797,2800000000.0,0.034,10.1
5,The Netherlands,Astronomical Netherlands Satellite,1974,15,,,,6000,966000000.0,0.016,9.2
6,Norway,Thor 1,1992,9,,,1.0,2700,1300000000.0,0.031,32.5
7,Switzerland,SwissCube-1,2009,15,,,,1500,,0.036,32.9
8,United Kingdom,Ariel,1962,565,Black Arrow,1971.0,6.0,48800,21600000000.0,0.022,9.8
9,United States,Explorer 1,1958,4529,Juno 1,1958.0,19.0,360000,211600000000.0,0.243,186.1


In [121]:
# Directory holding the two World Bank CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single value.
WBANK_DATA_DIR = '/Users/sambickel-barlow/Desktop/PP434/Portfolio/CC6'

# NOTE(review): judging by the file names these are the total-population
# (SP.POP.TOTL) and GDP in current USD (NY.GDP.MKTP.CD) series - confirm.
wbank_pop = pd.read_csv(f'{WBANK_DATA_DIR}/API_SP.POP.TOTL_DS2_en_csv_v2_56.csv')
wbank_gdp = pd.read_csv(f'{WBANK_DATA_DIR}/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_2.csv')

In [124]:
# Align World Bank country names with the spellings used in the OECD table
wbank_to_oecd_names = {'Netherlands': 'The Netherlands', 'Korea, Rep.': 'Korea'}
wbank_pop['Country Name'] = wbank_pop['Country Name'].replace(wbank_to_oecd_names)
wbank_gdp['Country Name'] = wbank_gdp['Country Name'].replace(wbank_to_oecd_names)

# Keep only the countries present in the OECD table, and only the 2022 column
wbank_columns = ['Country Name', '2022']
wbank_pop_2022 = wbank_pop.loc[wbank_pop['Country Name'].isin(oecd_df['Country']), wbank_columns]
wbank_gdp_2022 = wbank_gdp.loc[wbank_gdp['Country Name'].isin(oecd_df['Country']), wbank_columns]

In [132]:
# Attach the 2022 population ('pop') and GDP ('gdp') to the OECD table.
# The World Bank columns are renamed before merging so that both frames share
# the 'Country' key. This avoids the leftover 'Country Name_x' and
# 'Country Name_y' columns that merging on differently named keys twice
# would otherwise leave in the result.
oecd_df_plus = (
    oecd_df
    .merge(
        wbank_pop_2022.rename(columns={'Country Name': 'Country', '2022': 'pop'}),
        on='Country',
    )
    .merge(
        wbank_gdp_2022.rename(columns={'Country Name': 'Country', '2022': 'gdp'}),
        on='Country',
    )
)

In [135]:
# Rows with no spaceport figure (missing values in 'Spaceports') are counted as 0
oecd_df_plus['Spaceports'] = oecd_df_plus['Spaceports'].fillna(0)

In [136]:
# Reconstruct the absolute budget two independent ways and store the gap
# between them, so the two estimates can be checked for rough agreement:
#   budget_a   = budget per capita * population
#   budget_b   = (budget % of GDP / 100) * GDP
#   budget_var = budget_a - budget_b
budget_from_population = oecd_df_plus['budget per capita'] * oecd_df_plus['pop']
budget_from_gdp = (oecd_df_plus['budget % of GDP'] / 100) * oecd_df_plus['gdp']

oecd_df_plus['budget_a'] = budget_from_population
oecd_df_plus['budget_b'] = budget_from_gdp
oecd_df_plus['budget_var'] = budget_from_population - budget_from_gdp


In [152]:
# Express workforce, satellites and spaceports per million units of budget_a
# (the population-based budget estimate)
for source_col, ratio_col in (
    ('Workforce', 'wf_per_budget_mil'),
    ('Number of satellites', 'sat_per_budget_mil'),
    ('Spaceports', 'sp_per_budget_mil'),
):
    oecd_df_plus[ratio_col] = (oecd_df_plus[source_col] / oecd_df_plus['budget_a'])*1000000

In [154]:
# Bar chart: satellites per million dollars of budget, one bar per country.
# The United Kingdom bar is drawn in red, every other bar in #002FA7.
is_uk = alt.datum.Country == 'United Kingdom'
highlight_color = alt.condition(is_uk, alt.value('red'), alt.value('#002FA7'))

# Countries on the x axis, ordered by the y value; labels rotated, no gridlines
sat_x = alt.X(
    'Country:N',
    title='',
    sort='y',
    axis=alt.Axis(labelAngle=-45, grid=False),
)
sat_y = alt.Y(
    'sat_per_budget_mil:Q',
    title='Satellites per Million Dollars of Budget',
    axis=alt.Axis(grid=False),
)
sat_tooltip = [
    'Country',
    'Number of satellites',
    # Show the budget_a field formatted as dollars
    alt.Tooltip('budget_a:Q', title='Public Space Budget', format='$,.0f'),
]

sat_bar_chart = (
    alt.Chart(oecd_df_plus)
    .mark_bar()
    .encode(x=sat_x, y=sat_y, color=highlight_color, tooltip=sat_tooltip)
    .properties(title='Satellites relative to budget')
)

sat_bar_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [150]:
# Bar chart: workforce per million dollars of budget, one bar per country.
# The United Kingdom bar is drawn in red, every other bar in #002FA7.
is_uk = alt.datum.Country == 'United Kingdom'
highlight_color = alt.condition(is_uk, alt.value('red'), alt.value('#002FA7'))

# Countries on the x axis, ordered by the y value; labels rotated, no gridlines
wf_x = alt.X(
    'Country:N',
    title='',
    sort='y',
    axis=alt.Axis(labelAngle=-45, grid=False),
)
wf_y = alt.Y(
    'wf_per_budget_mil:Q',
    title='Workforce per Million Dollars of Budget',
    axis=alt.Axis(grid=False),
)
wf_tooltip = [
    'Country',
    'Workforce',
    # Show the budget_a field formatted as dollars
    alt.Tooltip('budget_a:Q', title='Public Space Budget', format='$,.0f'),
]

wf_bar_chart = (
    alt.Chart(oecd_df_plus)
    .mark_bar()
    .encode(x=wf_x, y=wf_y, color=highlight_color, tooltip=wf_tooltip)
    .properties(title='Space Industry Workforce relative to budget')
)

wf_bar_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [155]:
# Bar chart: spaceports per million dollars of budget, one bar per country.
# The United Kingdom bar is drawn in red, every other bar in #002FA7.
is_uk = alt.datum.Country == 'United Kingdom'
highlight_color = alt.condition(is_uk, alt.value('red'), alt.value('#002FA7'))

# Countries on the x axis, ordered by the y value; labels rotated, no gridlines
sp_x = alt.X(
    'Country:N',
    title='',
    sort='y',
    axis=alt.Axis(labelAngle=-45, grid=False),
)
sp_y = alt.Y(
    'sp_per_budget_mil:Q',
    title='Spaceports per Million Dollars of Budget',
    axis=alt.Axis(grid=False),
)
sp_tooltip = [
    'Country',
    'Spaceports',
    # Show the budget_a field formatted as dollars
    alt.Tooltip('budget_a:Q', title='Public Space Budget', format='$,.0f'),
]

sp_bar_chart = (
    alt.Chart(oecd_df_plus)
    .mark_bar()
    .encode(x=sp_x, y=sp_y, color=highlight_color, tooltip=sp_tooltip)
    .properties(title='Spaceports relative to budget')
)

sp_bar_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [156]:
# Write each chart to its own JSON file in the working directory
for chart, out_path in (
    (sat_bar_chart, 'sat_bar_chart.json'),
    (wf_bar_chart, 'wf_bar_chart.json'),
    (sp_bar_chart, 'sp_bar_chart.json'),
):
    chart.save(out_path)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df