In [1]:
import pandas as pd
import altair as alt
import numpy as np
from vega_datasets import data
import vegafusion as vf
vf.enable_widget()

alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
df = pd.read_csv('data/billionaires.csv', parse_dates=['birthDate'])

# Task 1

**Find Anomaly:**
> Does the trend that university dropouts (`education`) tend to have the highest median `finalWorth` in a majority of the decades (derived from `birthDate`) persist on a per `industry_sector` basis, or are there anomalies to this pattern?

In [3]:
cols_to_keep = list(set(df.columns).difference(['industry_sector',
                                                'education']))

# Change the the dataframe to allow selection of variables
df_wide = pd.melt(df, cols_to_keep)

select_box = alt.binding_select(name="Color by: ",
                                options=list(df_wide['variable'].unique()))
selection = alt.selection_point(value='education',
                                fields=['variable'], bind=select_box)

In [57]:
task1 = alt.Chart(df_wide.dropna()).mark_line(
    point=True
).encode(
    x=alt.X('decade:N', title="Decade the Billionaire was Born in"),
    y=alt.Y('median(finalWorth):Q',
            title="Median Final Worth (in Millions USD)"),
    color=alt.Color('value', legend=alt.Legend(
        orient='none',
        legendX=300, legendY=10,
        direction='vertical',
        titleAnchor='start', title="Color Legend")),
    tooltip=['decade', 'median(finalWorth)', 'value']
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width=500,
    height=300,
    title="Median Final Worth of Billionares " +
          "based on the Decade They Were Born in"
)

# Credit to https://github.com/altair-viz/altair/issues/965
task1

# Task 2

**Characterize Distribution:**

> For each `continent`, what is the distribution of the different `industry_sectors` and `gender` ratio in those industries?

In [72]:
country_ids = pd.read_csv('https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv')
relevant_country_names = country_ids["Country"]

mapping = {'South America': 'Americas',
           'North America': 'Americas'}

country_ids.Continent = country_ids.Continent.replace(mapping)

In [73]:
world = data.world_110m.url
world_map = alt.topo_feature(world, "countries")

In [74]:
wdf_og = pd.merge(df, country_ids,  how='right',
                  left_on=['country', 'continent'],
                  right_on=['Country', 'Continent'])
wdf_og = wdf_og.dropna(subset=['ID'])

In [75]:
group_gender = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['gender'].value_counts().unstack(fill_value=0).reset_index()
group_inds = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['industry_sector'].value_counts().unstack(fill_value=0).reset_index()

wdf = pd.merge(group_gender, group_inds,
               how='left',
               left_on=['ID', 'Country', 'Continent'],
               right_on=['ID', 'Country', 'Continent'])
wdf.ID = wdf.ID.apply(lambda x: int(x))
wdf = wdf.drop(['0_x', '0_y'], axis=1)

fields = wdf.columns.to_list()[1:]

In [76]:
color_range = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377']
cont_domain = ['Asia', 'Europe', 'Americas', 'Africa', 'Antarctica', 'Oceania']

click = alt.selection_point(encodings=['color'])

chloropleth = alt.Chart(world_map).mark_geoshape(
    stroke=None
).transform_lookup(
    lookup='id', from_=alt.LookupData(data=wdf, key='ID', fields=fields)
).encode(
    color = alt.condition(click,
                          alt.Color('Continent:O', scale=alt.Scale(domain=cont_domain, range=color_range)),
                          alt.value('lightgray')),
    tooltip = alt.Tooltip(['Country:O',
                           'Continent:O',
                           'Female:Q',
                           'Male:Q'])
).project(
    type="equalEarth"
).properties(
    width=600,
    height=300
).transform_filter(
    'isValid(datum.Continent)'
).add_params(
    click
)

In [77]:
gender_range = ['lightpink', 'lightblue']

barplot = alt.Chart(wdf_og).mark_bar().encode(
    y=alt.Y('industry_sector', title="Industry Sector"),
    x=alt.X('count()', title="Count of Billionaires"),
    color=alt.Color('gender',
                    scale=alt.Scale(domain=['Female', 'Male'],
                                    range=gender_range),
                    legend=alt.Legend(title="Gender")),
).transform_filter(
    'isValid(datum.gender)'
).properties(
    width=150,
    height=250
)

In [78]:
task2 = alt.hconcat(
    chloropleth,
    barplot.transform_filter(click),
    center=True,
    title="Billionaire Gender Ratio By Continent",
).resolve_scale(
    color='independent'
).configure_legend(
    orient='bottom',
    direction='horizontal',
)
task2

# Task 3

**Correlation:**

> Does the socio-economic status of a country (`economic_rating`, `GDP`, `life_expectancy`) correlate to the country’s median billionaire final net worth?

In [12]:
req = ['finalWorth', 'country', 'gdp_country', 'life_expectancy_country']

In [13]:
df['medianFinalWorth'] = df.groupby(['country'])['finalWorth'].transform('median')
req = ['medianFinalWorth', 'country', 'gdp_country', 'life_expectancy_country', 'economic_class', 'e_class']
task3df = df[req].drop_duplicates().reset_index().drop('index', axis=1)

In [54]:
splot_brush = alt.selection_interval(encodings=['x', 'y'], empty=True)
gdp_splot = alt.Chart(task3df).mark_point(size=40).encode(
    alt.X('gdp_country',
          title='Country GDP (billion USD)').scale(type='log'),
    alt.Y('medianFinalWorth',
          title='Median Final Net Worth (million USD)'),
    color=alt.condition(splot_brush,
                        'e_class:N',
                        alt.value('lightgray'),
                        title='Economic Class'),
    tooltip=[
             alt.Tooltip('country',
                         title='Country'),
             alt.Tooltip('gdp_country',
                         title='Country GDP (billion USD)'),
             alt.Tooltip('medianFinalWorth',
                         title='Median Final Net worth (million USD)'), 
             ]
).add_params(
    splot_brush
)

life_splot = alt.Chart(task3df).mark_point(size=40).encode(
    alt.X('life_expectancy_country',
          title='Country Life Expectancy (Years)').scale(zero=False),
    alt.Y('medianFinalWorth', title='Median Final Net Worth (million USD)'),
    alt.Color('e_class:N'),
    tooltip=[
             alt.Tooltip('country',
                         title='Country'),
             alt.Tooltip('life_expectancy_country',
                         title='Country Life Expectancy'),
             alt.Tooltip('medianFinalWorth',
                         title='Median Final Net worth (million USD)'),
             ]
)

task3 = (gdp_splot | life_splot.encode(
    color=alt.condition(splot_brush,
                        'e_class:N',
                        alt.value('lightgray')),
).add_params(splot_brush)).properties(
    title="Median Billionaire Networth and Country's Socio-Economic Status",
)
task3

# Task 4

**Sort:**
> What is the difference between the total final worth of the top 5 billionaires compared to the next 15 (6-20) billionaires in each `industry_sector`, `continent` and economic class?

In [79]:
df_top20_sector = df
df_top20_sector = df_top20_sector[df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False) <= 20]
df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')
# Warning is expected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')


In [103]:
eclass_ordered = ['Developed', 'Emerging', 'Developing']

task4 = alt.Chart(df_top20_sector).mark_bar().encode(
    alt.X('sum(finalWorth):Q').title('Final Net Worth (Billion USD)'),
    alt.Y('rank:N').title(''),
    alt.Color('e_class:N', scale=alt.Scale(range=['#8da0cb', '#2ca02c', '#fc8d62'])).title('Economic Class'),
    alt.Row('industry_sector:N',
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Industry Sector'),
    alt.Order('economic_class:N'),
    alt.Tooltip(['sum(finalWorth)', 'economic_class', 'count()'])
).properties(
    title='Total Final Net Worth of Top 20 Billionaires by Industry Sector'
)
task4

# Task 5

**Find Extremum:**
> Which countries have the greatest ratio of total billionaire net worth to `GDP`, which countries (that have billionaires) have the least?

In [48]:
gdf = df.groupby(['country',
                  'continent',
                  'gdp_country']).agg({'personName': 'count',
                                       'finalWorth': 'sum'}).rename(
    columns={'personName': 'num_billionaires',
             'finalWorth': 'totalWorth'}
).reset_index()

In [71]:
bind = alt.selection_interval(bind='scales')
color_range = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377']
cont_domain = ['Asia', 'Europe', 'Americas', 'Africa', 'Antarctica', 'Oceania']


task5 = alt.Chart(gdf).mark_circle().encode(
    y=alt.Y("gdp_country:Q",
            scale=alt.Scale(type='log'),
            title="Country GDP (per Billion USD)"),
    x=alt.X("totalWorth",
            scale=alt.Scale(type='log'),
            title="Total Net Worth of Country's Billionaires"),
    size=alt.Size("num_billionaires",
                  scale=alt.Scale(range=[25, 800]),
                  title="Number of Billionaires"),
    color=alt.Color("continent", title="Continent", scale=alt.Scale(domain=cont_domain,
                                                                    range=color_range)),
    tooltip=["country", 'num_billionaires', 'gdp_country:Q'],
).properties(
    height=500,
    width=500,
    title="Billionaire Net Worth to GDP Ratio",
).add_params(
    bind
)
task5

# Task 6

**Cluster:**
> Cluster billionaires based on their educational backgrounds. Do certain billionaires with specific education backgrounds cluster into certain industries?

**High Fidelity Drawing Task**

# Task 7

**Find Range:**

> What is the final net worth range between each `gender` based on `continent`, `industry_sector` and `education_level`?

In [34]:
task7_continent = alt.Chart(df.dropna()).mark_bar().encode(
    alt.X('min(finalWorth):Q',
          axis=alt.Axis(grid=False)).title('Final Net Worth Range (Billion USD)'),
    alt.X2('max(finalWorth):Q'),
    alt.Y('gender:N').title(''),
    alt.Color('gender:N',
              scale=alt.Scale(domain=['Female', 'Male'],
                              range=gender_range),).title('Gender'),
    alt.Row('continent:N').title('Continent')
).properties(
    width=150,
    height=30
)

In [35]:
task_7_education = task7_continent.encode(
    alt.Row('education:N').title('Education Level')
).properties(
    width=150,
    height=30
)

In [36]:
task_7_industry_sector = task7_continent.encode(
    alt.Row('industry_sector:N',
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Industry Sector')
).properties(
    width=150,
    height=30
)

In [37]:
task_7_economic_class = task7_continent.encode(
    alt.Row('economic_class:N').title('Economic Class')
).properties(
    width=300,
    height=64
)

In [55]:
task_7_plot = (task7_continent | task_7_economic_class) & \
              (task_7_education | task_7_industry_sector)
task_7_plot = task_7_plot.properties(
    title="Distribution of Range of Net Worth Compared to Various Relevant Attributes",
)
task_7_plot