In [1]:
import pandas as pd
import altair as alt
import numpy as np

alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
df = pd.read_csv('data/billionaires.csv', parse_dates=['birthDate'])
# df['education'] = df['education'].astype('Int64')

In [3]:
# Scale GDP to millions (matching finalWorth)
df['gdp_country'] = df['gdp_country']/1000000

In [4]:
# df = df.replace('', np.nan, regex=True)

# Task 1

In [14]:
cols_to_keep = list(set(df.columns).difference(['industry_sector', 'education']))

# Change the the dataframe to allow selection of variables
df_wide = pd.melt(df, cols_to_keep)

select_box = alt.binding_select(name="Color by: ", options=list(df_wide['variable'].unique()))
selection = alt.selection_point(value='education', fields=['variable'], bind=select_box)

viz1 = alt.Chart(df_wide.dropna()).mark_line(
    point=True
).encode(
    x=alt.X('decade:N', title="Decade the Billionaire was Born in"),
    y=alt.Y('median(finalWorth):Q', title="Median Final Worth (in Millions USD)"),
    color=alt.Color('value', legend=alt.Legend(
        orient='none',
        legendX=300, legendY=10,
        direction='vertical',
        titleAnchor='start', title="Color Legend")),
    tooltip=['decade', 'median(finalWorth)', 'value']
).add_selection(
    selection
).transform_filter(
    selection
).properties(
    width=500,
    height=300,
    title="Median Final Worth of Billionares (in millions USD) based on the Decade They Were Born in"
)

# credit to https://github.com/altair-viz/altair/issues/965
viz1

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [6]:
# df['industries'].value_counts(dropna=False)

# Task 2

In [11]:
gdf = df.groupby(['country', 'continent', 'gdp_country']).agg({'personName':'count', 'finalWorth': 'sum'}).rename(
    columns = {'personName':'num_billionaires', 'finalWorth':'totalWorth'}
).reset_index()

In [12]:
gdf.head()

Unnamed: 0,country,continent,gdp_country,num_billionaires,totalWorth
0,Argentina,Americas,449663.0,4,11000
1,Australia,Oceania,1392680.0,43,173500
2,Austria,Europe,446315.0,11,75400
3,Belgium,Europe,529607.0,3,41200
4,Brazil,Americas,1839760.0,44,104800


In [13]:
bind = alt.selection_interval(bind='scales')

# input_checkbox = alt.binding_checkbox()
# scale_select = alt.selection_point(bind=input_checkbox)
# type_checkbox_condition = alt.condition(scale_select,
#                                         alt.Scale(type='log'),
#                                         alt.Scale(type='linear')
#                                        )

alt.Chart(gdf).mark_circle().encode(
    y=alt.Y("gdp_country:Q",
            scale=alt.Scale(type='log'),
            # scale=type_checkbox_condition,
            title="Country GDP (per million USD)"),
    x=alt.X("totalWorth",
            scale=alt.Scale(type='log'),
            # scale=type_checkbox_condition,
            title="Total Net Worth of Country's Billionaires"),
    size=alt.Size("num_billionaires", scale=alt.Scale(range=[25, 800])),
    color="continent",
    tooltip=["country", 'num_billionaires', 'gdp_country:Q'],
).properties(
    height=500,
    width=500
).add_params(
    bind
    # type_checkbox_condition
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


# Choropleth (Task 2)

In [15]:
country_ids = pd.read_csv('https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv')
relevant_country_names = country_ids["Country"]

mapping = {'South America': 'Americas',
           'North America': 'Americas'}

country_ids.Continent = country_ids.Continent.replace(mapping)
country_ids

Unnamed: 0,ID,Country,Continent
0,4,Afghanistan,Asia
1,8,Albania,Europe
2,12,Algeria,Africa
3,24,Angola,Africa
4,10,Antarctica,Antarctica
...,...,...,...
169,704,Vietnam,Asia
170,732,Western Sahara,Africa
171,887,Yemen,Asia
172,894,Zambia,Africa


In [16]:
from vega_datasets import data
import vegafusion as vf
vf.enable_widget()
alt.renderers.enable('default')

world = data.world_110m.url
world_map = alt.topo_feature(world, "countries")

# # Create a chart using the data above and geoshape mark
# country_map = alt.Chart(world_map).mark_geoshape()
# # Show the chart
# country_map

In [17]:
wdf_og = pd.merge(df, country_ids,  how='right', left_on=['country', 'continent'], right_on = ['Country', 'Continent'])
wdf_og = wdf_og.dropna(subset=['ID'])
wdf_og

Unnamed: 0,finalWorth,personName,age,country,industries,status,gender,birthDate,gdp_country,life_expectancy_country,total_tax_rate_country,economic_class,education,continent,industry_sector,year,decade,ID,Country,Continent
0,,,,,,,,NaT,,,,,,,,,,4,Afghanistan,Asia
1,,,,,,,,NaT,,,,,,,,,,8,Albania,Europe
2,,,,,,,,NaT,,,,,,,,,,12,Algeria,Africa
3,,,,,,,,NaT,,,,,,,,,,24,Angola,Africa
4,,,,,,,,NaT,,,,,,,,,,10,Antarctica,Antarctica
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2597,1300.0,Nguyen Dang Quang,59.0,Vietnam,Food & Beverage,D,M,1963-08-23,261921.0,75.3,37.6,3.0,,Asia,Consumer Discretionary & Staples,1963.0,1960.0,704,Vietnam,Asia
2598,,,,,,,,NaT,,,,,,,,,,732,Western Sahara,Africa
2599,,,,,,,,NaT,,,,,,,,,,887,Yemen,Asia
2600,,,,,,,,NaT,,,,,,,,,,894,Zambia,Africa


In [18]:
group_gender = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['gender'].value_counts().unstack(fill_value=0).reset_index()
group_inds = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['industry_sector'].value_counts().unstack(fill_value=0).reset_index()

wdf = pd.merge(group_gender, group_inds,  how='left', left_on=['ID', 'Country','Continent'], right_on = ['ID', 'Country', 'Continent'])
wdf.ID = wdf.ID.apply(lambda x: int(x))
wdf = wdf.drop(['0_x', '0_y'], axis=1)
wdf

Unnamed: 0,ID,Country,Continent,F,M,Consumer Discretionary & Staples,Diversified,Energy & Industrials & Materials,Financials,Service,Technology
0,4,Afghanistan,Asia,0,0,0,0,0,0,0,0
1,8,Albania,Europe,0,0,0,0,0,0,0,0
2,10,Antarctica,Antarctica,0,0,0,0,0,0,0,0
3,12,Algeria,Africa,0,0,0,0,0,0,0,0
4,24,Angola,Africa,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
169,858,Uruguay,Americas,0,0,0,0,0,0,0,0
170,860,Uzbekistan,Asia,0,0,0,0,0,0,0,0
171,862,Venezuela,Americas,0,0,0,0,0,0,0,0
172,887,Yemen,Asia,0,0,0,0,0,0,0,0


In [19]:
wdf.Continent.unique()

array(['Asia', 'Europe', 'Antarctica', 'Africa', 'Americas', 'Oceania'],
      dtype=object)

In [20]:
fields=wdf.columns.to_list()[1:]
# fields=['Country', 'Continent']
fields

['Country',
 'Continent',
 'F',
 'M',
 'Consumer Discretionary & Staples',
 'Diversified',
 'Energy & Industrials & Materials',
 'Financials',
 'Service',
 'Technology']

In [21]:
color_range =  ['#7fc97f','#beaed4', '#fdc086', '#795227','#386cb0', '#f0027f'] 

# sel_hover = alt.selection_point(on='mouseover', empty='none')
click = alt.selection_multi(encodings=['color'])

choropleth = alt.Chart(world_map).mark_geoshape(
    stroke=None
).transform_lookup(
    lookup='id', from_=alt.LookupData(data=wdf, key='ID', fields=fields)
).encode(
    # alt.Color('Continent:O', scale=alt.Scale(range=color_range)),
    color = alt.condition(click, alt.Color('Continent:O', scale=alt.Scale(range=color_range)), alt.value('lightgray')),
    tooltip = alt.Tooltip(['Country:O', 'Continent:O', 'F:Q', 'M:Q'])
).project(
    type="equalEarth"
).properties(
    width=600,
    height=300
).transform_filter(
    'isValid(datum.Continent)'
).add_params(
    click
)

choropleth

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [22]:
gender_range = ['lightpink', 'lightblue']

# cdf = wdf_og.loc[wdf_og.Continent == 'Asia']

barplot = alt.Chart(wdf_og).mark_bar().encode(
    y=alt.Y('industry_sector', title="Industry Sector"),
    x=alt.X('count()', title="Count of Billionaires"),
    color=alt.Color('gender', scale=alt.Scale(domain = ['F', 'M'], range=gender_range), legend=alt.Legend(title="Gender")),
).transform_filter(
    'isValid(datum.gender)'
).properties(
    width=150,
    height=250
)
barplot

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [23]:
alt.hconcat(
    choropleth,
    barplot.transform_filter(click),
    center=True,
    title="Billionaire Gender Ratio By Continent",
).resolve_scale(
    color='independent'
).configure_legend(
    orient='bottom',
    direction='horizontal',
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


# Task 3

In [24]:
req = ['finalWorth', 'country', 'gdp_country', 'life_expectancy_country']
df[req].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2657 entries, 0 to 2656
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   finalWorth               2657 non-null   int64  
 1   country                  2657 non-null   object 
 2   gdp_country              2545 non-null   float64
 3   life_expectancy_country  2545 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 83.2+ KB


In [25]:
#countries with null values in gdp_country
df[pd.isnull(df['gdp_country'])].country.unique()

array(['Hong Kong', 'Taiwan'], dtype=object)

In [26]:
#countries with null values in life_expectancy_country
df[pd.isnull(df['life_expectancy_country'])].country.unique()

array(['Hong Kong', 'Taiwan'], dtype=object)

In [27]:
#filling in the null values with the appropriate values (information obtained from Google) and manipulating dataset to fulfill needs
df['e_class'] = ''
df.loc[df['country'] == 'Hong Kong', 'gdp_country'] = 91400000000
df.loc[df['country'] == 'Hong Kong', 'life_expectancy_country'] = 85.3
df.loc[df['country'] == 'Taiwan', 'gdp_country'] = 185233000000
df.loc[df['country'] == 'Taiwan', 'life_expectancy_country'] = 81.0
df.loc[df['economic_class'] == 1, 'e_class'] = 'Developed'
df.loc[df['economic_class'] == 2, 'e_class'] = 'Emerging'
df.loc[df['economic_class'] == 3, 'e_class'] = 'Developing'


df['gdp_country'] = df['gdp_country'] / 10000000000

In [28]:
df['medianFinalWorth'] = df.groupby(['country'])['finalWorth'].transform('median')
req = ['medianFinalWorth', 'country', 'gdp_country', 'life_expectancy_country', 'economic_class', 'e_class']
task3df = df[req].drop_duplicates().reset_index().drop('index', axis=1)
task3df.head()

Unnamed: 0,medianFinalWorth,country,gdp_country,life_expectancy_country,economic_class,e_class
0,4200.0,France,0.000272,82.5,1,Developed
1,2900.0,United States,0.002143,78.5,1,Developed
2,4000.0,Mexico,0.000126,75.0,2,Emerging
3,2100.0,India,0.000261,69.4,2,Emerging
4,2100.0,Spain,0.000139,83.3,1,Developed


In [29]:
splot_brush = alt.selection_interval(encodings=['x', 'y'], empty=False)
gdp_splot = alt.Chart(task3df).mark_point().encode(
    alt.X('gdp_country', title='Country GDP (billion USD)').scale(type='log'),
    alt.Y('medianFinalWorth', title='Median Final Net Worth (million USD)'),
     color = alt.condition(splot_brush, 'e_class:N', alt.value('lightgray'), title='Economic Class'),
    tooltip=[
             alt.Tooltip('country', title='Country'),
             alt.Tooltip('gdp_country', title='Country GDP (billion USD)'),
             alt.Tooltip('medianFinalWorth', title='Median Final Net worth (million USD)'), 
             ]
).add_params(splot_brush)

life_splot = alt.Chart(task3df).mark_point().encode(
    alt.X('life_expectancy_country', title='Country Life Expectancy (Years)').scale(zero=False),
    alt.Y('medianFinalWorth', title='Median Final Net Worth (million USD)'),
    alt.Color('e_class:N'),
    tooltip=[
             alt.Tooltip('country', title='Country'),
             alt.Tooltip('life_expectancy_country', title='Country Life Expectancy'),
             alt.Tooltip('medianFinalWorth', title='Median Final Net worth (million USD)'), 
             ]
)

gdp_splot | life_splot.encode(
    color = alt.condition(splot_brush, 'e_class:N', alt.value('lightgray'))
).add_params(splot_brush)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Task 4

In [30]:
df_top20_sector = df
df_top20_sector = df_top20_sector[df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False) <= 20]
df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')
df_top20_sector

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')


Unnamed: 0,finalWorth,personName,age,country,industries,status,gender,birthDate,gdp_country,life_expectancy_country,total_tax_rate_country,economic_class,education,continent,industry_sector,year,decade,e_class,medianFinalWorth,rank
0,211000,Bernard Arnault & family,74.0,France,Fashion & Retail,U,M,1949-03-05,0.000272,82.5,60.7,1,3.0,Europe,Consumer Discretionary & Staples,1949.0,1940.0,Developed,4200.0,Top 5
1,80500,Francoise Bettencourt Meyers & family,69.0,France,Fashion & Retail,U,F,1953-07-10,0.000272,82.5,60.7,1,,Europe,Consumer Discretionary & Staples,1953.0,1950.0,Developed,4200.0,Top 5
2,40100,François Pinault & family,86.0,France,Fashion & Retail,D,M,1936-08-21,0.000272,82.5,60.7,1,0.0,Europe,Consumer Discretionary & Staples,1936.0,1930.0,Developed,4200.0,Next 15
4,9800,"Jacques Saadé, Jr.",51.0,France,Logistics,Split Family Fortune,M,1971-08-10,0.000272,82.5,60.7,1,,Europe,Service,1971.0,1970.0,Developed,4200.0,Next 15
5,9800,Rodolphe Saadé,53.0,France,Logistics,Split Family Fortune,M,1970-03-03,0.000272,82.5,60.7,1,,Europe,Service,1970.0,1970.0,Developed,4200.0,Next 15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,9200,Antonia Ax:son Johnson & family,79.0,Sweden,Diversified,U,F,1943-09-06,0.000053,82.5,49.1,1,4.0,Europe,Diversified,1943.0,1940.0,Developed,2150.0,Next 15
2139,14900,Dhanin Chearavanont,83.0,Thailand,Diversified,U,M,1939-04-19,0.000054,76.9,29.5,2,,Asia,Diversified,1939.0,1930.0,Emerging,1950.0,Next 15
2215,8200,Abdulsamad Rabiu,62.0,Nigeria,Diversified,U,M,1960-08-04,0.000045,54.3,34.8,2,,Africa,Diversified,1960.0,1960.0,Emerging,8200.0,Next 15
2216,8200,Abdulsamad Rabiu,62.0,Nigeria,Diversified,U,M,1960-08-04,0.000045,54.3,34.8,3,,Africa,Diversified,1960.0,1960.0,Developing,8200.0,Next 15


In [31]:
alt.Chart(df_top20_sector).mark_bar().encode(
    alt.X('sum(finalWorth):Q', ).title('Final Net Worth (Million USD)'),
    alt.Y('rank:N').title(''),
    alt.Color('economic_class:O').title('Economic Class'),
    alt.Row('industry_sector:N', header=alt.Header(labelAngle=0)).title('Industry Sector'),
    alt.Tooltip(['sum(finalWorth)', 'economic_class', 'count()'])
).properties(
    title='Total Final Net Worth of Top 20 Billionaires by Industry Sector'
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Task 7

In [32]:
# Trying to add interaction that selects the attribute to be encoded on the Row channel... Not working... Need to ask TA

#task_7_row_attr = ['continent', 'industry_sector', 'education', 'economic_class']
#task_7_drop_down = alt.binding_select(options=task_7_row_attr, name='Attribute:')
#task_7_select = alt.selection_point(fields=['continent'], bind=task_7_drop_down)
#task_7_op_rowAttr = alt.param(value='continent', bind=task_7_drop_down)

In [33]:
#task_7_chart_base = alt.Chart(df).mark_bar().encode(
#    alt.X('min(finalWorth):Q', axis=alt.Axis(grid=False)),
#    alt.X2('max(finalWorth):Q'),
#    alt.Y('gender:N'),
#    alt.Color('gender:N'),
#    row = task_7_select
#).transform_filter(
#    task_7_select
#).add_params(
#    task_7_select
#)
#task_7_chart_base

In [34]:
task7_continent = alt.Chart(df).mark_bar().encode(
    alt.X('min(finalWorth):Q', axis=alt.Axis(grid=False)).title('Final Net Worth Range (million USD)'),
    alt.X2('max(finalWorth):Q'),
    alt.Y('gender:N').title(''),
    alt.Color('gender:N').title('Gender'),
    alt.Row('continent:N').title('Continent')
)
#task7_continent

In [35]:
task_7_industry_sector = task7_continent.encode(
    alt.Row('industry_sector:N', header=alt.Header(labelAngle=0)).title('Industry Sector')
)
#task_7_industry_sector

In [36]:
task_7_education = task7_continent.encode(
    alt.Row('education:N').title('Education Level')
)
#task_7_education

In [37]:
task_7_economic_class = task7_continent.encode(
    alt.Row('economic_class:N').title('Economic Class')
)
#task_7_economic_class

In [38]:
task_7_plot = (task7_continent | task_7_industry_sector) & (task_7_education | task_7_economic_class)
task_7_plot

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
