# Milestone 2 - Implementation
## Team name: REK'D
### Group Members: Ruby, Eric, Kevin, Darryl

In [1]:
# imports
import pandas as pd
import altair as alt
import numpy as np
from vega_datasets import data
import vegafusion as vf
vf.enable_widget()

alt.renderers.enable('default')

RendererRegistry.enable('default')

In [2]:
# Suppress warnings generated from Ruby's laptop due to system warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Read in the dataset
df = pd.read_csv('data/billionaires.csv', parse_dates=['birthDate'])

# Task 1

**Find Anomaly:**
> Does the trend that university dropouts (`education`) tend to have the highest median `finalWorth` in a majority of the decades (derived from `birthDate`) persist on a per `industry_sector` basis, or are there anomalies to this pattern?

In [4]:
## Setup using Pandas for the visualization
# Keep non-selectable columns
cols_to_keep = list(set(df.columns).difference(['industry_sector',
                                                'education']))

# Melt the dataframe to allow selection of variables
df_wide = pd.melt(df, cols_to_keep)

select_box = alt.binding_select(name="View change in median net worth over the decades based on: ",
                                options=list(df_wide['variable'].unique()))
selection = alt.selection_point(value='education',
                                fields=['variable'], bind=select_box)

In [5]:
# Extra explanatory legend
task1_text = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=300,
    color="gray"
).encode(
    text=alt.value(["The numbers in the Education Level represent the",
                    "highest education level obtained by the billionaire:",
        "- 0: Primary education",
        "- 1: Secondary education",
        "- 2: University Drop Outs",
        "- 3: Bachelor's or equivalent (diploma, associate, etc.)",
        "- 4: Master's or equivalent",
        "- 5: for MD, Ph. D, Doctor or equivalent"])
)

# Add custom title
task1_title = alt.TitleParams(
    "Viz 1: Median Final Worth of Billionares based on the Decade They Were Born in",
    subtitle=["Use the drop down menu to view the trends by Education Level or Industry Sector."],
    subtitleColor='gray',
    orient="top",
    anchor="start"
)

In [6]:
# Create a bar graph for the number of billionaires
task1_count = alt.Chart(df_wide.dropna()).mark_bar(
    point=True
).encode(
    x=alt.X('decade:N', title=None,
           scale=alt.Scale(domain=list(range(1920, 2010, 10))),
           axis=alt.Axis(labels=False, ticks=False)),
    y=alt.Y('count():Q',
            title="Number of Billionaires",
           axis=alt.Axis(grid=False)),
    tooltip=alt.Tooltip(["decade", "count()"])
    ).properties(
    width=500,
    height=75,
)

### Final Visualization

In [7]:
# Finalized task 1 viz
# Used the following page for reference https://github.com/altair-viz/altair/issues/965
task1_trend = alt.Chart(df_wide.dropna(), title=task1_title).mark_line(
    point=True
).encode(
    x=alt.X('decade:N', title="Decade the Billionaire was Born in"),
    y=alt.Y('median(finalWorth):Q',
            title="Median Final Worth (in Billions of USD)"),
    color=alt.Color('value', legend=alt.Legend(
        orient='none',
        legendX=300, legendY=10,
        direction='vertical',
        titleAnchor='start', title="Color Legend")),
    tooltip=['decade', 'median(finalWorth)', 'value', 'count()']
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width=500,
    height=250,
)

task1 = alt.vconcat(task1_trend, task1_count, spacing=-20).resolve_scale(
    color='independent'
)

# Task 1 individual has the education attribute's legend beside the chart.
# We will reposition the legend later in the dashboard.
task1_individual = alt.hconcat(task1, task1_text, spacing=-2)
task1_individual

# Task 2

**Characterize Distribution:**

> For each `continent`, what is the distribution of the different `industry_sectors` and `gender` ratio in those industries?

In [8]:
# Using Pandas to manipulate the geological data for easier plotting
country_ids = pd.read_csv('https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv')
relevant_country_names = country_ids["Country"]

mapping = {'South America': 'Americas',
           'North America': 'Americas'}

country_ids.Continent = country_ids.Continent.replace(mapping)

In [9]:
# Similar to lecture examples, use the world data to make choropleth
world = data.world_110m.url
world_map = alt.topo_feature(world, "countries")

In [10]:
# Drop redundant columns
wdf_og = pd.merge(df, country_ids,  how='right',
                  left_on=['country', 'continent'],
                  right_on=['Country', 'Continent'])
wdf_og = wdf_og.dropna(subset=['ID'])

In [11]:
# Cleanup the final manipulated df for viz
group_gender = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['gender'].value_counts().unstack(fill_value=0).reset_index()
group_inds = wdf_og.fillna(0).groupby(['ID', 'Country', 'Continent'])['industry_sector'].value_counts().unstack(fill_value=0).reset_index()

wdf = pd.merge(group_gender, group_inds,
               how='left',
               left_on=['ID', 'Country', 'Continent'],
               right_on=['ID', 'Country', 'Continent'])
wdf.ID = wdf.ID.apply(lambda x: int(x))
wdf = wdf.drop(['0_x', '0_y'], axis=1)

fields = wdf.columns.to_list()[1:]

In [12]:
# Use consistent colouring throughout our vizzes
color_range = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377']
cont_domain = ['Asia', 'Europe', 'Americas', 'Africa', 'Antarctica', 'Oceania']

click = alt.selection_point(encodings=['color'])

choropleth = alt.Chart(world_map).mark_geoshape(
    stroke=None
).transform_lookup(
    lookup='id', from_=alt.LookupData(data=wdf, key='ID', fields=fields)
).encode(
    color = alt.condition(click,
                          alt.Color('Continent:O',
                                    scale=alt.Scale(domain=cont_domain, range=color_range),
                                    legend=alt.Legend(orient='bottom', direction='horizontal')),
                          alt.value('lightgray')),
    tooltip = alt.Tooltip(['Country:O',
                           'Continent:O',
                           'Female:Q',
                           'Male:Q']),
    # legend = alt.Legend(orient='bottom', direction='horizontal')
).project(
    type="equalEarth"
).properties(
    width=750,
    height=300
).transform_filter(
    'isValid(datum.Continent)',
).add_params(
    click
)

In [13]:
gender_range = ['lightpink', 'lightblue']

barplot = alt.Chart(wdf_og).mark_bar().encode(
    y=alt.Y('industry_sector', title="Industry Sector"),
    x=alt.X('count()', title="Count of Billionaires"),
    color=alt.Color('gender',
                    scale=alt.Scale(domain=['Female', 'Male'],
                                    range=gender_range),
                    legend=alt.Legend(title="Gender", orient='bottom', direction='horizontal')),
).transform_filter(
    'isValid(datum.gender)'
).properties(
    width=150,
    height=250
)

### Final Visualization

In [14]:
task2 = alt.hconcat(
    choropleth,
    barplot.transform_filter(click),
    center=True,
    title="Viz 2: Billionaire Gender Ratio By Continent in Each Industry Sector",
).resolve_scale(
    color='independent'
)
task2

# Task 3

**Correlation:**

> Does the socio-economic status of a country (`economic_rating`, `GDP`, `life_expectancy`) correlate to the country’s median billionaire final net worth?

In [15]:
# List attributes of interest
req = ['finalWorth', 'country', 'gdp_country', 'life_expectancy_country']

In [16]:
# Use pandas to manipulate the df for easier plotting
df['medianFinalWorth'] = df.groupby(['country'])['finalWorth'].transform('median')
req = ['medianFinalWorth', 'country', 'gdp_country', 'life_expectancy_country', 'economic_class', 'e_class']
task3df = df[req].drop_duplicates().reset_index().drop('index', axis=1)

In [17]:
# Use this to control the dimensions of both plots
task3_width = 270
task3_height = 270
# Make the title for the plot
task3_title = alt.TitleParams(
    ["Viz 3: Median Billionaire Net Worth vs.","Their Country's Socio-Economic Status"],
    subtitle=["Click and drag on either charts to create a window filter."],
    subtitleColor='gray',
    orient="top",
    anchor="start"
)

In [18]:
splot_brush = alt.selection_interval(encodings=['x', 'y'], empty=True)

gdp_splot = alt.Chart(task3df, title=task3_title).mark_point(size=40).encode(
    alt.X('gdp_country',
          title='Country GDP (Billion USD)').scale(type='log'),
    alt.Y('medianFinalWorth',
          title='Median Final Net Worth (Billion USD)'),
    color=alt.condition(splot_brush,
                        alt.Color('e_class:N', title="Economic Class"),
                        alt.value('lightgray'),
                        title='Economic Class'),
    tooltip=[
             alt.Tooltip('country',
                         title='Country'),
             alt.Tooltip('gdp_country',
                         title='Country GDP (Billion USD)'),
             alt.Tooltip('medianFinalWorth',
                         title='Median Final Net worth (Billion USD)'), 
             ]
).add_params(splot_brush).properties(
    width=task3_width,
    height=task3_height
)

life_splot = alt.Chart(task3df).mark_point(size=40).encode(
    alt.X('life_expectancy_country',
          title='Country Life Expectancy (Years)').scale(zero=False),
    alt.Y('medianFinalWorth', title='Median Final Net Worth (Billion USD)'),
    alt.Color('e_class:N', title="Economic Class"),
    tooltip=[
             alt.Tooltip('country',
                         title='Country'),
             alt.Tooltip('life_expectancy_country',
                         title='Country Life Expectancy'),
             alt.Tooltip('medianFinalWorth',
                         title='Median Final Net worth (Billion USD)'),
             ]
).add_params(splot_brush).properties(
    width=task3_width,
    height=task3_height
)

### Final Visualization

In [19]:
task3 = alt.hconcat(gdp_splot,life_splot.encode(
    color=alt.condition(splot_brush,
                        alt.Color('e_class:N', title="Economic Class"),
                        alt.value('lightgray'))),
                    center=False,
                    spacing=2
                   ).add_params(splot_brush)

task3

# Task 4

**Sort:**
> What is the difference between the total final worth of the top 5 billionaires compared to the next 15 (6-20) billionaires in each `industry_sector`, `continent` and economic class?

In [20]:
# Use pandas to filter the df for easier plotting
df_top20_sector = df
df_top20_sector = df_top20_sector[df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False) <= 20]
df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')
# Warning is expected - we're chaining actions on a copy and this is intentional. .loc cannot achieve what we want
# See https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# for more details

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector.groupby('industry_sector')['finalWorth'].rank(method='first', ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['rank'] = df_top20_sector['rank'].apply(lambda x: 'Top 5' if x <= 5.0 else 'Next 15')


In [21]:
# Remapping Names for fitting on viz
rename_mapping = {'Consumer Discretionary & Staples' : 'Consumer*',
                  'Energy & Industrials & Materials' : 'EIM*'}

df_top20_sector['industry_sector'] = df_top20_sector['industry_sector'].replace(rename_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top20_sector['industry_sector'] = df_top20_sector['industry_sector'].replace(rename_mapping)


### Final Visualization

In [22]:
# To enforce ordering on economic class to make sure the legend and the plots are accurate
eclass_ordered = ['Developed', 'Emerging', 'Developing']

task4 = alt.Chart(df_top20_sector).mark_bar().encode(
    alt.X('sum(finalWorth):Q', axis=alt.Axis(grid=False)).title('Final Net Worth (Billion USD)'),
    alt.Y('rank:N').title(''),
    alt.Color('e_class:N', scale=alt.Scale(),
              legend=alt.Legend(title="Economic Class", orient='bottom', direction='horizontal')).title('Economic Class'),
    alt.Row('industry_sector:N',
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Industry Sector'),
    alt.Order('economic_class:N'),
    alt.Tooltip(['sum(finalWorth)', 'count()'])
).properties(
    title=["Viz 4: Total Final Net Worth of Top 20 Billionaires","by Industry Sector"]
)
task4

# Task 5

**Find Extremum:**
> Which countries have the greatest ratio of total billionaire net worth to `GDP`, which countries (that have billionaires) have the least?

In [23]:
# Use pandas to manipulate the df for easier plotting
gdf = df.groupby(['country',
                  'continent',
                  'gdp_country']).agg({'personName': 'count',
                                       'finalWorth': 'sum'}).rename(
    columns={'personName': 'num_billionaires',
             'finalWorth': 'totalWorth'}
).reset_index()
gdf = gdf.rename(columns={'continent' : 'Continent'})

In [24]:
bind = alt.selection_interval(bind='scales')
# Use consistent colouring throughout our vizzes
color_range = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377']
cont_domain = ['Asia', 'Europe', 'Americas', 'Africa', 'Antarctica', 'Oceania']


### Final Visualization

In [25]:
task5 = alt.Chart(gdf).mark_circle(opacity=0.8).encode(
    y=alt.Y("gdp_country:Q",
            scale=alt.Scale(type='log'),
            title="Country GDP (per Billion USD)"),
    x=alt.X("totalWorth",
            scale=alt.Scale(type='log'),
            title="Total Net Worth of Country's Billionaires"),
    size=alt.Size("num_billionaires",
                  scale=alt.Scale(range=[100, 800]),
                  title="Number of Billionaires"),
    color=alt.Color("Continent", title="Continent", scale=alt.Scale(domain=cont_domain,
                                                                    range=color_range)),
    tooltip=["country", 'num_billionaires', 'gdp_country:Q'],
).properties(
    height=500,
    width=500,
    title="Viz 5: Billionaire Net Worth vs. Their Residence Country's GDP",
).add_params(
    bind,
    click
)
task5

# Task 6

**Cluster:**
> Cluster billionaires based on their educational backgrounds. Do certain billionaires with specific education backgrounds cluster into certain industries?

**This is our High Fidelity Drawing Task** and will be included in the corresponding Gradescope submission and written report.

# Task 7

**Find Range:**
> Find Range: What is the final net worth interquartile range between each `gender` based on `continent`, `industry sector` and `education level`?

In [26]:
# Remapping Names for fitting on viz
rename_mapping = {'Consumer Discretionary & Staples' : 'Consumer*',
                  'Energy & Industrials & Materials' : 'EIM*'}

df['industry_sector'] = df['industry_sector'].replace(rename_mapping)

In [27]:
task_7_continent = alt.Chart(df.dropna()).mark_bar().encode(
    alt.X('q1(finalWorth):Q',
          axis=alt.Axis(grid=False)).title('Final Net Worth Range (Billion USD)'),
    alt.X2('q3(finalWorth):Q'),
    alt.Y('gender:N', axis=None).title(''),
    alt.Color('gender:N',
              scale=alt.Scale(domain=['Female', 'Male'],
                              range=gender_range),
              legend=alt.Legend(title="Gender", orient='bottom', direction='horizontal')
             ).title('Gender'),
    alt.Row('continent:N',
            sort=eclass_ordered, # bug in Vega-Lite, cannot order alt.Row
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Continent'),
    alt.Tooltip(["q1(finalWorth):Q", "q3(finalWorth):Q", "count()"])
).properties(
    width=100,
    height=28
)
# See https://github.com/altair-viz/altair/issues/2237
# for more details on the bug

In [28]:
task_7_education = task_7_continent.encode(
    alt.Row('education:N',
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Education Level')
).properties(
    width=100,
    height=20
)

In [29]:
task_7_industry_sector = task_7_continent.encode(
    alt.Row('industry_sector:N',
            header=alt.Header(labelAngle=0,
                              labelAlign='left')).title('Industry Sector')
).properties(
    width=100,
    height=20
)

In [30]:
eclass_ordered = ['Developed', 'Emerging', 'Developing']

task_7_economic_class = task_7_continent.encode(
    alt.Row('e_class:O',
            header=alt.Header(labelAngle=0,
                              labelAlign='left'),
            sort=eclass_ordered).title('Economic Class')
).properties(
    width=90,
    height=61
)

In [31]:
# Make the title for the plot
task7_title = alt.TitleParams(
    ["Viz 7: Distribution of Net Worth Ranges Compared to Various Relevant Attributes"],
    subtitle=["Range refers to the Interquartile Range (Q1 to Q3)."],
    subtitleColor='gray',
    orient="top",
    anchor="start"
)

### Final Visualization

In [32]:
task7 = (task_7_continent | task_7_economic_class) | \
              (task_7_education | task_7_industry_sector)
task7 = task7.properties(
    title=task7_title,
)
task7

# Dashboards

In [33]:
# Add text to each viz
text1 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray',
).encode(
    text=alt.value(["Task 1: " + "Find anomaly: Does the trend that university dropouts (education level) tend to have the highest median finalWorth",
                    "in a majority of the decades (derived from birthDate) persist on a per industry sector basis, or are there anomalies to this pattern?"])
)

text2 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray',
).encode(
    text=alt.value(["Task 2: Characterize Distribution: For each continent, " +
    "what is the distribution of the different industry sectors and gender ratio in those industries?"])
)

text3 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray'
).encode(
    text=alt.value(["Task 3: " + "Correlation: Does the socio-economic status of a country correlate to"+
                    "their median billionaire final net worth?"])
)

text4 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=8,
    fontWeight=400,
    color='gray'
).encode(
    text=alt.value(["Task 4:", "Sort: What is the difference between the total final worth",
                    "of the top 5 billionaires compared to the next 15 (6-20)",
                    "billionaires in each sector, continent and economic class?"])
)

text5 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray'
).encode(
    text=alt.value(["Task 5: Find Extremum - Which countries have the greatest ratio of total billionaire net worth to GDP" +
    " and which countries (that have billionaires) have the least?"])
)


text6 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray'
).encode(
    text=alt.value(["Task 1:", "Cluster: Cluster billionaires based on their educational backgrounds." +
    " Do certain billionaires with specific education backgrounds cluster into certain industries?"])
)

text7 = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color='gray'
).encode(
    text=alt.value(["Task 7 - Find Range: What is the net worth's interquartile range between each gender" +
    " based on continent, industry sector and education level?"])
)

dashboard2_legend = alt.Chart().mark_text(
    align="left",
    baseline="middle",
    fontSize=10,
    fontWeight=400,
    color="black"
).encode(
    text=alt.value(["The numbers in the Education Level represent the " +
                    "highest education level obtained by the billionaire:",
        "(0: Primary education, " +
        "1: Secondary education, " +
        "2: University Drop Outs, " +
        "3: Bachelor's or equivalent (diploma, associate, etc.), " +
        "4: Master's or equivalent, " +
        "5: for MD, Ph. D, Doctor or equivalent)"])
)

dashboard1_text = alt.Chart().mark_text(
    align="left",
    baseline="bottom",
    fontSize=12,
    fontWeight=400,
    color="gray"
).encode(
    text=alt.value(["Click on the world map to select and filter by the continent of interest. Double click the continent to reset the selection."])
)

dashboard2_text = alt.Chart().mark_text(
    align="left",
    baseline="bottom",
    fontSize=12,
    fontWeight=400,
    color="gray"
).encode(
    text=alt.value(["*Note: For Industry Sectors, Consumer abbreviates consumer discretionary and staples. EIM abbreviates Energy & Industrials & Materials."])
)

## Dashboard 1

In [34]:
dashboard1 = alt.vconcat(
    dashboard1_text,
    task2,
    text2,
    task5.encode(color=alt.Color("Continent",
                              title="Continent",
                              scale=alt.Scale(domain=cont_domain, range=color_range)).legend(None)
                ).properties(height=220, width=950).transform_filter(click),
    text5,
    spacing=-2
).resolve_scale(
    color='independent',
    size='independent'
).properties(
    title=alt.Title("Global Billionaire Population based on Continent, Country GDP, and Industry", fontSize=20)
)

dashboard1

## Dashboard 2

### Set Up Top and Bottom Components

In [35]:
# Remove Legend from Task 3
task3_alt = alt.hconcat(gdp_splot, life_splot.encode(
    color=alt.condition(splot_brush,
                        alt.Color('e_class:N', title="Economic Class", legend=None),
                        alt.value('lightgray'))),
                    center=False,
                    spacing=-2
                   ).add_params(splot_brush)

In [36]:
# Make the top and bottom components
top = alt.hconcat(alt.vconcat(task7, text7, spacing=-2),
                  alt.vconcat(task4.properties(height=20, width=200), text4, spacing=-2),
                  center=True,
                  spacing=-1).resolve_scale(color='independent')

bottom = alt.hconcat(alt.vconcat(alt.vconcat(task1_trend, task1_count, spacing=-20).resolve_scale(color='independent'),
                                 text1, spacing=-4), # Altair bug so we have to paste the vconcat here rather than reference task1
                     alt.vconcat(task3_alt.properties(spacing=-1), text3, spacing=-3),
                    spacing=-5)

### Final Dashboard 2 Visualization

In [37]:
dashboard2 = alt.vconcat(
    alt.vconcat(dashboard2_text, top, spacing=-25),
    alt.vconcat(bottom, dashboard2_legend, spacing=-5),
    center=False,
    spacing=-20
).resolve_scale(
    color='independent'
).properties(
    title=alt.Title("Billionaire Net Worth's Distributions, Trends, and Correlations to Socioeconomic Factors", fontSize=20)
)

dashboard2

## End of the visualization implementation! Thank you for reading! 🎉