In [1]:
import pandas as pd

# Load the tidy YouTube dataset. The CSV's "rank" column becomes the index,
# and "country" is renamed to "Country" so it shares a column name with the
# country-ID table that is merged in further down.
file_path = 'tidy_youtube_data.csv'
youtube_data = (
    pd.read_csv(file_path, encoding="ISO-8859-1", index_col="rank")
    .rename(columns={"country": "Country"})
)

# Preview the first few rows
youtube_data.head()

Unnamed: 0_level_0,youtuber,subscribers,video_views,category,title,uploads,Country,abbreviation,channel_type,video_views_rank,...,subscribers_for_last_30_days,gross_tertiary_education_enrollment,population,unemployment_rate,urban_population,latitude,longitude,created_date,unemployment_rate_ordinal,education_enrollment_ordinal
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,1.0,...,2000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-03-13,Moderate,Low
2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,4055159.0,...,,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-03-05,High,High
3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,48.0,...,8000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2012-02-20,High,High
4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,2.0,...,1000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-09-01,High,High
5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-09-20,Moderate,Low


In [2]:
# Lookup table with one row per country: an ID, the country name and its continent.
country_ids_url = 'https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv'
country_ids = pd.read_csv(country_ids_url)

# The country names from that table, kept on their own as a Series.
relevant_country_names = country_ids["Country"]

country_ids.head()

Unnamed: 0,ID,Country,Continent
0,4,Afghanistan,Asia
1,8,Albania,Europe
2,12,Algeria,Africa
3,24,Angola,Africa
4,10,Antarctica,Antarctica


In [3]:
# Attach each channel's country ID and continent.
# No `on=` is given, so pandas joins on whichever column names the two frames
# share, and the default inner join keeps only channels whose key is present
# in country_ids. The result carries a fresh integer index rather than "rank".
youtube_data = pd.merge(youtube_data, country_ids)

In [4]:
# Keep the 15 channels with the most subscribers among fully populated rows,
# then renumber them from 1 so the index reads as a ranking.
# NOTE(review): dropna() with no arguments discards a row if ANY column is
# missing, so a channel can drop out because of a gap in a column this
# analysis never uses -- confirm that is intended.
top_channels = (
    youtube_data
    .dropna()
    .nlargest(15, 'subscribers')
    .reset_index(drop=True)
)
top_channels.index += 1

# Show only the columns relevant to the top channels
summary_columns = [
    'youtuber',
    'Country',
    'population',
    'urban_population',
    'gross_tertiary_education_enrollment',
    'subscribers',
    'video_views',
]
top_channels[summary_columns]


Unnamed: 0,youtuber,Country,population,urban_population,gross_tertiary_education_enrollment,subscribers,video_views
1,T-Series,India,1366418000.0,471031528.0,28.1,245000000,228000000000.0
2,MrBeast,United States,328239500.0,270663028.0,88.2,166000000,28368840000.0
3,Cocomelon - Nursery Rhymes,United States,328239500.0,270663028.0,88.2,162000000,164000000000.0
4,SET India,India,1366418000.0,471031528.0,28.1,159000000,148000000000.0
5,Like Nastya,Russia,144373500.0,107683889.0,81.9,106000000,90479060000.0
6,Vlad and Niki,United States,328239500.0,270663028.0,88.2,98900000,77180170000.0
7,Zee Music Company,India,1366418000.0,471031528.0,28.1,96700000,57856290000.0
8,WWE,United States,328239500.0,270663028.0,88.2,96000000,77428470000.0
9,BLACKPINK,South Korea,51709100.0,42106719.0,94.3,89800000,32144600000.0
10,Sony SAB,India,1366418000.0,471031528.0,28.1,83000000,101000000000.0


In [5]:
import altair as alt
# Distinct countries present in top_channels; these populate the dropdown.
countries = top_channels['Country'].unique().tolist()

# Single-value selection on the Country field, driven by a dropdown widget.
select_country = alt.selection_single(
    name='Select',
    fields=['Country'],
    bind=alt.binding_select(options=countries),
)

# Everything the two bar charts have in common: data, mark, channel axis,
# size, the dropdown, and the colouring rule (country colour while selected,
# light gray otherwise).
country_bars = (
    alt.Chart(top_channels)
    .mark_bar()
    .encode(
        y=alt.Y('youtuber:N', title='YouTube Channel', sort='-x'),
        color=alt.condition(select_country, 'Country:N', alt.value('lightgray'), legend=None),
    )
    .properties(width=600, height=300)
    .add_selection(select_country)
)

# Plot A: subscribers per channel
plot_a = country_bars.encode(
    x=alt.X('subscribers:Q', title='Number of Subscribers'),
    tooltip=['youtuber', 'subscribers', 'Country'],
).properties(title='Top 15 YouTube Channels by Subscribers')

# Plot B: video views per channel
plot_b = country_bars.encode(
    x=alt.X('video_views:Q', title='Video Views'),
    tooltip=['youtuber', 'video_views', 'Country'],
).properties(title='Top 15 YouTube Channels by Video Views')

# Stack the two charts vertically and display them
combined_plots = alt.vconcat(plot_a, plot_b)
combined_plots



In [6]:
from vega_datasets import data
# Country outlines from the vega_datasets world_110m TopoJSON file.
world = alt.topo_feature(data.world_110m.url, 'countries')

# One row per (Country, ID): demographic columns are averaged, while
# subscribers and video views are summed over that country's channels.
stat_aggregations = {
    'population': 'mean',
    'urban_population': 'mean',
    'gross_tertiary_education_enrollment': 'mean',
    'subscribers': 'sum',
    'video_views': 'sum',
}
country_stats = (
    top_channels
    .groupby(['Country', 'ID'])
    .agg(stat_aggregations)
    .reset_index()
)

# (field, label) pairs shown when hovering over a country.
tooltip_fields = [
    ('Country:N', 'Country'),
    ('population:Q', 'Population'),
    ('urban_population:Q', 'Urban Population'),
    ('gross_tertiary_education_enrollment:Q', 'Gross Tertiary Education Enrollment'),
    ('subscribers:Q', 'Subscribers'),
    ('video_views:Q', 'Video Views'),
]
country_tooltips = [alt.Tooltip(field, title=label) for field, label in tooltip_fields]

# Columns copied from country_stats onto the map shapes:
# the country name plus every aggregated statistic, in the same order.
lookup_columns = ['Country', *stat_aggregations]

# Map: each shape's `id` is matched against country_stats' `ID` to pull in
# the statistics, then shapes are coloured by country name.
map_chart = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.Color('Country:N'),
        tooltip=country_tooltips,
    )
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(data=country_stats, key='ID', fields=lookup_columns),
    )
    .project(type='equalEarth')
    .properties(
        width=800,
        height=400,
        title='YouTube Channel Statistics by Country',
    )
)

map_chart

In [7]:
# Dropdown-driven selection on Country, shared by every panel of the dashboard.
select_country = alt.selection_single(
    name='Select',
    fields=['Country'],
    bind=alt.binding_select(options=countries),
)

# Common foundation of the two compact bar charts: data, mark, channel axis,
# size, the dropdown, and the colouring rule (country colour while selected,
# light gray otherwise).
country_bars = (
    alt.Chart(top_channels)
    .mark_bar()
    .encode(
        y=alt.Y('youtuber:N', title='YouTube Channel', sort='-x'),
        color=alt.condition(select_country, 'Country:N', alt.value('lightgray'), legend=None),
    )
    .properties(width=200, height=200)
    .add_selection(select_country)
)

# Plot A: subscribers per channel
plot_a = country_bars.encode(
    x=alt.X('subscribers:Q', title='Number of Subscribers'),
    tooltip=['youtuber', 'subscribers', 'Country'],
).properties(title='Top 15 YouTube Channels by Subscribers')

# Plot B: video views per channel
plot_b = country_bars.encode(
    x=alt.X('video_views:Q', title='Video Views'),
    tooltip=['youtuber', 'video_views', 'Country'],
).properties(title='Top 15 YouTube Channels by Video Views')

# (field, label) pairs shown when hovering over a country on the map.
map_tooltip_fields = [
    ('Country:N', 'Country'),
    ('population:Q', 'Population'),
    ('urban_population:Q', 'Urban Population'),
    ('gross_tertiary_education_enrollment:Q', 'Gross Tertiary Education Enrollment'),
    ('subscribers:Q', 'Subscribers'),
    ('video_views:Q', 'Video Views'),
]

# Map panel: statistics are pulled in from country_stats by matching the
# shape `id` to `ID`; colouring follows the same dropdown as the bar charts.
map_chart = (
    alt.Chart(world)
    .mark_geoshape()
    .encode(
        color=alt.condition(select_country, 'Country:N', alt.value('lightgray'), legend=None),
        tooltip=[alt.Tooltip(field, title=label) for field, label in map_tooltip_fields],
    )
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(
            data=country_stats,
            key='ID',
            fields=[
                'Country',
                'population',
                'urban_population',
                'gross_tertiary_education_enrollment',
                'subscribers',
                'video_views',
            ],
        ),
    )
    .project(type='equalEarth')
    .properties(
        width=600,
        height=400,
        title='YouTube Channel Statistics by Country',
    )
    .add_selection(select_country)
)

# Layout: the two bar charts stacked on the left, the map on the right.
bar_charts = alt.vconcat(plot_a, plot_b)
dashboard = alt.hconcat(bar_charts, map_chart)

# Display the dashboard
dashboard




In [8]:
# Compact view of the top channels: who they are, where they are from,
# what type of channel they run, and their headline numbers.
channel_overview_columns = ['youtuber', 'Country', 'channel_type', 'subscribers', 'video_views']
top_channels.loc[:, channel_overview_columns]

Unnamed: 0,youtuber,Country,channel_type,subscribers,video_views
1,T-Series,India,Music,245000000,228000000000.0
2,MrBeast,United States,Entertainment,166000000,28368840000.0
3,Cocomelon - Nursery Rhymes,United States,Education,162000000,164000000000.0
4,SET India,India,Entertainment,159000000,148000000000.0
5,Like Nastya,Russia,People,106000000,90479060000.0
6,Vlad and Niki,United States,Entertainment,98900000,77180170000.0
7,Zee Music Company,India,Music,96700000,57856290000.0
8,WWE,United States,Sports,96000000,77428470000.0
9,BLACKPINK,South Korea,Music,89800000,32144600000.0
10,Sony SAB,India,Entertainment,83000000,101000000000.0


In [9]:
# Dropdown-driven selection on channel_type; its options are the channel
# types that actually occur in top_channels.
select_channel_type = alt.selection_single(
    name='Select', fields=['channel_type'],
    bind=alt.binding_select(options=top_channels['channel_type'].unique().tolist())
)

# Layer 1: subscribers per channel.
# The selection is attached to this layer only. A layered chart needs the
# selection registered once; registering the same selection on every layer
# duplicates its signals in the compiled spec. This is also the selection the
# colour conditions below refer to -- previously the country selection from an
# earlier cell was attached, so the dropdown listed countries while the bars
# were coloured by channel type.
plot_subscribers = alt.Chart(top_channels).mark_bar().encode(
    y=alt.Y('youtuber:N', title='YouTube Channel'),
    x=alt.X('subscribers:Q', title='Number of Subscribers'),
    color=alt.condition(select_channel_type, 'channel_type:N', alt.value('lightgray'), legend=None),
    tooltip=['youtuber', 'subscribers', 'channel_type:N']
).add_selection(
    select_channel_type
)

# Layer 2: video views per channel, drawn on a reversed x-axis.
# The tooltip reports video_views, the quantity this layer actually plots.
plot_views = alt.Chart(top_channels).mark_bar().encode(
    y=alt.Y('youtuber:N', axis=None),
    x=alt.X('video_views:Q', title='Video Views', scale=alt.Scale(reverse=True, zero=False)),
    color=alt.condition(select_channel_type, 'channel_type:N', alt.value('lightgray'), legend=None),
    tooltip=['youtuber', 'video_views', 'channel_type:N']
)

# Overlay the two layers: the channel axis is shared, while each layer keeps
# its own x scale because subscribers and views differ by orders of magnitude.
combined_plots = alt.layer(plot_subscribers, plot_views).resolve_scale(
    y='shared',
    x='independent'
).properties(
    width=600,
    height=300
)

# Display the combined plots
combined_plots




In [10]:
# Dropdown-driven selection on channel_type; its options are the channel
# types that actually occur in top_channels.
select_channel_type = alt.selection_single(
    name='Select', fields=['channel_type'],
    bind=alt.binding_select(options=top_channels['channel_type'].unique().tolist())
)

# Shared foundation for all three panels: channels on a hidden y-axis,
# ordered by descending subscriber count. The selection is registered here
# once; every panel derived from `base` inherits it, so it is not added again
# on the concatenated chart.
base = alt.Chart(top_channels).encode(
    y=alt.Y('youtuber:N', axis=None, sort=alt.EncodingSortField(field="subscribers", order='descending')),
    tooltip=['youtuber', 'channel_type']
).add_selection(
    select_channel_type
)

# Right panel: subscribers, bars growing to the right.
plot_subscribers = base.mark_bar().encode(
    x=alt.X('subscribers:Q', title='Number of Subscribers'),
    color=alt.condition(select_channel_type, 'channel_type:N', alt.value('lightgray'), legend=None),
    tooltip=['youtuber', 'subscribers', 'channel_type']
)

# Left panel: video views on a reversed x-axis, so bars grow to the left.
plot_views = base.mark_bar().encode(
    x=alt.X('video_views:Q', title='Video Views', scale=alt.Scale(reverse=True)),
    color=alt.condition(select_channel_type, 'channel_type:N', alt.value('lightgray'), legend=None),
    tooltip=['youtuber', 'video_views', 'channel_type']
)

# Middle panel: channel names as text, standing in for the hidden y-axis labels.
text = base.mark_text(align='center', baseline='middle').encode(
    text='youtuber:N'
)

# Views | names | subscribers, sharing the channel axis while each bar panel
# keeps its own x scale.
combined_plot1 = alt.hconcat(
    plot_views, text, plot_subscribers
).resolve_scale(
    y='shared',
    x='independent'
)

combined_plot1
