In [2]:
import numpy as np
import pandas as pd
import altair as alt
import pycountry
from vega_datasets import data
from country_converter import convert

# Explicitly select Altair's 'default' renderer for the charts displayed below.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [3]:
# Load the raw statistics; the "rank" column becomes the row index.
youtube_data = pd.read_csv(
    "data/Global YouTube Statistics.csv",
    encoding="ISO-8859-1",
    index_col="rank",
)

# Normalise the headers. Only columns whose name actually changes are listed;
# every other column keeps the name it already has ("Country" intentionally
# stays capitalised).
# The raw "created_date" column is renamed to "created_day" (it is used as the
# day component further down), which frees the name "created_date" for the
# full date column built later.
youtube_data = youtube_data.rename(columns={
    "Youtuber": "youtuber",
    "video views": "video_views",
    "Title": "title",
    "Abbreviation": "abbreviation",
    "created_date": "created_day",
    "Gross tertiary education enrollment (%)": "gross_tertiary_education_enrollment",
    "Population": "population",
    "Unemployment rate": "unemployment_rate",
    "Urban_population": "urban_population",
    "Latitude": "latitude",
    "Longitude": "longitude",
})
# Three-letter month abbreviations mapped to their month number (Jan -> 1 ... Dec -> 12).
monthStringToInt = dict(zip(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    range(1, 13),
))

# Add the year / month / day helper columns that pd.to_datetime needs in order
# to assemble a date; month strings are translated to integers on the way.
youtube_data = youtube_data.assign(
    year=youtube_data["created_year"],
    month=youtube_data["created_month"].map(monthStringToInt),
    day=youtube_data["created_day"],
)

# Combine the three helper columns into a single datetime64[ns] created_date column.
date_columns = ["year", "month", "day"]
youtube_data["created_date"] = pd.to_datetime(youtube_data[date_columns])

# Keep only the analysis columns. created_year, created_month, created_day and
# the year / month / day helper columns are left out because created_date now
# carries that information.
youtube_data = youtube_data.loc[:, [
    "youtuber",
    "subscribers",
    "video_views",
    "category",
    "title",
    "uploads",
    "Country",
    "abbreviation",
    "channel_type",
    "video_views_rank",
    "country_rank",
    "channel_type_rank",
    "video_views_for_the_last_30_days",
    "lowest_monthly_earnings",
    "highest_monthly_earnings",
    "lowest_yearly_earnings",
    "highest_yearly_earnings",
    "subscribers_for_last_30_days",
    "gross_tertiary_education_enrollment",
    "population",
    "unemployment_rate",
    "urban_population",
    "latitude",
    "longitude",
    "created_date",
]]

# Blank out implausible values with vectorised masks instead of per-element
# lambdas. Series.where keeps entries where the condition holds and inserts the
# column's own missing marker elsewhere (NaN for floats, NaT for datetimes), so
# dtypes are preserved and the `np.NaN` alias (removed in NumPy 2.0) is not needed.

# Enrollment is a percentage, so anything above 100 is treated as invalid.
# Values that are already missing fail the comparison and stay missing.
youtube_data["gross_tertiary_education_enrollment"] = (
    youtube_data["gross_tertiary_education_enrollment"].where(
        youtube_data["gross_tertiary_education_enrollment"] <= 100
    )
)

# Creation dates before 2005-02-14 are treated as invalid
# (presumably the earliest plausible date for a YouTube channel - confirm).
youtube_data["created_date"] = youtube_data["created_date"].where(
    youtube_data["created_date"] >= pd.Timestamp(year=2005, month=2, day=14)
)

# unemployment_rate can be any value greater than or equal to 0, so the categories
# will be divided by using the maximum value within the dataset.
def map_unemployment_rate(rate, rate_peak=None):
    """Bucket an unemployment rate into "Low", "Moderate" or "High".

    The range from 0 to ``rate_peak`` is split into three equal bands.

    Parameters
    ----------
    rate : float
        Unemployment rate to classify.
    rate_peak : float, optional
        Upper end of the scale. When omitted, the maximum of
        ``youtube_data["unemployment_rate"]`` is used.

    Returns
    -------
    str
        "NA" for a missing rate, otherwise "Low", "Moderate" or "High".
    """
    # A missing rate fails every `<` comparison and would otherwise fall
    # through to "High"; report it as "NA", like map_education_enrollment does.
    if pd.isna(rate):
        return "NA"
    if rate_peak is None:
        rate_peak = youtube_data["unemployment_rate"].max()
    if rate < rate_peak / 3:
        return "Low"
    elif rate < rate_peak * 2 / 3:
        return "Moderate"
    else:
        return "High"

# gross_tertiary_education_enrollment is bounded by 0 and 100, so its categories
# are cut at fixed thirds of that range rather than at thirds of the observed maximum.
def map_education_enrollment(enrollment):
    """Bucket an enrollment percentage into "Low", "Moderate" or "High".

    Missing values are reported as "NA". Values below 100/3 are "Low",
    values below 200/3 are "Moderate", and everything else is "High".
    """
    if pd.isna(enrollment):
        return "NA"
    if enrollment < 100 / 3:
        return "Low"
    return "Moderate" if enrollment < 100 * 2 / 3 else "High"

# Derive a categorical label column for each of the two rate columns by running
# every value through its mapping function.
youtube_data["unemployment_rate_ordinal"] = (
    youtube_data["unemployment_rate"].map(map_unemployment_rate)
)
youtube_data["education_enrollment_ordinal"] = (
    youtube_data["gross_tertiary_education_enrollment"].map(map_education_enrollment)
)

# Preview the first rows of the modified dataset
youtube_data.head()

Unnamed: 0_level_0,youtuber,subscribers,video_views,category,title,uploads,Country,abbreviation,channel_type,video_views_rank,...,subscribers_for_last_30_days,gross_tertiary_education_enrollment,population,unemployment_rate,urban_population,latitude,longitude,created_date,unemployment_rate_ordinal,education_enrollment_ordinal
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,1.0,...,2000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-03-13,Moderate,Low
2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,4055159.0,...,,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-03-05,High,High
3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,48.0,...,8000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2012-02-20,High,High
4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,2.0,...,1000000.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891,2006-09-01,High,High
5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1000000.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-09-20,Moderate,Low


In [4]:
# Reference table with one row per country: numeric ID, country name and continent.
country_ids = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv"
)

# Country names available in the reference table.
relevant_country_names = country_ids.loc[:, "Country"]

country_ids.head()

Unnamed: 0,ID,Country,Continent
0,4,Afghanistan,Asia
1,8,Albania,Europe
2,12,Algeria,Africa
3,24,Angola,Africa
4,10,Antarctica,Antarctica


In [5]:
# Attach each channel's country ID and continent from the reference table.
# No `on` / `how` is given, so pandas applies its defaults: an inner join on the
# columns the two frames have in common. Channels whose country has no match in
# country_ids are therefore dropped, and the result gets a fresh 0..n-1 index
# in place of the original "rank" index.
youtube_data = pd.merge(youtube_data, country_ids)

### 1. Geographical Distribution of Top YouTubers:

In [6]:
# Preview the merged frame; it now also carries the ID and Continent columns.
youtube_data.head()

Unnamed: 0,youtuber,subscribers,video_views,category,title,uploads,Country,abbreviation,channel_type,video_views_rank,...,population,unemployment_rate,urban_population,latitude,longitude,created_date,unemployment_rate_ordinal,education_enrollment_ordinal,ID,Continent
0,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,1.0,...,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-03-13,Moderate,Low,356,Asia
1,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1366418000.0,5.36,471031528.0,20.593684,78.96288,2006-09-20,Moderate,Low,356,Asia
2,Zee Music Company,96700000,57856290000.0,Music,Zee Music Company,8548,India,IN,Music,12.0,...,1366418000.0,5.36,471031528.0,20.593684,78.96288,2014-03-12,Moderate,Low,356,Asia
3,Sony SAB,83000000,101000000000.0,Shows,Sony SAB,71270,India,IN,Entertainment,4.0,...,1366418000.0,5.36,471031528.0,20.593684,78.96288,2007-08-04,Moderate,Low,356,Asia
4,Zee TV,70500000,73139050000.0,Entertainment,Zee TV,129204,India,IN,Entertainment,9.0,...,1366418000.0,5.36,471031528.0,20.593684,78.96288,2005-12-11,Moderate,Low,356,Asia


In [7]:
# Per-country totals: summed subscribers, summed video views, and the number of
# channels (a count of the youtuber column).
country_stats = (
    youtube_data
    .groupby(['Country', 'ID'])
    .agg(
        subscribers=('subscribers', 'sum'),
        video_views=('video_views', 'sum'),
        youtuber=('youtuber', 'count'),
    )
    .reset_index()
)

country_stats.head()

Unnamed: 0,Country,ID,subscribers,video_views,youtuber
0,Afghanistan,4,20400000,13397000000.0,1
1,Argentina,32,328500000,194415400000.0,13
2,Australia,36,172000000,69141810000.0,9
3,Bangladesh,50,13900000,12129580000.0,1
4,Brazil,76,1221800000,481209000000.0,62


In [8]:
# Country outlines from the vega_datasets world_110m topology ('countries' feature).
world = alt.topo_feature(data.world_110m.url, 'countries')

# Choropleth of total subscribers per country.
# Every outline in `world` pulls its statistics from country_stats by matching
# the topology's `id` to the `ID` column. The colour scale is logarithmic.
chart_1 = alt.Chart(world).mark_geoshape().encode(
    alt.Color('subscribers:Q',
            scale=alt.Scale(type='log')),
    tooltip=['Country:N', 'subscribers:Q']
).transform_lookup(
    lookup='id',
    # Request only columns that exist in country_stats. 'channel_type' is not
    # one of them (it only appears in the per-channel aggregate), so it is not
    # listed here.
    from_=alt.LookupData(
        data=country_stats,
        key='ID',
        fields=['Country', 'subscribers', 'video_views', 'youtuber']
    )
).project(
    'equalEarth'
).properties(
    width=900,
    height=500
)

chart_1

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [9]:
# Subscriber choropleth driven from country_stats rather than from the topology:
# each country_stats row fetches its shape ('geometry' and 'type') from `world`
# by matching ID to the topology's id. Shapes get a black outline.
chart_1_mod = (
    alt.Chart(country_stats)
    .mark_geoshape(stroke='black')
    .encode(
        color=alt.Color('subscribers:Q', scale=alt.Scale(type='log')),
        tooltip=['Country:N', 'subscribers:Q'],
    )
    .transform_lookup(
        lookup='ID',
        from_=alt.LookupData(data=world, key='id', fields=['geometry', 'type']),
    )
    .project('equalEarth')
    .properties(width=900, height=500)
)

chart_1_mod

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [10]:
# Same totals as country_stats, broken down further by channel type:
# one row per (Country, ID, channel_type) combination.
country_stats_channel = (
    youtube_data
    .groupby(['Country', 'ID', 'channel_type'])
    .agg(
        subscribers=('subscribers', 'sum'),
        video_views=('video_views', 'sum'),
        youtuber=('youtuber', 'count'),
    )
    .reset_index()
)

country_stats_channel.head()

Unnamed: 0,Country,ID,channel_type,subscribers,video_views,youtuber
0,Afghanistan,4,Games,20400000,13397000000.0,1
1,Argentina,32,Comedy,15100000,9477021000.0,1
2,Argentina,32,Education,17200000,11445490000.0,1
3,Argentina,32,Entertainment,67700000,24697440000.0,3
4,Argentina,32,Games,65000000,17890850000.0,3


In [11]:
# Dropdown offering every channel type present in the per-channel aggregate.
channel_dropdown = alt.binding_select(
    name='Channel Type: ',
    options=country_stats_channel['channel_type'].unique().tolist(),
)

# Selection on the channel_type field, driven by the dropdown above.
selector = alt.selection_multi(bind=channel_dropdown, fields=['channel_type'])

# Selected rows are coloured by subscribers on a log scale; the rest are light grey.
color = alt.condition(
    selector,
    alt.Color('subscribers:Q', scale=alt.Scale(type='log')),
    alt.value('lightgrey'),
)

# Backdrop: every country outline in light grey with a thin black border.
base = alt.Chart(world).mark_geoshape(
    fill='lightgrey',
    stroke='black',
    strokeWidth=0.5,
)


# Per-channel subscriber map. Rows of country_stats_channel fetch their shape
# from `world` (ID -> id) and are then filtered by the dropdown selection.
# The lookup is applied before the filter, as in the transform order below.
filtered_chart_1 = (
    alt.Chart(country_stats_channel)
    .mark_geoshape(stroke='black')
    .encode(
        color=color,
        tooltip=['Country:N', 'subscribers:Q'],
    )
    .transform_lookup(
        lookup='ID',
        from_=alt.LookupData(world, key='id', fields=['geometry', 'type']),
    )
    .project('equalEarth')
    .properties(
        width=900,
        height=500,
        title='Subscribers in each country for different channels',
    )
    .add_selection(selector)
    .transform_filter(selector)
)

# Draw the filtered map on top of the grey backdrop, without a view border and
# with a larger title.
base_chart = (
    alt.layer(base, filtered_chart_1)
    .configure_view(stroke=None)
    .configure_title(fontSize=20)
)

base_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [12]:
# Testing Case

# Choose a specific country for investigation (e.g., Argentina)
selected_country = 'Argentina'

# Filter data for the selected country
selected_country_data = country_stats_channel[country_stats_channel['Country'] == selected_country]

# Show the selected rows as the cell's result, so the notebook renders them as
# a rich table instead of print()'s plain-text dump.
selected_country_data


     Country  ID   channel_type  subscribers   video_views  youtuber
1  Argentina  32         Comedy     15100000  9.477021e+09         1
2  Argentina  32      Education     17200000  1.144549e+10         1
3  Argentina  32  Entertainment     67700000  2.469744e+10         3
4  Argentina  32          Games     65000000  1.789085e+10         3
5  Argentina  32          Music    134200000  1.096776e+11         4
6  Argentina  32         People     29300000  2.122695e+10         1


In [33]:
# Second dropdown: the same channel types as before, plus a leading 'General' entry.
channel_dropdown_1 = alt.binding_select(
    options=['General'] + list(country_stats_channel['channel_type'].unique()),
    name='Channel Type: ')

# Selection on channel_type bound to the dropdown above.
channel_selector_1 = alt.selection_multi(fields=['channel_type'], bind=channel_dropdown_1)

def generate_chart(channel):
    """Pick the map layer to use for `channel`.

    Returns filtered_chart_1 when `channel != 'General'` evaluates truthy;
    otherwise returns chart_1_mod with channel_selector_1 attached both as a
    selection and as a filter.
    """
    if channel != 'General':
        chart = filtered_chart_1
    else:
        chart = chart_1_mod.add_selection(
            channel_selector_1
        ).transform_filter(
            channel_selector_1
        )
    return chart

# NOTE(review): the argument passed here is whatever
# `channel_selector_1['channel_type']` returns, not a plain string, and the
# comparison inside generate_chart runs once in Python when this cell executes.
# Choosing 'General' in the browser dropdown therefore cannot re-run that
# branch. Presumably the first branch (filtered_chart_1) is always taken, which
# would leave channel_dropdown_1 / channel_selector_1 unused - confirm and
# redesign if a real 'General' view is wanted.
channel_chart_1 = generate_chart(channel_selector_1['channel_type'])

# Layer the chosen map on the grey backdrop. The configure_* calls are left
# commented out on the line below (presumably because this chart is later
# combined with another one - confirm).
final_chart = (base + channel_chart_1)#.configure_view(stroke=None).configure_title(fontSize=20)

# Display the chart
final_chart


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [54]:
# Mark-less base for the per-channel charts: channel type on x, subscribers
# (log scale) on y, coloured and filtered by the shared dropdown selection.
base_1 = (
    alt.Chart(country_stats_channel)
    .encode(
        x=alt.X('channel_type:N'),
        y=alt.Y('subscribers:Q', scale=alt.Scale(type='log')),
        color=color,
        tooltip=['Country:N', 'subscribers:Q'],
    )
    .add_selection(selector)
    .transform_filter(selector)
)



In [68]:
# Place the map (final_chart) and a bar chart side by side. The bar chart
# reuses base_1 but overrides its x / y / color encodings: y is stacked and
# normalised, so each bar shows shares rather than absolute counts. The title
# applies to the bar chart only, not to the combined figure.
final_chart_1 = alt.hconcat(
    final_chart,
    base_1.mark_bar(strokeWidth=500).encode(
        x=alt.X('channel_type:N', title='Channel Type'),
        y=alt.Y('subscribers:Q', stack='normalize', sort='x'),
        color=color,
    ).properties(
        title='Top countries with the most subscribers',
    ),
)
final_chart_1

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
