## Import

In [1]:
import numpy as np
import pandas as pd
import altair as alt

# Select Altair's "default" renderer so charts are displayed while editing.
alt.renderers.enable("default")

RendererRegistry.enable('default')

In [2]:
# Read the CSV as ISO-8859-1 rather than pandas' default UTF-8: some of the
# categorical columns contain symbols that cannot be decoded as UTF-8, which
# would raise an error while reading the file.
# The "rank" column becomes the row index.
youtube_data = pd.read_csv(
    "data/Global YouTube Statistics.csv",
    encoding="ISO-8859-1",
    index_col="rank",
)
youtube_data

Unnamed: 0_level_0,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,video_views_rank,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,1.0,...,2000000.0,2006.0,Mar,13.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,4055159.0,...,,2006.0,Mar,5.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,48.0,...,8000000.0,2012.0,Feb,20.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,2.0,...,1000000.0,2006.0,Sep,1.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1000000.0,2006.0,Sep,20.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,525.0,...,700000.0,2017.0,Feb,12.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280
992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,6141.0,...,300000.0,2018.0,Sep,14.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,129005.0,...,1000.0,2006.0,Sep,11.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973
994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,35112.0,...,100000.0,2012.0,May,9.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501


## Clean and Wrangle

In [3]:
# Normalise the column names to snake_case. Only columns whose name actually
# changes are listed; all remaining columns already use snake_case names.
# Note: the incoming "created_date" column is renamed to "created_day".
youtube_data = youtube_data.rename(
    columns={
        "Youtuber": "youtuber",
        "video views": "video_views",
        "Title": "title",
        "Country": "country",
        "Abbreviation": "abbreviation",
        "created_date": "created_day",
        "Gross tertiary education enrollment (%)": "gross_tertiary_education_enrollment",
        "Population": "population",
        "Unemployment rate": "unemployment_rate",
        "Urban_population": "urban_population",
        "Latitude": "latitude",
        "Longitude": "longitude",
    }
)
youtube_data.head()

Unnamed: 0_level_0,youtuber,subscribers,video_views,category,title,uploads,country,abbreviation,channel_type,video_views_rank,...,subscribers_for_last_30_days,created_year,created_month,created_day,gross_tertiary_education_enrollment,population,unemployment_rate,urban_population,latitude,longitude
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,1.0,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,4055159.0,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,48.0,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,2.0,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In [4]:
# Maps the three-letter month abbreviations found in created_month to month numbers.
monthStringToInt = {"Jan" : 1, "Feb" : 2,  "Mar" : 3,  "Apr" : 4,
                    "May" : 5, "Jun" : 6,  "Jul" : 7,  "Aug" : 8,
                    "Sep" : 9, "Oct" : 10, "Nov" : 11, "Dec" : 12}

# Replace created_year / created_month / created_day with a single datetime
# column named created_date, appended as the last column.
#
# pd.to_datetime assembles dates from a frame whose columns are named
# year / month / day, so those parts are gathered in a throwaway frame instead
# of being added to (and later removed from) youtube_data.
#
# assign() followed by drop() returns a new frame. Selecting a list of columns
# with youtube_data[[...]] would leave a slice behind, and later column
# assignments on that slice can trigger pandas' SettingWithCopyWarning.
youtube_data = (
    youtube_data
    .assign(
        created_date=lambda frame: pd.to_datetime(
            pd.DataFrame({
                "year": frame["created_year"],
                # Month abbreviations become integers via the lookup table above.
                "month": frame["created_month"].map(monthStringToInt),
                "day": frame["created_day"],
            })
        )
    )
    .drop(columns=["created_year", "created_month", "created_day"])
)

In [5]:
# Replace gross_tertiary_education_enrollment percentages above 100 with NaN.
# Series.where keeps values for which the condition holds and fills the rest
# with NaN; values that are already missing fail the comparison and stay NaN.
# (This also avoids the np.NaN alias, which no longer exists in NumPy 2.0.)
youtube_data["gross_tertiary_education_enrollment"] = (
    youtube_data["gross_tertiary_education_enrollment"]
    .where(lambda enrollment: enrollment <= 100)
)

In [6]:
# Blank out created_date values that fall before February 14, 2005.
# Series.where keeps dates satisfying the condition and replaces the others
# with the missing-value marker for datetimes (NaT), so the column keeps its
# datetime dtype. Dates that are already missing fail the comparison and stay
# missing. (This also avoids the np.NaN alias, which was removed in NumPy 2.0.)
youtube_data["created_date"] = youtube_data["created_date"].where(
    lambda created: created >= pd.Timestamp(year=2005, month=2, day=14)
)

In [7]:
# unemployment_rate can be any value greater than or equal to 0, so the categories
# are formed by splitting the range from 0 to the maximum value within the dataset
# into three equal parts.
def map_unemployment_rate(rate, rate_peak=None):
    """Bucket a single unemployment rate into an ordinal label.

    Args:
        rate: Unemployment rate for one row; may be missing (NaN / None).
        rate_peak: Upper end of the scale used to derive the two cut-offs.
            When omitted, the maximum of youtube_data["unemployment_rate"]
            is used. Callers labelling many rows can compute the maximum
            once and pass it in to avoid recomputing it on every call.

    Returns:
        "NA" when the rate is missing, "Low" when rate < rate_peak / 3,
        "Moderate" when rate < rate_peak * 2 / 3, otherwise "High".
    """
    # A missing rate fails both "<" comparisons below and would otherwise be
    # reported as "High"; label it "NA" like map_education_enrollment does.
    if pd.isna(rate):
        return "NA"
    if rate_peak is None:
        rate_peak = youtube_data["unemployment_rate"].max()
    if rate < rate_peak / 3:
        return "Low"
    elif rate < rate_peak * 2 / 3:
        return "Moderate"
    else:
        return "High"

# gross_tertiary_education_enrollment is a percentage between 0 and 100, so its
# categories split that fixed range into thirds instead of using the dataset maximum.
def map_education_enrollment(enrollment):
    """Bucket an enrollment percentage into "Low", "Moderate", "High" or "NA".

    Args:
        enrollment: Enrollment percentage for one row; may be missing.

    Returns:
        "NA" for a missing value, "Low" below 100 / 3, "Moderate" below
        100 * 2 / 3, and "High" otherwise.
    """
    if pd.isna(enrollment):
        return "NA"

    lower_cutoff = 100 / 3
    upper_cutoff = 100 * 2 / 3

    if enrollment >= upper_cutoff:
        return "High"
    return "Low" if enrollment < lower_cutoff else "Moderate"

# Derive one ordinal label column from each numeric column by applying the
# matching bucketing function element by element.
youtube_data["unemployment_rate_ordinal"] = (
    youtube_data["unemployment_rate"].map(map_unemployment_rate)
)
youtube_data["education_enrollment_ordinal"] = (
    youtube_data["gross_tertiary_education_enrollment"].map(map_education_enrollment)
)

# Display the modified dataset
youtube_data

Unnamed: 0_level_0,youtuber,subscribers,video_views,category,title,uploads,country,abbreviation,channel_type,video_views_rank,...,subscribers_for_last_30_days,gross_tertiary_education_enrollment,population,unemployment_rate,urban_population,latitude,longitude,created_date,unemployment_rate_ordinal,education_enrollment_ordinal
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,1.0,...,2000000.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880,2006-03-13,Moderate,Low
2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,4055159.0,...,,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891,2006-03-05,High,High
3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,48.0,...,8000000.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891,2012-02-20,High,High
4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,2.0,...,1000000.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891,2006-09-01,High,High
5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,3.0,...,1000000.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880,2006-09-20,Moderate,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,525.0,...,700000.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280,2017-02-12,High,Moderate
992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,6141.0,...,300000.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880,2018-09-14,Moderate,Low
993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,129005.0,...,1000.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973,2006-09-11,Low,Moderate
994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,35112.0,...,100000.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501,2012-05-09,Moderate,High


In [8]:
# Replace missing category values with the literal string "NA".
youtube_data["category"] = youtube_data["category"].fillna(value="NA")

## Task 1: Characterize Distribution

How are highest yearly earnings distributed across countries with different unemployment rates or education enrollment rates?

In [9]:
# Every distinct category becomes one dropdown entry; an additional None
# option is shown with the label "All".
category_options = youtube_data["category"].unique().tolist()

# Dropdown (select) widget that drives the category selection below.
category_radio = alt.binding_select(
    options=[*category_options, None],
    labels=[*category_options, "All"],
    name="Category:",
)

# Point selection on the "category" field, bound to the dropdown.
category_select = alt.selection_point(
    fields=["category"],
    bind=category_radio,
)

# Rectangular brush over both the x and y encodings.
brush = alt.selection_interval(encodings=["x", "y"], empty=True)

In [10]:
# Histogram of highest yearly earnings (at most 20 bins), with bars coloured by
# education-enrollment level and filtered by the category dropdown.
histo_chart = (
    alt.Chart(youtube_data)
    .mark_bar()
    .encode(
        x=alt.X(
            "highest_yearly_earnings",
            bin=alt.BinParams(maxbins=20),
            title="Highest Yearly Earnings (Binned)",
        ),
        y=alt.Y("count(highest_yearly_earnings)", title="Number of Channels"),
        color=alt.Color("education_enrollment_ordinal", title="Education Enrollment"),
        tooltip=[
            alt.Tooltip("count(highest_yearly_earnings)", title="Number of Channels"),
            alt.Tooltip("mean(highest_yearly_earnings)", title="Mean Highest Yearly Earnings"),
        ],
    )
    .add_params(category_select)
    .transform_filter(category_select)
)

histo_chart

In [17]:
# Subscribers vs. video views, one translucent circle per channel. Points inside
# the brush are drawn blue, all others light gray.
scatter_chart = (
    alt.Chart(youtube_data)
    .mark_circle(opacity=0.25)
    .encode(
        x=alt.X("subscribers:Q", title="Number of Subscribers"),
        y=alt.Y("video_views:Q", title="Number of Video Views"),
        color=alt.condition(brush, alt.value("blue"), alt.value("lightgray")),
        tooltip=[
            alt.Tooltip("youtuber", title="Channel Name: "),
            alt.Tooltip("category", title="Category: "),
            alt.Tooltip("uploads", title="Total Uploads: "),
            alt.Tooltip("video_views", title="Total Video Views: "),
            alt.Tooltip("subscribers", title="Total Subscribers: "),
        ],
    )
    .add_params(brush)
)

scatter_chart

In [12]:
# Side-by-side linked view: the scatter plot is filtered by the category
# dropdown, and the histogram is filtered by the brush.
alt.hconcat(
    scatter_chart.transform_filter(category_select),
    histo_chart.transform_filter(brush),
)

- Consider repeating a histogram like this for other numerical variables
- Consider adding more encodings and/or changing the mark from a bar to another mark type that suits binned data

## Problem recreation

In [13]:
# Values stored in the *_ordinal columns, in ascending order.
ordinal_options = ["Low", "Moderate", "High"]

# Human-readable radio-button labels, one per ordinal option.
# Enrollment is bucketed at 100 / 3 and 100 * 2 / 3 percent.
enrolment_labels = ["0% - 33.3%", "33.3% - 66.7%", "66.7% - 100%"]
# Unemployment is bucketed by splitting the range from 0 to the dataset's
# maximum rate into thirds, so each label describes a third of that rate
# range, not a third of the countries.
unemployment_labels = ["Bottom third of rate range in dataset", "Middle third of rate range in dataset", "Top third of rate range in dataset"]

# Radio buttons for the unemployment level; the extra None option is shown as "All".
unemployment_radio = alt.binding_radio(
    options = ordinal_options + [None],
    labels = unemployment_labels + ["All"],
    name = "Unemployment Rate:"
)

unemployment_select = alt.selection_point(
    fields = ["unemployment_rate_ordinal"],
    bind = unemployment_radio
)

# Radio buttons for the education-enrollment level; None is shown as "All".
education_radio = alt.binding_radio(
    options = ordinal_options + [None],
    labels = enrolment_labels + ["All"],
    name = "Education Enrollment:"
)

education_select = alt.selection_point(
    fields = ["education_enrollment_ordinal"],
    bind = education_radio
)

# Rectangular brush over both the x and y encodings. This rebinds the name
# `brush` to a new selection object for the charts defined after this point.
brush = alt.selection_interval(encodings = ["x", "y"], empty = True)

In [14]:
# Histogram of highest yearly earnings (at most 20 bins), filtered by both the
# education-enrollment and the unemployment radio selections.
old_histo_chart = (
    alt.Chart(youtube_data)
    .mark_bar()
    .encode(
        x=alt.X(
            "highest_yearly_earnings",
            bin=alt.BinParams(maxbins=20),
            title="Highest Yearly Earnings (Binned)",
        ),
        y=alt.Y("count(highest_yearly_earnings)", title="Number of Channels"),
        tooltip=[
            alt.Tooltip("count(highest_yearly_earnings)", title="Number of Channels"),
        ],
    )
    .add_params(education_select, unemployment_select)
    .transform_filter(education_select)
    .transform_filter(unemployment_select)
)

old_histo_chart

In [15]:
# Subscribers vs. video views, one half-transparent circle per channel,
# coloured by category and carrying the brush selection.
old_scatter_chart = (
    alt.Chart(youtube_data)
    .mark_circle(opacity=0.5)
    .encode(
        x=alt.X("subscribers:Q", title="Number of Subscribers"),
        y=alt.Y("video_views:Q", title="Number of Video Views"),
        color=alt.Color("category", title="Category"),
        tooltip=[
            alt.Tooltip("youtuber", title="Channel Name: "),
            alt.Tooltip("category", title="Category: "),
            alt.Tooltip("video_views", title="Total Video Views: "),
            alt.Tooltip("subscribers", title="Total Subscribers: "),
        ],
    )
    .add_params(brush)
)

old_scatter_chart

In [16]:
# Side-by-side linked view: the scatter plot is filtered by both radio
# selections, and the histogram is additionally filtered by the brush.
alt.hconcat(
    old_scatter_chart
    .transform_filter(education_select)
    .transform_filter(unemployment_select),
    old_histo_chart.transform_filter(brush),
)