In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the raw YouTuber statistics and preview the first rows.
yt = pd.read_csv(filepath_or_buffer='youtubers_df.csv')

yt.head(n=5)

Unnamed: 0,Rank,Username,Category,Subscribers,Country,Visits,Likes,Comments,Links
0,1,tseries,Música y baile,249500000.0,India,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2,MrBeast,"Videojuegos, Humor",183500000.0,Estados Unidos,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3,CoComelon,Educación,165500000.0,Unknown,7000000.0,24700.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,4,SETIndia,,162600000.0,India,15600.0,166.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...
4,5,KidsDianaShow,"Animación, Juguetes",113500000.0,Unknown,3900000.0,12400.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...


In [3]:
# Country lookup table: Spanish name -> English name and continent.
countries = pd.read_csv(filepath_or_buffer='countries.csv')
countries.head(n=5)

Unnamed: 0,Country(Spanish),Country(English),Continent
0,Arabia Saudita,Saudi Arabia,Asia
1,Argelia,Algeria,Africa
2,Argentina,Argentina,South America
3,Bangladesh,Bangladesh,Asia
4,Brasil,Brazil,South America


In [4]:
# Category lookup table (semicolon-separated): Spanish name -> English name.
categories = pd.read_csv(filepath_or_buffer='categories.csv', sep=';')
categories.head(n=5)

Unnamed: 0,Category(Spanish),Category(English)
0,Música y baile,Music and dance
1,"Videojuegos, Humor","Video games, Humor"
2,Educación,Education
3,"Animación, Juguetes","Animation, Toys"
4,"Películas, Videojuegos","Movies, Video games"


In [5]:
def abbrevNum(num):
    """Format a number as a short human-readable string (e.g. ``"3.84B"``).

    The value is first truncated to an integer with ``int()``. Magnitudes
    below 1,000 are returned as a plain integer string; larger magnitudes are
    divided by the matching unit (K, M, B, T), rounded to two decimals and
    suffixed with the unit letter.

    Args:
        num: Any value accepted by ``int()`` (int, float, numeric string).

    Returns:
        str: The abbreviated representation, with a leading ``-`` for
        negative inputs.

    Raises:
        Whatever ``int(num)`` raises for values it cannot convert
        (for example ``ValueError`` for NaN).
    """
    num = int(num)
    sign = -1 if num < 0 else 1
    magnitude = abs(num)

    if magnitude < 1e3:
        return str(sign * magnitude)

    units = ((1e3, "K"), (1e6, "M"), (1e9, "B"), (1e12, "T"))
    last = len(units) - 1
    for position, (divisor, suffix) in enumerate(units):
        scaled = round(magnitude / divisor, ndigits=2)
        # If rounding pushed the value up to 1000 of this unit (e.g. 999_999
        # -> 1000.0K), promote it to the next unit so the result reads 1.0M.
        # The largest unit has nowhere to promote to, so it always returns.
        if scaled < 1e3 or position == last:
            return str(sign * scaled) + suffix

## 1. Clean, Prepare, & Explore the data

### 1.1 Handle Missing Data

In [6]:
# Replace missing (NaN) entries in the Category column with 'Unknown'.
yt = yt.fillna({'Category': 'Unknown'})
yt.head(n=5)

Unnamed: 0,Rank,Username,Category,Subscribers,Country,Visits,Likes,Comments,Links
0,1,tseries,Música y baile,249500000.0,India,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,2,MrBeast,"Videojuegos, Humor",183500000.0,Estados Unidos,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,3,CoComelon,Educación,165500000.0,Unknown,7000000.0,24700.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,4,SETIndia,Unknown,162600000.0,India,15600.0,166.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...
4,5,KidsDianaShow,"Animación, Juguetes",113500000.0,Unknown,3900000.0,12400.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...


In [16]:
# Keep only the rows whose Category and Country are both known
# (this filters rows, not columns).
# NOTE(review): the saved output below shows translated values and a Continent
# column, so this cell appears to have been executed after the translation in
# section 1.2. On a top-to-bottom run `yt` is still untranslated here --
# confirm the intended cell order.

yt_filled = yt[(yt['Category'] != 'Unknown') & (yt['Country'] != 'Unknown')]
yt_filled.head()

Unnamed: 0,Rank,Username,Subscribers,Visits,Likes,Comments,Links,Category,Country,Continent
0,1,tseries,249500000.0,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...,Music and dance,India,Asia
1,2,MrBeast,183500000.0,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...,"Video games, Humor",United States,North America
5,6,PewDiePie,111500000.0,2400000.0,197300.0,4900.0,http://youtube.com/channel/UC-lHJZR3Gqxm24_Vd_...,"Movies, Video games",United States,North America
8,9,zeemusiccompany,99700000.0,74300.0,2600.0,32.0,http://youtube.com/channel/UCFFbwnve3yF62-tVXk...,Music and dance,India,Asia
9,10,WWE,97200000.0,184500.0,6300.0,214.0,http://youtube.com/channel/UCJ5v_MCY6GNUBTO8-D...,Video games,United States,North America


### 1.2 Translate Spanish Columns to English

In [7]:
# Attach the English category name by matching on the Spanish name.
# The join is inner, so rows without a match in the lookup table are dropped.
yt_categories = pd.merge(
    yt,
    categories,
    how='inner',
    left_on='Category',
    right_on='Category(Spanish)',
)

# Same approach for countries; this also brings in the Continent column.
yt_clean = pd.merge(
    yt_categories,
    countries,
    how='inner',
    left_on='Country',
    right_on='Country(Spanish)',
)

In [8]:
# Keep the metric columns plus the English-language category/country fields.
yt = yt_clean[[
    'Rank',
    'Username',
    'Subscribers',
    'Visits',
    'Likes',
    'Comments',
    'Links',
    'Category(English)',
    'Country(English)',
    'Continent',
]]

In [None]:
# Rename the translated columns back to the plain names used by the rest of
# the notebook. Reassigning (instead of inplace=True) avoids mutating a frame
# that was produced by column selection, which is what triggers pandas'
# SettingWithCopyWarning; re-running the cell is harmless because missing
# source names are ignored by rename.
yt = yt.rename(columns={'Category(English)': 'Category', 'Country(English)': 'Country'})

In [10]:
# Preview the translated table.
yt.head(n=5)

Unnamed: 0,Rank,Username,Subscribers,Visits,Likes,Comments,Links,Category,Country,Continent
0,1,tseries,249500000.0,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...,Music and dance,India,Asia
1,2,MrBeast,183500000.0,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...,"Video games, Humor",United States,North America
2,3,CoComelon,165500000.0,7000000.0,24700.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...,Education,Unknown,Unknown
3,4,SETIndia,162600000.0,15600.0,166.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...,Unknown,India,Asia
4,5,KidsDianaShow,113500000.0,3900000.0,12400.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...,"Animation, Toys",Unknown,Unknown


## 2. Trend Analysis

### 2.1 Most Popular Categories

In [17]:
# Total subscribers per category, ten largest first.
popular_categories = (
    yt_filled
    .groupby('Category')[['Subscribers']]
    .sum()
    .sort_values('Subscribers', ascending=False)
    .head(10)
)


In [18]:
# Show abbreviated subscriber totals without overwriting the numeric column.
# Mutating popular_categories in place made this cell fail on a second run,
# because abbrevNum cannot parse its own "3.84B"-style output.
popular_categories.assign(
    Subscribers=popular_categories['Subscribers'].apply(abbrevNum)
).reset_index()

Unnamed: 0,Category,Subscribers
0,Music and dance,3.84B
1,"Movies, Animation",1.16B
2,"Music and dance, Movies",708.7M
3,News and Politics,676.1M
4,"Animation, Video games",659.4M
5,Daily vlogs,535.1M
6,"Movies, Humor",509.1M
7,"Animation, Humor",498.7M
8,Movies,489.2M
9,"Video games, Humor",489.0M


### 2.2 Correlation Between Subscribers & Likes and Comments

In [95]:
# Subscribers against likes, with an OLS trendline drawn in red.
px.scatter(
    data_frame=yt,
    x='Subscribers',
    y='Likes',
    title='YT Subscribers vs Likes',
    trendline='ols',
    trendline_color_override="red",
)

In [94]:
# Subscribers against comments, with an OLS trendline drawn in red.
px.scatter(
    data_frame=yt,
    x='Subscribers',
    y='Comments',
    title='YT Subscribers vs Comments',
    trendline='ols',
    trendline_color_override="red",
)

## 3. Audience Study

### 3.1 Regional Preference for Content

In [19]:
# Total subscribers for every (Continent, Category) pair, as a flat table.
regional_popularity = (
    yt_filled
    .groupby(['Continent', 'Category'])[['Subscribers']]
    .sum()
    .reset_index()
)

In [20]:
# Order by continent (A-Z), then by subscriber total from largest to smallest.
regional_popularity = regional_popularity.sort_values(
    by=['Continent', 'Subscribers'],
    ascending=[True, False],
)

In [21]:
# Build a separate display copy with abbreviated subscriber totals.
# A plain `regional_popularity_clean = regional_popularity` only aliases the
# same frame, so formatting it also overwrote the numeric source column and
# made the cell fail on re-run (abbrevNum cannot parse strings like "27.5M").
# assign() returns a new frame and leaves regional_popularity numeric.
regional_popularity_clean = regional_popularity.assign(
    Subscribers=regional_popularity['Subscribers'].apply(abbrevNum)
)

In [22]:
# Display the per-continent category ranking with abbreviated subscriber totals.
regional_popularity_clean

Unnamed: 0,Continent,Category,Subscribers
1,Africa,Movies,27.5M
3,Africa,News and Politics,25.8M
2,Africa,"Music and dance, Movies",15.2M
0,Africa,Education,12.2M
14,Asia,Music and dance,1.77B
...,...,...,...
84,South America,"Music and dance, Movies",61.2M
77,South America,Daily vlogs,59.0M
85,South America,Science and technology,39.3M
81,South America,"Movies, Video games",20.1M


## 5. Content Categories

### 5.1 Categories With Most Visits

In [40]:
# Ten categories with the highest total visits (values kept numeric for plotting).
top_categories = (
    yt_filled
    .groupby('Category')[['Visits']]
    .sum()
    .sort_values('Visits', ascending=False)
    .head(10)
    .reset_index()
)
top_categories

Unnamed: 0,Category,Visits
0,"Video games, Humor",174074500.0
1,Daily vlogs,124394700.0
2,"Animation, Humor",101344000.0
3,Music and dance,56720000.0
4,"Animation, Video games",40802000.0
5,Food and drink,32184800.0
6,"Movies, Humor",29574300.0
7,"Movies, Animation",29210200.0
8,Video games,26355600.0
9,Humor,23104000.0


In [34]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(top_categories, x='Category', y='Visits', title='Top 10 Categories by Visits')

### 5.2 Categories With Most Likes

In [35]:
# Ten categories with the highest total likes.
category_most_likes = (
    yt_filled
    .groupby('Category')[['Likes']]
    .sum()
    .sort_values('Likes', ascending=False)
    .head(10)
    .reset_index()
)
category_most_likes

Unnamed: 0,Category,Likes
0,"Video games, Humor",7148700.0
1,Daily vlogs,6916130.0
2,"Animation, Humor",3932460.0
3,Music and dance,2752369.0
4,"Animation, Video games",2695997.0
5,Humor,1699900.0
6,Food and drink,1542077.0
7,"Movies, Animation",1535496.0
8,"Movies, Humor",1347527.0
9,Video games,1085300.0


In [36]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(category_most_likes, x='Category', y='Likes', title='Top 10 Categories by Likes')

### 5.3 Categories With Most Comments

In [37]:
# Ten categories with the highest total comments.
category_most_comments = (
    yt_filled
    .groupby('Category')[['Comments']]
    .sum()
    .sort_values('Comments', ascending=False)
    .head(10)
    .reset_index()
)
category_most_comments

Unnamed: 0,Category,Comments
0,Music and dance,319554.0
1,"Animation, Humor",144040.0
2,"Animation, Video games",128745.0
3,"Video games, Humor",82060.0
4,Humor,51598.0
5,"Movies, Animation",39190.0
6,Education,36894.0
7,Food and drink,36641.0
8,Daily vlogs,36252.0
9,"Movies, Humor",34082.0


In [38]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(category_most_comments, x='Category', y='Comments', title='Top 10 Categories by Comments')

## 6. Benchmarking

### 6.1 Streamer With Most Subscribers

In [42]:
# Ten usernames with the highest subscriber totals.
streamer_most_subscribers = (
    yt
    .groupby('Username')[['Subscribers']]
    .sum()
    .sort_values('Subscribers', ascending=False)
    .head(10)
    .reset_index()
)
streamer_most_subscribers

Unnamed: 0,Username,Subscribers
0,tseries,249500000.0
1,MrBeast,183500000.0
2,CoComelon,165500000.0
3,SETIndia,162600000.0
4,KidsDianaShow,113500000.0
5,PewDiePie,111500000.0
6,LikeNastyaofficial,107500000.0
7,VladandNiki,101400000.0
8,zeemusiccompany,99700000.0
9,WWE,97200000.0


In [43]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(streamer_most_subscribers, x='Username', y='Subscribers', title='Top 10 Streamers by Subscribers')

### 6.2 Streamer With Most Visits

In [44]:
# Ten usernames with the highest visit totals.
streamer_most_visits = (
    yt
    .groupby('Username')[['Visits']]
    .sum()
    .sort_values('Visits', ascending=False)
    .head(10)
    .reset_index()
)
streamer_most_visits

Unnamed: 0,Username,Visits
0,MrBeast,117400000.0
1,MrBeast2,83100000.0
2,DaFuqBoom,52700000.0
3,VillageCookingChannel,21500000.0
4,BeastPhilanthropy,21500000.0
5,jaanvipatel,19100000.0
6,_vector_,15400000.0
7,dojacat,13600000.0
8,alfredolarin,12900000.0
9,NickPro,12200000.0


In [45]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(streamer_most_visits, x='Username', y='Visits', title='Top 10 Streamers by Visits')

### 6.3 Streamer With Most Likes

In [47]:
# Ten usernames with the highest like totals.
streamer_most_likes = (
    yt
    .groupby('Username')[['Likes']]
    .sum()
    .sort_values('Likes', ascending=False)
    .head(10)
    .reset_index()
)
streamer_most_likes

Unnamed: 0,Username,Likes
0,MrBeast,5300000.0
1,MrBeast2,5000000.0
2,DaFuqBoom,1700000.0
3,triggeredinsaan,1400000.0
4,Mythpat,1200000.0
5,BeastPhilanthropy,952100.0
6,_vector_,712400.0
7,alfredolarin,707600.0
8,MRINDIANHACKER,617400.0
9,alanbecker,582600.0


In [48]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(streamer_most_likes, x='Username', y='Likes', title='Top 10 Streamers by Likes')

### 6.4 Streamer With Most Comments

In [49]:
# Ten usernames with the highest comment totals.
streamer_most_comments = (
    yt
    .groupby('Username')[['Comments']]
    .sum()
    .sort_values('Comments', ascending=False)
    .head(10)
    .reset_index()
)
streamer_most_comments

Unnamed: 0,Username,Comments
0,BispoBrunoLeonardo,154000.0
1,DaFuqBoom,82800.0
2,dojacat,73000.0
3,DanTDM,52500.0
4,triggeredinsaan,38000.0
5,MRINDIANHACKER,26000.0
6,BeastPhilanthropy,24000.0
7,A4a4a4a4,22000.0
8,dream,19000.0
9,MrBeast,18500.0


In [50]:
# Titled for consistency with the scatter plots, so the figure stands alone.
px.bar(streamer_most_comments, x='Username', y='Comments', title='Top 10 Streamers by Comments')